Add Gitea Actions setup documentation and update .gitignore for CI/CD scripts
This commit is contained in:
350
xml_to_markdown.py
Normal file
350
xml_to_markdown.py
Normal file
@@ -0,0 +1,350 @@
|
||||
import xml.etree.ElementTree as ET
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
def xml_text_to_markdown(elem):
    """Convert an XML content element into Markdown-flavoured plain text.

    The element tree is walked recursively:

    * ``P`` elements become paragraphs separated by blank lines.
    * ``BR`` becomes a single line break.
    * ``DT``/``DD`` entries (inside or outside a ``DL``) are written one per
      line, with the term and its definition separated by a space.
    * Table elements are not rendered; ``table``/``tgroup`` are replaced by
      a ``[Tabelle]`` placeholder and their content is skipped.
    * Every other element (``LA`` included) is transparent: only its text
      and children are emitted.

    Whitespace at the edges of a text node is reduced to a single space
    instead of being dropped, so inline markup such as
    ``Siehe <LA>§ 5</LA> Abs. 2`` does not glue neighbouring words together.
    No entity decoding is done here because the XML parser has already
    resolved entities in ``text``/``tail``.

    Args:
        elem: An ``xml.etree.ElementTree.Element`` or ``None``.

    Returns:
        The converted text without leading/trailing whitespace, or an empty
        string when ``elem`` is ``None``.
    """
    if elem is None:
        return ""

    result = []
    line_breaks = ('\n', '\n\n')
    table_tags = ('table', 'tgroup', 'tbody', 'thead', 'row', 'entry')

    def append_text(raw):
        """Append a text node, keeping one space where its edges had whitespace."""
        if not raw:
            return
        core = raw.strip()
        if not core:
            # Whitespace-only node: it may be the only separator between two
            # inline elements, so keep a single space unless the output
            # already ends with a space or a line break.
            if result and not result[-1].endswith((' ', '\n')):
                result.append(' ')
            return
        lead = ' ' if raw[0].isspace() else ''
        trail = ' ' if raw[-1].isspace() else ''
        result.append(f"{lead}{core}{trail}")

    def emit_term(dt):
        """Start a new output line and emit the content of a ``DT`` element."""
        if result and result[-1] not in line_breaks:
            result.append('\n')
        process_element(dt)

    def emit_definition(dd):
        """Emit a ``DD`` element on the current line and terminate the line."""
        # Separate the definition from the preceding term ("1." + "text").
        if result and not result[-1].endswith((' ', '\n')):
            result.append(' ')
        process_element(dd)
        result.append('\n')

    def process_element(e):
        """Emit the text of ``e``, then each child followed by its tail."""
        append_text(e.text)

        for child in e:
            child_tag = child.tag

            if child_tag == 'P':
                if result and result[-1] != '\n\n':
                    result.append('\n\n')
                process_element(child)
                result.append('\n\n')
            elif child_tag == 'BR':
                result.append('\n')
                append_text(child.text)
            elif child_tag == 'DL':
                process_list(child)
            elif child_tag == 'DT':
                emit_term(child)
            elif child_tag == 'DD':
                emit_definition(child)
            elif child_tag in table_tags:
                # Simplified table handling: placeholder only, content skipped.
                if child_tag in ('table', 'tgroup'):
                    result.append('\n[Tabelle]\n')
            else:
                process_element(child)

            # Text that follows the child inside the current element.
            append_text(child.tail)

    def process_list(dl_elem):
        """Emit the ``DT``/``DD`` children of a ``DL``; other children are ignored."""
        # No numbering is generated: the numbers are already part of the DT text.
        for child in dl_elem:
            if child.tag == 'DT':
                emit_term(child)
            elif child.tag == 'DD':
                emit_definition(child)

    process_element(elem)

    text = ''.join(result)

    # Put every parenthesised number "(1)", "(2)", ... on its own paragraph.
    # This applies to all occurrences, including ones in the middle of a
    # sentence.
    text = re.sub(r'\((\d+)\)', r'\n\n(\1)', text)

    # Collapse runs of blanks, drop blanks around line breaks, and reduce
    # three or more consecutive line breaks to one blank line.
    text = re.sub(r'[ \t]{2,}', ' ', text)
    text = re.sub(r'[ \t]?\n[ \t]?', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
|
||||
|
||||
def extract_paragraph_number(enbez_elem):
    """Return the paragraph number found in an ``enbez`` element.

    The first run of digits that follows a ``§`` sign (optionally separated
    by whitespace) is returned. For ranges such as ``§§ 1 bis 9`` the match
    starts at the second ``§``, so the first number of the range is
    returned without needing a separate pattern. Anything after the digits
    is ignored, e.g. ``§ 3a`` yields ``"3"``.

    Args:
        enbez_elem: Element whose ``text`` holds the designation, or ``None``.

    Returns:
        The number as a string of digits, or ``None`` when the element is
        ``None``, has no text, or contains no ``§`` followed by digits.
    """
    if enbez_elem is None or enbez_elem.text is None:
        return None

    match = re.search(r'§\s*(\d+)', enbez_elem.text)
    return match.group(1) if match else None
|
||||
|
||||
def extract_title(titel_elem):
    """Return the title of a paragraph as a single line of text.

    The text of the element and of all its descendants is concatenated, so
    a title containing inline markup is returned in full rather than being
    cut off at the first child element. The element's own tail is never
    included. Runs of whitespace (including line breaks) are collapsed to a
    single space.

    Args:
        titel_elem: The ``titel`` element, or ``None``.

    Returns:
        The title text, an empty string when the element contains no text,
        or ``None`` when ``titel_elem`` is ``None``.
    """
    if titel_elem is None:
        return None

    full_text = ''.join(titel_elem.itertext())
    return ' '.join(full_text.split())
|
||||
|
||||
def convert_xml_to_markdown(xml_file_path, output_base_dir='laws_md'):
    """Convert one law XML file into Markdown files, one per paragraph.

    The law name is taken from the directory that contains the XML file
    (expected layout: ``laws/<law>/<law>_<date>_<hash>.xml``). Output goes to
    ``<output_base_dir>/<law>/``: one ``§<num>.md`` per paragraph plus a
    ``README.md`` with the law title (``langue`` of the first ``norm``) and
    an index of links to the paragraph files.

    Only ``norm`` elements that have ``metadaten``, a paragraph number in
    ``enbez``, a ``textdaten/text/Content`` element and non-empty converted
    text are written.

    NOTE(review): the paragraph number consists of digits only, so entries
    such as "§ 3" and "§ 3a" map to the same file name and the later one
    overwrites the earlier one — confirm whether letter suffixes need support.

    Args:
        xml_file_path: Path of the XML file to convert.
        output_base_dir: Directory below which the per-law directory is
            created.

    Returns:
        ``True`` on success; ``False`` when the file cannot be read or
        parsed, or when the path has no parent directory component.
    """
    # Keep the try body minimal: only reading/parsing is expected to fail.
    try:
        root = ET.parse(xml_file_path).getroot()
    except ET.ParseError as e:
        print(f"FEHLER: XML-Parse-Fehler in {xml_file_path}: {e}")
        return False
    except OSError as e:
        # A missing or unreadable file is reported like a parse error so that
        # one bad file does not abort a whole batch run.
        print(f"FEHLER: Datei {xml_file_path} konnte nicht gelesen werden: {e}")
        return False

    # The name of the containing directory is the law name.
    path_parts = Path(xml_file_path).parts
    if len(path_parts) < 2:
        print(f"FEHLER: Ungültiger Dateipfad: {xml_file_path}")
        return False
    gesetz_name = path_parts[-2]

    output_dir = os.path.join(output_base_dir, gesetz_name)
    os.makedirs(output_dir, exist_ok=True)

    # Law metadata comes from the first norm element.
    gesetz_title = None
    first_norm = root.find('.//norm')
    if first_norm is not None:
        meta = first_norm.find('.//metadaten')
        if meta is not None:
            langue = meta.find('langue')
            if langue is not None and langue.text:
                gesetz_title = langue.text.strip()

    paragraphs = []
    for norm in root.findall('.//norm'):
        meta = norm.find('metadaten')
        if meta is None:
            continue

        # Skip norms that are not paragraphs.
        para_num = extract_paragraph_number(meta.find('enbez'))
        if para_num is None:
            continue

        para_title = extract_title(meta.find('titel'))

        textdaten = norm.find('textdaten')
        if textdaten is None:
            continue
        text_elem = textdaten.find('text')
        if text_elem is None:
            continue
        content = text_elem.find('Content')
        if content is None:
            continue

        markdown_text = xml_text_to_markdown(content)
        if not markdown_text or not markdown_text.strip():
            continue

        paragraphs.append({
            'num': para_num,
            'title': para_title,
            'text': markdown_text,
        })

    # Numeric sort; entries with equal numbers keep their document order.
    paragraphs.sort(key=lambda x: int(x['num']))

    readme_lines = [f"# {gesetz_name.upper()}\n\n"]
    if gesetz_title:
        readme_lines.append(f"**{gesetz_title}**\n\n")
    readme_lines.append("---\n\n")
    readme_lines.append(
        "Dieses Verzeichnis enthält die einzelnen Paragraphen dieses Gesetzes.\n\n"
    )

    for para in paragraphs:
        heading = f"§ {para['num']}"
        if para['title']:
            heading += f" {para['title']}"

        filepath = os.path.join(output_dir, f"§{para['num']}.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(f"# {heading}\n\n{para['text']}\n")

        readme_lines.append(f"- [{heading}](§{para['num']}.md)\n")

    # Write the README in one pass instead of reopening it per paragraph.
    readme_path = os.path.join(output_dir, 'README.md')
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.writelines(readme_lines)

    print(f"✓ Konvertiert: {gesetz_name} ({len(paragraphs)} Paragraphen)")
    return True
|
||||
|
||||
def convert_all_laws(laws_dir='laws', output_dir='laws_md'):
    """Convert the newest XML file of every law directory below ``laws_dir``.

    Each sub-directory of ``laws_dir`` is treated as one law. If it contains
    several ``.xml`` files, the one whose file name sorts last is converted
    via ``convert_xml_to_markdown``; directories without XML files and plain
    files are skipped. A summary is printed at the end.

    Args:
        laws_dir: Directory containing one sub-directory per law.
        output_dir: Base directory passed on for the Markdown output.

    Returns:
        ``True`` if at least one law was converted successfully, otherwise
        ``False`` (also when ``laws_dir`` does not exist).
    """
    if not os.path.exists(laws_dir):
        print(f"FEHLER: Verzeichnis {laws_dir} existiert nicht!")
        return False

    print(f"Konvertiere Gesetze von {laws_dir} nach {output_dir}...\n")

    # One boolean per attempted law: True = converted, False = failed.
    outcomes = []
    for entry in os.listdir(laws_dir):
        law_path = os.path.join(laws_dir, entry)
        if not os.path.isdir(law_path):
            continue

        candidates = [name for name in os.listdir(law_path) if name.endswith('.xml')]
        if not candidates:
            continue

        # With several versions present, the lexicographically last file
        # name is taken as the newest one.
        newest = max(candidates)
        converted = convert_xml_to_markdown(os.path.join(law_path, newest), output_dir)
        outcomes.append(bool(converted))

    converted_count = sum(outcomes)
    error_count = len(outcomes) - converted_count

    separator = "=" * 50
    print("\n" + separator)
    print("ZUSAMMENFASSUNG")
    print(separator)
    print(f"Erfolgreich konvertiert: {converted_count} Gesetze")
    if error_count > 0:
        print(f"Fehler: {error_count} Gesetze")
    print(f"\nAusgabe-Verzeichnis: {output_dir}/")

    return converted_count > 0
|
||||
|
||||
if __name__ == "__main__":
    import argparse
    import sys

    # Command-line interface: a single switch selects the production data.
    cli = argparse.ArgumentParser(
        description='Konvertiert XML-Gesetze zu Markdown-Dateien',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Beispiele:
python3 xml_to_markdown.py # Verwendet test_laws (falls vorhanden)
python3 xml_to_markdown.py --prod # Verwendet laws Verzeichnis
"""
    )
    cli.add_argument(
        '--prod',
        action='store_true',
        help='Verwendet das laws Verzeichnis statt test_laws'
    )
    options = cli.parse_args()

    # Pick input and output directories depending on the mode.
    if options.prod:
        # Production mode: "laws" is mandatory.
        laws_dir, output_dir = 'laws', 'laws_md'
        if not os.path.exists(laws_dir):
            print(f"FEHLER: Produktions-Verzeichnis '{laws_dir}' existiert nicht!")
            sys.exit(1)
        print("Produktionsmodus: Verwende laws Verzeichnis")
    elif os.path.exists('test_laws'):
        # Test mode: prefer "test_laws" when it is present.
        laws_dir, output_dir = 'test_laws', 'test_laws_md'
        print("Testmodus: Verwende test_laws Verzeichnis")
    elif os.path.exists('laws'):
        # Test mode without test data: fall back to "laws".
        laws_dir, output_dir = 'laws', 'laws_md'
        print("Hinweis: test_laws nicht gefunden, verwende laws Verzeichnis")
    else:
        print("FEHLER: Weder 'laws' noch 'test_laws' Verzeichnis gefunden!")
        print("Tipp: Verwende --prod um das laws Verzeichnis zu verwenden")
        sys.exit(1)

    # The exit status mirrors whether at least one law was converted.
    success = convert_all_laws(laws_dir, output_dir)
    sys.exit(0 if success else 1)
|
||||
|
||||
Reference in New Issue
Block a user