update_laws.py: Nutze RSS-Summary statt Title für Matching, 129 Gesetze aktualisiert

- RSS entry.summary enthält den Gesetzes-Namen, entry.title nur BGBl-Nr
- Kern-Gesetznamen werden aus Änderungs-Beschreibungen extrahiert
- ETag/Last-Modified Caching für effiziente Updates
- Metadaten werden in laws_metadata.json gespeichert
- last_update.txt auf Oktober 2025 zurückgesetzt für erneuten Test
This commit is contained in:
2026-02-20 20:29:11 +01:00
parent b03835f75e
commit b0c100f312
262 changed files with 5513 additions and 68 deletions

View File

@@ -5,90 +5,305 @@ from urllib.parse import urlparse
import zipfile
import io
import os
import re
import json
import time
import hashlib
from datetime import datetime, timedelta
import difflib
import hashlib
# RSS feed of recent legislative changes ("Aktualitätendienst").
RSS_URL = 'https://www.gesetze-im-internet.de/aktuDienst-rss-feed.xml'
# Table of contents listing every law with its xml.zip download link.
TOC_URL = 'https://www.gesetze-im-internet.de/gii-toc.xml'
# Directory under which downloaded law XML files are stored (one subdir per law).
OUTPUT_DIR = 'laws'
# State file holding the timestamp of the last successful run (ISO format).
LAST_UPDATE_FILE = 'last_update.txt'
# JSON cache of per-law HTTP metadata (ETag / Last-Modified / last_checked).
METADATA_FILE = 'laws_metadata.json'
# Load the last global update timestamp.
# NOTE(review): this top-level block is the removed (pre-refactor) side of the
# diff — superseded by load_last_update() below; indentation restored from the
# obvious if/with structure.
if os.path.exists(LAST_UPDATE_FILE):
    with open(LAST_UPDATE_FILE, 'r') as f:
        last_update = datetime.fromisoformat(f.read().strip())
else:
    last_update = datetime.now() - timedelta(days=1)
# Delay between downloads in seconds (avoid rate limiting).
DOWNLOAD_DELAY = 0.5
# NOTE(review): removed (pre-refactor) side of the diff — superseded by the
# RSS handling in main(); indentation restored from the loop structure.
# Collect (title, date) tuples for feed entries newer than last_update.
feed = feedparser.parse(RSS_URL)
new_changes = []
for entry in feed.entries:
    pub_date = datetime(*entry.published_parsed[:6])
    if pub_date > last_update:
        title = entry.title.strip()
        new_changes.append((title, pub_date.strftime('%Y-%m-%d')))
def load_last_update():
    """Return the timestamp of the previous successful run.

    Falls back to one week ago when no state file exists yet, so a fresh
    checkout still picks up recent changes.
    """
    if not os.path.exists(LAST_UPDATE_FILE):
        return datetime.now() - timedelta(days=7)
    with open(LAST_UPDATE_FILE, 'r') as f:
        stamp = f.read().strip()
    return datetime.fromisoformat(stamp)
# NOTE(review): removed (pre-refactor) side of the diff — superseded by
# load_toc() below; indentation restored from the if structure.
if not new_changes:
    print("No new changes.")
    exit(0)
# Load the TOC and build a title -> download-link mapping.
toc_response = requests.get(TOC_URL)
toc_root = ET.fromstring(toc_response.content)
toc_laws = {item.find('title').text.strip(): item.find('link').text.strip() for item in toc_root.findall('item')}
def save_last_update():
    """Persist the current wall-clock time as the most recent update timestamp."""
    stamp = datetime.now().isoformat()
    with open(LAST_UPDATE_FILE, 'w') as f:
        f.write(stamp)
# NOTE(review): removed (pre-refactor) diff fragment. Its original indentation
# was lost in extraction and the nested for/if/with structure continues in a
# later fragment, so it is kept verbatim rather than reconstructed.
updated = False
for change_title, change_date in new_changes:
match = max(toc_laws.keys(), key=lambda t: difflib.SequenceMatcher(None, t.lower(), change_title.lower()).ratio(), default=None)
if match and difflib.SequenceMatcher(None, match.lower(), change_title.lower()).ratio() > 0.8:
link = toc_laws[match]
abbrev = os.path.basename(urlparse(link).path).replace('/xml.zip', '')
zip_url = link
zip_response = requests.get(zip_url)
if zip_response.status_code == 200:
with zipfile.ZipFile(io.BytesIO(zip_response.content)) as z:
xml_files = [f for f in z.namelist() if f.endswith('.xml')]
if xml_files:
xml_content = z.read(xml_files[0])
xml_root = ET.fromstring(xml_content)
def load_metadata():
    """Load the cached per-law metadata (ETags, Last-Modified, ...) from disk.

    Returns an empty dict when no metadata file has been written yet.
    """
    if not os.path.exists(METADATA_FILE):
        return {}
    with open(METADATA_FILE, 'r') as f:
        return json.load(f)
# NOTE(review): removed (pre-refactor) diff fragment, cut at both ends by the
# interleaving and stripped of indentation — kept verbatim. It extracted the
# "Stand" comment and compared hashes; superseded by download_and_save_law().
meta = xml_root.find('.//metadaten')
stand_comment = ''
stand_elem = meta.find('standangabe')
if stand_elem is not None:
comment_elem = stand_elem.find('standkommentar')
if comment_elem is not None:
stand_comment = ET.tostring(comment_elem, encoding='unicode', method='text').strip()
new_stand_hash = hashlib.md5(stand_comment.encode()).hexdigest()
# Check the locally stored "Stand" (in last_stand.txt inside law_dir).
law_dir = os.path.join(OUTPUT_DIR, abbrev)
os.makedirs(law_dir, exist_ok=True)
stand_file = os.path.join(law_dir, 'last_stand.txt')
old_stand_hash = ''
if os.path.exists(stand_file):
with open(stand_file, 'r') as f:
old_stand_hash = f.read().strip()
def save_metadata(metadata):
    """Write the per-law metadata cache to disk as pretty-printed JSON."""
    payload = json.dumps(metadata, indent=2, ensure_ascii=False)
    with open(METADATA_FILE, 'w') as f:
        f.write(payload)
# NOTE(review): removed (pre-refactor) diff fragment, stripped of indentation
# and cut out of its enclosing loop — kept verbatim. It wrote the changed law
# file plus its "Stand" hash; superseded by download_and_save_law().
if new_stand_hash != old_stand_hash:
# Extract the date
date_str = meta.find('ausfertigung-datum').text if meta.find('ausfertigung-datum') is not None else change_date
file_path = os.path.join(law_dir, f"{abbrev}_{change_date}_{new_stand_hash[:8]}.xml")
with open(file_path, 'wb') as f:
f.write(xml_content)
with open(stand_file, 'w') as f:
f.write(new_stand_hash)
print(f"Updated: {match} as {file_path}")
updated = True
def get_abbrev_from_url(url):
    """Extract the law abbreviation from a TOC download URL.

    Example: http://www.gesetze-im-internet.de/bgb/xml.zip -> 'bgb'

    Returns:
        The first path segment, or None when the URL has no usable path.
        (Previously a bare-host URL such as 'http://example.com/' yielded
        the empty string; both are falsy, so callers using `if abbrev:`
        are unaffected.)
    """
    path = urlparse(url).path
    # Path looks like /bgb/xml.zip; the first segment is the abbreviation.
    parts = path.strip('/').split('/')
    if parts and parts[0]:
        return parts[0]
    return None
# Persist the global update timestamp.
# NOTE(review): removed (pre-refactor) tail of the old script — superseded by
# save_last_update() above; indentation restored from the with/if structure.
with open(LAST_UPDATE_FILE, 'w') as f:
    f.write(datetime.now().isoformat())
if updated:
    print("Changes committed.")
def extract_core_law_name(summary):
    """Extract the core statute name from an RSS summary line.

    Example:
        'Drittes Gesetz zur Änderung des Gesetzes gegen den unlauteren
        Wettbewerb vom 12. Februar 2026'
        -> 'Gesetzes gegen den unlauteren Wettbewerb'

    Falls back to the (date-stripped) summary when no amendment phrasing
    is recognized.
    """
    # Drop a trailing date such as "vom 12. Februar 2026".
    text = re.sub(r'\s+vom\s+\d{1,2}\.\s+\w+\s+\d{4}\s*$', '', summary.strip())
    # Try the stricter "... Änderung des/der <NAME>" phrasing first, then
    # the looser variant without the genitive article.
    for pattern in (
        r'(?:Änderung|Neufassung|Aufhebung)\s+(?:des|der)\s+(.+?)(?:\s*$)',
        r'(?:Änderung|Neufassung|Aufhebung)\s+(.+?)(?:\s*$)',
    ):
        hit = re.search(pattern, text)
        if hit is not None:
            return hit.group(1).strip()
    return text
def load_toc():
    """Download the table of contents of all laws from gesetze-im-internet.de.

    Returns:
        dict mapping abbreviation -> {'title': ..., 'link': ...}.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).

    Malformed TOC items (missing or empty <title>/<link>) are skipped;
    previously `item.find('title').text` raised AttributeError on such an
    item and aborted the whole run.
    """
    print("Lade TOC von gesetze-im-internet.de...")
    toc_response = requests.get(TOC_URL, timeout=30)
    toc_response.raise_for_status()
    toc_root = ET.fromstring(toc_response.content)
    toc = {}
    for item in toc_root.findall('item'):
        title_elem = item.find('title')
        link_elem = item.find('link')
        # Guard against malformed items instead of crashing on .text/.strip().
        if title_elem is None or link_elem is None:
            continue
        if not title_elem.text or not link_elem.text:
            continue
        title = title_elem.text.strip()
        link = link_elem.text.strip()
        abbrev = get_abbrev_from_url(link)
        if abbrev:
            toc[abbrev] = {
                'title': title,
                'link': link,
            }
    print(f" {len(toc)} Gesetze in der TOC gefunden")
    return toc
def find_matching_law(summary, toc):
    """Find the TOC entry matching an RSS summary via fuzzy title matching.

    Two passes: the full summary must score >= 0.8 against a TOC title;
    otherwise the extracted core statute name may improve the best score,
    which is then accepted at the lower bar of 0.7.

    Returns:
        (abbrev, toc_entry) on success, (None, None) otherwise.
    """
    def _score(candidate_title, text):
        return difflib.SequenceMatcher(None, candidate_title.lower(), text.lower()).ratio()

    best_abbrev = None
    best_ratio = 0
    # Pass 1: full summary against every TOC title (first max wins ties).
    for abbrev, entry in toc.items():
        ratio = _score(entry['title'], summary)
        if ratio > best_ratio:
            best_ratio, best_abbrev = ratio, abbrev
    if best_ratio >= 0.8:
        return best_abbrev, toc[best_abbrev]
    # Pass 2: retry with the extracted core statute name, if it differs.
    core_name = extract_core_law_name(summary)
    if core_name != summary:
        for abbrev, entry in toc.items():
            ratio = _score(entry['title'], core_name)
            if ratio > best_ratio:
                best_ratio, best_abbrev = ratio, abbrev
    if best_ratio >= 0.7:
        return best_abbrev, toc[best_abbrev]
    return None, None
def download_and_save_law(abbrev, link, change_date, metadata):
    """Download one law archive and save it locally if its content changed.

    Change detection is two-layered: a conditional HTTP request (ETag /
    If-Modified-Since from *metadata*) skips unchanged archives server-side,
    and an MD5 hash of the law's "Stand" comment skips archives whose legal
    state is unchanged despite a fresh download.

    Args:
        abbrev: law abbreviation; used as subdirectory and file-name prefix.
        link: URL of the law's xml.zip archive.
        change_date: publication date string (YYYY-MM-DD) from the RSS feed,
            embedded in the saved file name.
        metadata: per-law cache dict; this function updates metadata[abbrev]
            in place with the response's ETag/Last-Modified headers.

    Returns:
        True when a new version of the law was written to disk, else False.
        All errors are caught, logged, and reported as False (best-effort).
    """
    try:
        # Send cached validators so the server can answer 304 Not Modified.
        headers = {}
        if abbrev in metadata:
            if 'etag' in metadata[abbrev]:
                headers['If-None-Match'] = metadata[abbrev]['etag']
            if 'last_modified' in metadata[abbrev]:
                headers['If-Modified-Since'] = metadata[abbrev]['last_modified']
        response = requests.get(link, headers=headers, timeout=60)
        if response.status_code == 304:
            # Not modified on the server side.
            return False
        if response.status_code != 200:
            print(f" WARNUNG: HTTP {response.status_code} für {abbrev}")
            return False
        # Refresh the cache headers for the next run. NOTE(review): this
        # happens before the content comparison below, so metadata is
        # updated even when the law itself turns out to be unchanged.
        metadata[abbrev] = {
            'etag': response.headers.get('ETag', ''),
            'last_modified': response.headers.get('Last-Modified', ''),
            'last_checked': datetime.now().isoformat(),
        }
        # Unpack the ZIP; the first XML member is taken as the law text.
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            xml_files = [f for f in z.namelist() if f.endswith('.xml')]
            if not xml_files:
                print(f" WARNUNG: Keine XML-Datei in {abbrev}/xml.zip")
                return False
            xml_content = z.read(xml_files[0])
        # Parse the XML and derive the change-detection hashes.
        xml_root = ET.fromstring(xml_content)
        content_hash = hashlib.md5(xml_content).hexdigest()[:8]
        # Extract the "Stand" (version) comment from the metadata element.
        meta = xml_root.find('.//metadaten')
        stand_comment = ''
        if meta is not None:
            stand_elem = meta.find('standangabe')
            if stand_elem is not None:
                comment_elem = stand_elem.find('standkommentar')
                if comment_elem is not None:
                    stand_comment = ET.tostring(comment_elem, encoding='unicode', method='text').strip()
        stand_hash = hashlib.md5(stand_comment.encode()).hexdigest()
        # Compare with the locally stored "Stand" hash.
        law_dir = os.path.join(OUTPUT_DIR, abbrev)
        os.makedirs(law_dir, exist_ok=True)
        stand_file = os.path.join(law_dir, 'last_stand.txt')
        old_stand_hash = ''
        if os.path.exists(stand_file):
            with open(stand_file, 'r') as f:
                old_stand_hash = f.read().strip()
        if stand_hash == old_stand_hash:
            return False
        # Save the new version and remember its "Stand" hash.
        file_path = os.path.join(law_dir, f"{abbrev}_{change_date}_{content_hash}.xml")
        with open(file_path, 'wb') as f:
            f.write(xml_content)
        with open(stand_file, 'w') as f:
            f.write(stand_hash)
        return True
    except Exception as e:
        print(f" FEHLER bei {abbrev}: {e}")
        return False
def main():
    """Entry point: fetch the RSS feed of legislative changes, match every
    new entry against the TOC, and download each matched law that changed
    since the last run. State (timestamp + HTTP metadata) is persisted at
    the end — and also when there is nothing new to do.
    """
    last_update = load_last_update()
    metadata = load_metadata()
    print(f"Letztes Update: {last_update.strftime('%Y-%m-%d %H:%M')}")
    print()
    # 1. Load the RSS feed.
    print("Lade RSS-Feed...")
    feed = feedparser.parse(RSS_URL)
    new_entries = []
    for entry in feed.entries:
        pub_date = datetime(*entry.published_parsed[:6])
        if pub_date > last_update:
            # The summary carries the law's name (entry.title is only the
            # BGBl number), so the summary is what gets matched later.
            summary = entry.get('summary', entry.title).strip()
            new_entries.append({
                'title': entry.title.strip(),
                'summary': summary,
                'date': pub_date.strftime('%Y-%m-%d'),
                'link': entry.get('link', ''),
            })
    print(f" {len(new_entries)} neue Einträge seit letztem Update")
    if not new_entries:
        print("Keine neuen Änderungen.")
        save_last_update()
        save_metadata(metadata)
        # NOTE(review): exit() inside main() makes the function awkward to
        # reuse or test; a plain return would behave the same here.
        exit(0)
    # 2. Load the TOC.
    toc = load_toc()
    print()
    # 3. Match laws and download them.
    updated_count = 0
    skipped_count = 0
    not_found_count = 0
    seen_abbrevs = set()  # avoid handling the same law twice per run
    for i, entry in enumerate(new_entries):
        summary = entry['summary']
        change_date = entry['date']
        # Find the law in the TOC.
        abbrev, toc_entry = find_matching_law(summary, toc)
        if abbrev is None:
            print(f" [{i+1}/{len(new_entries)}] NICHT GEFUNDEN: {summary[:70]}")
            not_found_count += 1
            continue
        if abbrev in seen_abbrevs:
            # Already processed in this run.
            continue
        seen_abbrevs.add(abbrev)
        print(f" [{i+1}/{len(new_entries)}] {abbrev}: {toc_entry['title'][:60]}")
        # Download and save.
        updated = download_and_save_law(abbrev, toc_entry['link'], change_date, metadata)
        if updated:
            print(f" -> AKTUALISIERT")
            updated_count += 1
        else:
            print(f" -> unverändert")
            skipped_count += 1
        time.sleep(DOWNLOAD_DELAY)
    # 4. Persist state and print a summary.
    save_last_update()
    save_metadata(metadata)
    print()
    print("=" * 50)
    print("ZUSAMMENFASSUNG")
    print("=" * 50)
    print(f" Neue RSS-Einträge: {len(new_entries)}")
    print(f" Aktualisiert: {updated_count}")
    print(f" Unverändert: {skipped_count}")
    print(f" Nicht gefunden: {not_found_count}")
    if updated_count > 0:
        print(f"\n{updated_count} Gesetze aktualisiert.")
    else:
        print("\nKeine Gesetze aktualisiert.")
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()