# lawgit/update_laws.py

import calendar
import difflib
import hashlib
import io
import os
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timedelta
from urllib.parse import urlparse

import feedparser
import requests

# Remote sources on gesetze-im-internet.de: the RSS feed announcing changes
# and the XML table of contents listing every law with its download link.
RSS_URL = "https://www.gesetze-im-internet.de/aktuDienst-rss-feed.xml"
TOC_URL = "https://www.gesetze-im-internet.de/gii-toc.xml"

# Local state: directory receiving the downloaded law XML files, and the file
# holding the ISO timestamp of the previous run.
OUTPUT_DIR = "laws"
LAST_UPDATE_FILE = "last_update.txt"

# Load the timestamp of the last global update.
# Default: look back one day when no usable timestamp has been saved yet.
last_update = datetime.now() - timedelta(days=1)
if os.path.exists(LAST_UPDATE_FILE):
    with open(LAST_UPDATE_FILE, 'r') as f:
        saved_timestamp = f.read().strip()
    try:
        last_update = datetime.fromisoformat(saved_timestamp)
    except ValueError:
        # An empty or corrupted state file (e.g. left behind by an interrupted
        # write) must not abort the run; keep the one-day default instead.
        print(f"Ignoring unreadable timestamp in {LAST_UPDATE_FILE}; checking the last day.")

# Collect (title, date) for every feed entry published after the last update.
feed = feedparser.parse(RSS_URL)
new_changes = []

for entry in feed.entries:
    published = entry.get('published_parsed')
    if published is None:
        # Without a parseable publication date the entry cannot be compared
        # against last_update; skip it instead of crashing on the unpacking.
        continue
    # feedparser reports parsed dates as UTC struct_time, whereas last_update
    # is a naive local-time datetime. Convert via the epoch so that both sides
    # of the comparison are naive local times.
    pub_date = datetime.fromtimestamp(calendar.timegm(published))
    if pub_date > last_update:
        title = entry.title.strip()
        new_changes.append((title, pub_date.strftime('%Y-%m-%d')))

# Nothing to do: stop before any download and before touching the saved
# timestamp.
if not new_changes:
    print("No new changes.")
    # The bare exit() helper is injected by the site module for interactive
    # sessions and is not guaranteed to exist; SystemExit always is.
    raise SystemExit(0)

# Load the table of contents and build a mapping of law title -> download link.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the TOC.
toc_response = requests.get(TOC_URL, timeout=60)
toc_response.raise_for_status()
toc_root = ET.fromstring(toc_response.content)

toc_laws = {}
for item in toc_root.findall('item'):
    toc_title = item.findtext('title')
    toc_link = item.findtext('link')
    # Skip incomplete entries instead of failing on a missing element or text.
    if toc_title and toc_link:
        toc_laws[toc_title.strip()] = toc_link.strip()

# For each announced change: find the best-matching law in the TOC, download
# its XML archive and store the XML when its status note ("Stand") differs
# from the one recorded for that law on a previous run.
updated = False
for change_title, change_date in new_changes:
    wanted = change_title.lower()
    # Score every TOC title once and keep the highest-scoring one.
    best_score, match = max(
        (
            (difflib.SequenceMatcher(None, toc_title.lower(), wanted).ratio(), toc_title)
            for toc_title in toc_laws
        ),
        key=lambda scored: scored[0],
        default=(0.0, None),
    )
    if not match or best_score <= 0.8:
        print(f"No matching law found for: {change_title}")
        continue

    zip_url = toc_laws[match]
    # TOC links have the form ".../<abbrev>/xml.zip", so the abbreviation is
    # the parent directory of the file name. Taking the basename first would
    # always yield "xml.zip" and put every law into the same directory.
    abbrev = os.path.basename(os.path.dirname(urlparse(zip_url).path))
    if not abbrev:
        print(f"Cannot derive an abbreviation from link: {zip_url}")
        continue

    # Network errors are deliberately not caught: an exception aborts the run
    # before the global timestamp is rewritten, so the change is retried later.
    zip_response = requests.get(zip_url, timeout=60)
    if zip_response.status_code != 200:
        continue

    with zipfile.ZipFile(io.BytesIO(zip_response.content)) as archive:
        xml_files = [name for name in archive.namelist() if name.endswith('.xml')]
        if not xml_files:
            continue
        xml_content = archive.read(xml_files[0])
    xml_root = ET.fromstring(xml_content)

    # Extract the status comment; it stays empty when any element on the path
    # metadaten/standangabe/standkommentar is missing.
    stand_comment = ''
    meta = xml_root.find('.//metadaten')
    stand_elem = meta.find('standangabe') if meta is not None else None
    if stand_elem is not None:
        comment_elem = stand_elem.find('standkommentar')
        if comment_elem is not None:
            stand_comment = ET.tostring(comment_elem, encoding='unicode', method='text').strip()
    # MD5 serves only as a change fingerprint here, not for security.
    new_stand_hash = hashlib.md5(stand_comment.encode()).hexdigest()

    # Compare against the locally recorded status (last_stand.txt in law_dir).
    law_dir = os.path.join(OUTPUT_DIR, abbrev)
    os.makedirs(law_dir, exist_ok=True)
    stand_file = os.path.join(law_dir, 'last_stand.txt')
    old_stand_hash = ''
    if os.path.exists(stand_file):
        with open(stand_file, 'r') as f:
            old_stand_hash = f.read().strip()

    if new_stand_hash == old_stand_hash:
        continue

    # Store the new version, then record its fingerprint for the next run.
    file_path = os.path.join(law_dir, f"{abbrev}_{change_date}_{new_stand_hash[:8]}.xml")
    with open(file_path, 'wb') as f:
        f.write(xml_content)
    with open(stand_file, 'w') as f:
        f.write(new_stand_hash)
    print(f"Updated: {match} as {file_path}")
    updated = True

# Persist the time of this run as the new global update marker.
# NOTE(review): the timestamp is taken here, at the end of the run, not when
# the feed was fetched — confirm that entries published in between are not
# meant to be picked up by the next run.
finished_at = datetime.now().isoformat()
with open(LAST_UPDATE_FILE, 'w') as state_file:
    state_file.write(finished_at)

# Report only when at least one law file was written during this run.
if updated is not False and updated:
    print("Changes committed.")