"""Sync updated German federal laws from gesetze-im-internet.de.

Reads the aktuDienst RSS feed for laws changed since the last recorded
run, fuzzy-matches each change title against the site's table of
contents (TOC), downloads the matching law's XML zip bundle, and stores
a new snapshot whenever the law's "Stand" (version note) hash differs
from the last stored one.
"""

import hashlib
import io
import os
import sys
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timedelta
from difflib import SequenceMatcher
from urllib.parse import urlparse

RSS_URL = 'https://www.gesetze-im-internet.de/aktuDienst-rss-feed.xml'
TOC_URL = 'https://www.gesetze-im-internet.de/gii-toc.xml'
OUTPUT_DIR = 'laws'
LAST_UPDATE_FILE = 'last_update.txt'
REQUEST_TIMEOUT = 30  # seconds; original requests.get calls could hang forever
MATCH_THRESHOLD = 0.8  # minimum fuzzy ratio between feed title and TOC title


def extract_abbrev(link):
    """Return the law's URL slug (e.g. 'bgb') from a TOC link.

    Bug fix: the original took os.path.basename() of a path like
    '/bgb/xml.zip', which is 'xml.zip' (contains no slash), so the
    subsequent .replace('/xml.zip', '') was a no-op and every law
    collapsed into the same 'laws/xml.zip' directory. The slug is the
    parent directory's name, not the file's.
    """
    path = urlparse(link).path
    return os.path.basename(os.path.dirname(path)) or path.strip('/')


def title_similarity(a, b):
    """Case-insensitive fuzzy similarity ratio between two titles."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()


def best_toc_match(change_title, toc_laws):
    """Return the TOC title most similar to *change_title*.

    Returns None when the TOC is empty or the best candidate does not
    clear MATCH_THRESHOLD (same > 0.8 rule as the original).
    """
    if not toc_laws:
        return None
    best = max(toc_laws, key=lambda title: title_similarity(title, change_title))
    return best if title_similarity(best, change_title) > MATCH_THRESHOLD else None


def load_last_update():
    """Read the last global run timestamp, defaulting to 24 hours ago."""
    if os.path.exists(LAST_UPDATE_FILE):
        with open(LAST_UPDATE_FILE, 'r') as f:
            return datetime.fromisoformat(f.read().strip())
    return datetime.now() - timedelta(days=1)


def save_last_update():
    """Persist 'now' as the last global run timestamp."""
    with open(LAST_UPDATE_FILE, 'w') as f:
        f.write(datetime.now().isoformat())


def fetch_new_changes(since):
    """Return [(title, 'YYYY-MM-DD'), ...] for feed entries newer than *since*."""
    import feedparser  # third-party; imported lazily so pure helpers stay importable
    feed = feedparser.parse(RSS_URL)
    changes = []
    for entry in feed.entries:
        published = getattr(entry, 'published_parsed', None)
        if published is None:
            # Entry without a parseable date can't be compared; the
            # original crashed here with a TypeError.
            continue
        pub_date = datetime(*published[:6])
        if pub_date > since:
            changes.append((entry.title.strip(), pub_date.strftime('%Y-%m-%d')))
    return changes


def fetch_toc():
    """Download the TOC and return a {law title: zip link} mapping."""
    import requests  # third-party; imported lazily so pure helpers stay importable
    response = requests.get(TOC_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # original silently parsed error pages
    root = ET.fromstring(response.content)
    toc = {}
    for item in root.findall('item'):
        title = item.findtext('title')
        link = item.findtext('link')
        if title and link:  # skip malformed items instead of crashing on None.strip()
            toc[title.strip()] = link.strip()
    return toc


def stand_comment_hash(xml_root):
    """MD5 hex digest of the law's 'standkommentar' text ('' when absent).

    MD5 is used only as a change-detection fingerprint, not for security.
    """
    comment = ''
    meta = xml_root.find('.//metadaten')
    # Original crashed with AttributeError when <metadaten> was missing.
    if meta is not None:
        stand = meta.find('standangabe')
        if stand is not None:
            elem = stand.find('standkommentar')
            if elem is not None:
                comment = ET.tostring(elem, encoding='unicode', method='text').strip()
    return hashlib.md5(comment.encode()).hexdigest()


def download_law_xml(zip_url):
    """Download the law's zip bundle; return the first XML member's bytes or None."""
    import requests  # third-party; imported lazily so pure helpers stay importable
    response = requests.get(zip_url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return None
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        xml_files = [name for name in z.namelist() if name.endswith('.xml')]
        if not xml_files:
            return None
        return z.read(xml_files[0])


def store_if_changed(abbrev, xml_content, change_date):
    """Write a new snapshot when the law's version hash differs.

    Returns the written file path, or None when the stored hash matches
    (no change). Also updates the per-law 'last_stand.txt' marker.
    """
    new_hash = stand_comment_hash(ET.fromstring(xml_content))
    law_dir = os.path.join(OUTPUT_DIR, abbrev)
    os.makedirs(law_dir, exist_ok=True)
    stand_file = os.path.join(law_dir, 'last_stand.txt')
    old_hash = ''
    if os.path.exists(stand_file):
        with open(stand_file, 'r') as f:
            old_hash = f.read().strip()
    if new_hash == old_hash:
        return None
    file_path = os.path.join(law_dir, f"{abbrev}_{change_date}_{new_hash[:8]}.xml")
    with open(file_path, 'wb') as f:
        f.write(xml_content)
    with open(stand_file, 'w') as f:
        f.write(new_hash)
    return file_path


def main():
    """Run one sync pass: feed -> TOC match -> download -> snapshot."""
    new_changes = fetch_new_changes(load_last_update())
    if not new_changes:
        print("No new changes.")
        sys.exit(0)  # was builtin exit(); sys.exit is the correct call in scripts
    toc_laws = fetch_toc()
    updated = False
    for change_title, change_date in new_changes:
        match = best_toc_match(change_title, toc_laws)
        if match is None:
            continue
        link = toc_laws[match]
        xml_content = download_law_xml(link)
        if xml_content is None:
            continue
        file_path = store_if_changed(extract_abbrev(link), xml_content, change_date)
        if file_path is not None:
            print(f"Updated: {match} as {file_path}")
            updated = True
    # As in the original: the global timestamp advances even when some
    # individual downloads failed (best-effort semantics preserved).
    save_last_update()
    if updated:
        print("Changes committed.")


if __name__ == '__main__':
    main()