# Source file: lawgit/update_laws.py
# (Repository listing metadata: 94 lines, 3.8 KiB, Python)

import difflib
import hashlib
import io
import os
import sys
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse

import feedparser
import requests
# Remote endpoints on gesetze-im-internet.de.
RSS_URL = "https://www.gesetze-im-internet.de/aktuDienst-rss-feed.xml"  # feed of recent changes
TOC_URL = "https://www.gesetze-im-internet.de/gii-toc.xml"  # XML listing of <item> title/link pairs

# Local paths, relative to the working directory.
OUTPUT_DIR = "laws"  # root directory for the per-law snapshot folders
LAST_UPDATE_FILE = "last_update.txt"  # holds the ISO timestamp of the previous run
# Determine the cut-off for "new" feed entries: the timestamp stored by the
# previous run, or 24 hours ago when no usable timestamp is stored.
default_cutoff = datetime.now() - timedelta(days=1)
try:
    with open(LAST_UPDATE_FILE, 'r', encoding='utf-8') as f:
        last_update = datetime.fromisoformat(f.read().strip())
except FileNotFoundError:
    last_update = default_cutoff
except ValueError:
    # An empty or corrupt marker file must not abort the whole run.
    print(f"Ignoring unreadable timestamp in {LAST_UPDATE_FILE}.", file=sys.stderr)
    last_update = default_cutoff
if last_update.tzinfo is not None:
    # All comparisons below use naive local time, which is what
    # datetime.now().isoformat() stores in the marker file.
    last_update = last_update.astimezone().replace(tzinfo=None)

# Collect (title, 'YYYY-MM-DD') pairs for every feed entry published after the cut-off.
feed = feedparser.parse(RSS_URL)
if feed.get('bozo') and not feed.entries:
    # feedparser reports fetch/parse problems through `bozo` instead of raising,
    # so make the failure visible rather than looking like "nothing changed".
    print(f"Could not read feed {RSS_URL}: {feed.get('bozo_exception')}", file=sys.stderr)

new_changes = []
for entry in feed.entries:
    parsed = entry.get('published_parsed')
    if parsed is None:
        # Without a publication date the entry cannot be compared to the cut-off.
        print(
            f"Skipping feed entry without publication date: {entry.get('title', '')!r}",
            file=sys.stderr,
        )
        continue
    # feedparser normalises parsed dates to UTC; convert to naive local time
    # before comparing with the locally generated cut-off.
    published_utc = datetime(*parsed[:6], tzinfo=timezone.utc)
    published_local = published_utc.astimezone().replace(tzinfo=None)
    if published_local > last_update:
        title = entry.get('title', '').strip()
        # The date label keeps using the feed's own (UTC) calendar date so
        # that snapshot file names stay as before.
        new_changes.append((title, published_utc.strftime('%Y-%m-%d')))

if not new_changes:
    print("No new changes.")
    # sys.exit instead of the interactive-only `exit` helper from `site`.
    sys.exit(0)
# Load the table of contents: a mapping from law title to the URL that is
# later downloaded as that law's XML archive.
toc_response = requests.get(TOC_URL, timeout=30)
# Fail here on an HTTP error instead of trying to parse an error page as XML.
toc_response.raise_for_status()
toc_root = ET.fromstring(toc_response.content)
toc_laws = {}
for item in toc_root.findall('item'):
    # findtext() yields None for a missing child; incomplete items are
    # skipped instead of crashing on `.text.strip()`.
    title = (item.findtext('title') or '').strip()
    link = (item.findtext('link') or '').strip()
    if title and link:
        toc_laws[title] = link
# For every new feed entry: find the TOC law whose title is most similar,
# download its XML archive and store a snapshot when the law's "Stand"
# (status) comment differs from the one recorded for the previous snapshot.
# Network errors and malformed archives/XML are not caught and abort the run.
updated = False
for change_title, change_date in new_changes:
    # SequenceMatcher caches its analysis of the second sequence, so the feed
    # title stays there and only the TOC title is swapped per comparison.
    matcher = difflib.SequenceMatcher(None, '', change_title.lower())
    scores = {}
    for toc_title in toc_laws:
        matcher.set_seq1(toc_title.lower())
        scores[toc_title] = matcher.ratio()
    match = max(scores, key=scores.get, default=None)
    if not match or scores[match] <= 0.8:
        # No TOC title is similar enough to be treated as the same law.
        continue

    link = toc_laws[match]
    # Links are expected to end in "<abbreviation>/xml.zip", so the law's
    # abbreviation is the parent directory of the path. Taking the basename
    # of the full path would always yield "xml.zip".
    abbrev = os.path.basename(os.path.dirname(urlparse(link).path))
    if abbrev in ('', '.', '..'):
        print(f"Skipping {match}: cannot derive a directory name from {link}", file=sys.stderr)
        continue

    zip_response = requests.get(link, timeout=60)
    if zip_response.status_code != 200:
        print(f"Skipping {match}: HTTP {zip_response.status_code} for {link}", file=sys.stderr)
        continue

    with zipfile.ZipFile(io.BytesIO(zip_response.content)) as archive:
        xml_files = [name for name in archive.namelist() if name.endswith('.xml')]
        if not xml_files:
            continue
        # Only the first XML member of the archive is used.
        xml_content = archive.read(xml_files[0])
    xml_root = ET.fromstring(xml_content)

    # Extract the text of <standkommentar> inside the first <standangabe>;
    # laws without it (or without <metadaten>) hash the empty string.
    stand_comment = ''
    meta = xml_root.find('.//metadaten')
    stand_elem = meta.find('standangabe') if meta is not None else None
    if stand_elem is not None:
        comment_elem = stand_elem.find('standkommentar')
        if comment_elem is not None:
            stand_comment = ET.tostring(comment_elem, encoding='unicode', method='text').strip()
    # MD5 serves only as a change fingerprint here, not as a security measure.
    new_stand_hash = hashlib.md5(stand_comment.encode()).hexdigest()

    # The fingerprint of the last stored snapshot lives in last_stand.txt
    # inside the law's own directory.
    law_dir = os.path.join(OUTPUT_DIR, abbrev)
    os.makedirs(law_dir, exist_ok=True)
    stand_file = os.path.join(law_dir, 'last_stand.txt')
    try:
        with open(stand_file, 'r', encoding='utf-8') as f:
            old_stand_hash = f.read().strip()
    except FileNotFoundError:
        old_stand_hash = ''
    if new_stand_hash == old_stand_hash:
        continue

    file_path = os.path.join(law_dir, f"{abbrev}_{change_date}_{new_stand_hash[:8]}.xml")
    with open(file_path, 'wb') as f:
        f.write(xml_content)
    # Record the fingerprint only after the snapshot itself has been written.
    with open(stand_file, 'w', encoding='utf-8') as f:
        f.write(new_stand_hash)
    print(f"Updated: {match} as {file_path}")
    updated = True
# Persist the time of this run; it is read back as the cut-off for feed
# entries on the next run.
run_timestamp = datetime.now().isoformat()
with open(LAST_UPDATE_FILE, 'w') as marker_file:
    marker_file.write(run_timestamp)

# Report only when at least one law snapshot was written during this run.
if updated:
    print("Changes committed.")