diff --git a/dist/hfdb_xml_converter-0.1-py3-none-any.whl b/dist/hfdb_xml_converter-0.1-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..1d80db8d34da817d439a54f203a6e55900719273 Binary files /dev/null and b/dist/hfdb_xml_converter-0.1-py3-none-any.whl differ diff --git a/dist/hfdb_xml_converter-0.1.tar.gz b/dist/hfdb_xml_converter-0.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e70851cd19491f7baff4a36296c1f61a6761d61d Binary files /dev/null and b/dist/hfdb_xml_converter-0.1.tar.gz differ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..1b68d94ecd0dc92bb45c7722df8642e0afe836e1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..50f4798907552b50c9950bb7f3b1924ce65d811c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[metadata] +name = hfdb_xml_converter +version = 0.1 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..d20859edac6257b4f10b3da3b3bf6d9d89bad2ca --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup, find_packages + +setup( + name="hfdb_xml_converter", + version="0.1", + package_dir={"": "src"}, + packages=find_packages(where="src"), + install_requires=[], + author="Jan Seipel", + author_email="jan.seipel@swr.de", + description="Ein Paket zum Parsen der XML-Datei einer HFDB Merkliste und Konvertieren in JSON", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", +) \ No newline at end of file diff --git a/src/hfdb_xml_converter.egg-info/PKG-INFO b/src/hfdb_xml_converter.egg-info/PKG-INFO new file mode 100644 index 
0000000000000000000000000000000000000000..74ef0e807bba7ece1e09612684e876d278254d62 --- /dev/null +++ b/src/hfdb_xml_converter.egg-info/PKG-INFO @@ -0,0 +1,57 @@ +Metadata-Version: 2.1 +Name: hfdb_xml_converter +Version: 0.1 +Summary: Ein Paket zum Parsen der XML-Datei einer HFDB Merkliste und Konvertieren in JSON +Author: Jan Seipel +Author-email: jan.seipel@swr.de +Description-Content-Type: text/markdown + +# hfdb_xml_converter + +XML-Datei einer Merkliste auf Korpusebene auslesen und in ein JSON-Format übersetzen. + +## Installation + +## Verwendung + +1. XML-Datei einer HFDB Merkliste aus dem HFDB-Rich-Client herunterladen (Datei -> Export -> XML) +2. Datei mit Python-Script laden und als JSON ausgeben lassen: + +```python +from hfdb_xml_converter import xml_to_json + +input_file = './data/konf_data.xml' +output_dir = './output' +output_file = xml_to_json(input_file, output_dir) + +# Output + +[ + { + "id": 1, + "traegertitel": "Reaktion Bischof Ackermann auf Bericht zu Missbrauch und Bischof Stein", + "tracktitel": "Reaktion Bischof Ackermann auf Bericht zu Missbrauch und Bischof Stein", + "keywords": [ + "Bistumsgeschichte", + "Deutlichkeit", + "Mehrzahl", + ... + ], + "transkript": "Ich sage schon auch, dass es für mich bedrückend ist, dass ...", + "link": "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf&ak=43073187&ko=53232625&amo=17350850&gkonf=53232624&destination=swrhfdb1" + }, + { + "id": 2, + "traegertitel": "Unwetter am Bodensee und im Allgäu. Bericht aus Meckenbeuren", + "tracktitel": "Unwetter am Bodensee und im Allgäu. Bericht aus Meckenbeuren", + "keywords": [ + "Hochwasser", + "Pegel", + "Feuerwehr", + ... 
+ ], + "transkript": "Es war eine dramatische Nacht in Meckenbeuren im Bodenseekreis ...", + "link": "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf&ak=42598879&ko=52736037&amo=17124572&gkonf=52736033&destination=swrhfdb1" + } +] +``` diff --git a/src/hfdb_xml_converter.egg-info/SOURCES.txt b/src/hfdb_xml_converter.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e8a0edfae8efec642d8f9058505be0e6d359f8b --- /dev/null +++ b/src/hfdb_xml_converter.egg-info/SOURCES.txt @@ -0,0 +1,10 @@ +README.md +pyproject.toml +setup.cfg +setup.py +src/hfdb_xml_converter/__init__.py +src/hfdb_xml_converter/parser.py +src/hfdb_xml_converter.egg-info/PKG-INFO +src/hfdb_xml_converter.egg-info/SOURCES.txt +src/hfdb_xml_converter.egg-info/dependency_links.txt +src/hfdb_xml_converter.egg-info/top_level.txt \ No newline at end of file diff --git a/src/hfdb_xml_converter.egg-info/dependency_links.txt b/src/hfdb_xml_converter.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/hfdb_xml_converter.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/hfdb_xml_converter.egg-info/top_level.txt b/src/hfdb_xml_converter.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..df33646883c5cb574acfbd1e19c70bc3e54a27a2 --- /dev/null +++ b/src/hfdb_xml_converter.egg-info/top_level.txt @@ -0,0 +1 @@ +hfdb_xml_converter diff --git a/src/hfdb_xml_converter/__init__.py b/src/hfdb_xml_converter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e86ead82640c23093b7fbaa9b5015816064598 --- /dev/null +++ b/src/hfdb_xml_converter/__init__.py @@ -0,0 +1 @@ +from .parser import parse_xml, xml_to_json \ No newline at end of file diff --git a/src/hfdb_xml_converter/parser.py 
import xml.etree.ElementTree as ET
import json
import os

# Namespace of the elements queried below, bound to the prefix "ns" that all
# ElementTree path expressions in this module use.
_NS = {'ns': 'http://ard.de/sad/hfdb/vollinfo'}

# PLUGPARA attributes that are interpolated into the permalink.
_LINK_ATTRIBUTES = ('instance', 'konf', 'amo', 'ak', 'gk')

_LINK_TEMPLATE = (
    "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2"
    "&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf"
    "&ak={ak}&ko={konf}&amo={amo}&gkonf={gk}&destination={instance}"
)


def _clean_text(element):
    """Return the stripped text of *element*, or None if it is absent or blank."""
    if element is None or element.text is None:
        return None
    return element.text.strip() or None


def _collect_texts(parent, path):
    """Return the non-blank, stripped texts of all elements matching *path*."""
    texts = (_clean_text(element) for element in parent.findall(path, _NS))
    return [text for text in texts if text]


def _add_title(item, key, ak, missing_ak_msg, missing_rhti_msg):
    """Store the RHTI text of *ak* under *key*, or print why it is missing."""
    if ak is None:
        print(missing_ak_msg)
        return
    title = _clean_text(ak.find('ns:RHTI', _NS))
    if title:
        item[key] = title
    else:
        print(missing_rhti_msg)


def _add_konf_fields(item, vi):
    """Add keywords and transcript taken from the first KONF element of *vi*."""
    konf = vi.find('.//ns:KONF', _NS)
    if konf is None:
        print("Kein KONF Element gefunden")
        return
    item['keywords'] = _collect_texts(konf, './/ns:KWE/ns:TEXT')
    item['transkript'] = " ".join(
        _collect_texts(konf, './/ns:TRANSCRIPT/ns:SPEAKER')
    )


def _add_link(item, vi):
    """Add the permalink built from the first PLUGPARA element of *vi*."""
    plugpara = vi.find('.//ns:PLUGPARA', _NS)
    if plugpara is None:
        print("Kein PLUGPARA Element gefunden")
        return
    params = {name: plugpara.get(name) for name in _LINK_ATTRIBUTES}
    missing = [name for name, value in params.items() if not value]
    if missing:
        # A missing attribute would otherwise end up as the literal text
        # "None" inside the URL, producing a link that cannot resolve.
        print(
            "PLUGPARA Element unvollständig, fehlende Attribute: "
            + ", ".join(missing)
        )
        return
    item['link'] = _LINK_TEMPLATE.format(**params)


def parse_xml(xml_file):
    """Parse an XML export and return one dict per VI element.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        A list of dicts in document order. Each dict always has ``id``
        (1-based position of the VI element). The keys ``traegertitel``,
        ``tracktitel``, ``keywords``, ``transkript`` and ``link`` are only
        present when the corresponding XML data exists; for every missing
        piece a message is printed instead.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(xml_file).getroot()

    result = []
    for item_id, vi in enumerate(root.findall('ns:VI', _NS), start=1):
        item = {'id': item_id}

        # Carrier title: first AK anywhere below the VI that is flagged voll='j'.
        _add_title(
            item,
            'traegertitel',
            vi.find(".//ns:AK[@voll='j']", _NS),
            "Kein AK Element mit voll='j' gefunden",
            "Kein RHTI Element in AK[@voll='j'] gefunden oder leerer Text",
        )

        # Track title: first AK that is a direct child of the VI.
        _add_title(
            item,
            'tracktitel',
            vi.find('ns:AK', _NS),
            "Kein AK Element gefunden",
            "Kein RHTI Element in AK gefunden oder leerer Text",
        )

        _add_konf_fields(item, vi)
        _add_link(item, vi)

        result.append(item)

    return result


def xml_to_json(input_file, output_dir):
    """Convert an XML export to a JSON file and return the JSON file's path.

    The output file is named after the input file, with the extension
    replaced by ``.json``, and is written to *output_dir*, which is created
    if it does not exist yet.

    Args:
        input_file: Path of the XML file to convert.
        output_dir: Directory that receives the JSON file.

    Returns:
        The path of the written JSON file.
    """
    # Parse first so that a broken input does not leave an empty directory.
    data = parse_xml(input_file)

    # An empty output_dir means "current directory"; makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, base_name + '.json')

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Die JSON-Datei wurde erstellt: {output_file}")
    return output_file