Hochladen des gebauten Python-Pakets

b3cc6b0d · Jan Seipel · c768313c · b3cc6b0d · b3cc6b0d · b3cc6b0d
Commit b3cc6b0d authored 4 months ago by Jan Seipel
--- a/dist/hfdb_xml_converter-0.1-py3-none-any.whl
+++ b/dist/hfdb_xml_converter-0.1-py3-none-any.whl
--- a/dist/hfdb_xml_converter-0.1.tar.gz
+++ b/dist/hfdb_xml_converter-0.1.tar.gz
--- a/pyproject.toml
+++ b/pyproject.toml
+[build-system]
+requires = ["setuptools>=42", "wheel"]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
--- a/setup.cfg
+++ b/setup.cfg
+[metadata]
+name = hfdb_xml_converter
+version = 0.1
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
from setuptools import setup, find_packages

# Read the long description explicitly as UTF-8 (the README contains
# non-ASCII characters) and close the file handle deterministically instead
# of relying on the platform default encoding and garbage collection.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="hfdb_xml_converter",
    version="0.1",
    # Package sources live below ./src ("src layout").
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    install_requires=[],
    author="Jan Seipel",
    author_email="jan.seipel@swr.de",
    description="Ein Paket zum Parsen der XML-Datei einer HFDB Merkliste und Konvertieren in JSON",
    long_description=long_description,
    long_description_content_type="text/markdown",
)
\ No newline at end of file
--- a/src/hfdb_xml_converter.egg-info/PKG-INFO
+++ b/src/hfdb_xml_converter.egg-info/PKG-INFO
+Metadata-Version: 2.1
+Name: hfdb_xml_converter
+Version: 0.1
+Summary: Ein Paket zum Parsen der XML-Datei einer HFDB Merkliste und Konvertieren in JSON
+Author: Jan Seipel
+Author-email: jan.seipel@swr.de
+Description-Content-Type: text/markdown
+# hfdb_xml_converter
+XML-Datei einer Merkliste auf Korpusebene auslesen und in ein JSON-Format übersetzen.
+## Installation
+## Verwendung
+1. XML-Datei einer HFDB Merkliste aus dem HFDB-Rich-Client herunterladen (Datei -> Export -> XML)
+2. Datei mit Python-Script laden und als JSON ausgeben lassen: 
+```python
+from hfdb_xml_converter import xml_to_json
+input_file = './data/konf_data.xml'
+output_dir = './output'
+output_file = xml_to_json(input_file, output_dir)
+# Output
+[
+  {
+    "id": 1,
+    "traegertitel": "Reaktion Bischof Ackermann auf Bericht zu Missbrauch und Bischof Stein",
+    "tracktitel": "Reaktion Bischof Ackermann auf Bericht zu Missbrauch und Bischof Stein",
+    "keywords": [
+      "Bistumsgeschichte",
+      "Deutlichkeit",
+      "Mehrzahl",
+      ...
+    ],
+    "transkript": "Ich sage schon auch, dass es für mich bedrückend ist, dass ...",
+    "link": "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf&ak=43073187&ko=53232625&amo=17350850&gkonf=53232624&destination=swrhfdb1"
+  },
+  {
+    "id": 2,
+    "traegertitel": "Unwetter am Bodensee und im Allgäu. Bericht aus Meckenbeuren",
+    "tracktitel": "Unwetter am Bodensee und im Allgäu. Bericht aus Meckenbeuren",
+    "keywords": [
+      "Hochwasser",
+      "Pegel",
+      "Feuerwehr",
+      ...
+    ],
+    "transkript": "Es war eine dramatische Nacht in Meckenbeuren im Bodenseekreis ...",
+    "link": "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf&ak=42598879&ko=52736037&amo=17124572&gkonf=52736033&destination=swrhfdb1"
+  }
+]
+```
--- a/src/hfdb_xml_converter.egg-info/SOURCES.txt
+++ b/src/hfdb_xml_converter.egg-info/SOURCES.txt
+README.md
+pyproject.toml
+setup.cfg
+setup.py
+src/hfdb_xml_converter/__init__.py
+src/hfdb_xml_converter/parser.py
+src/hfdb_xml_converter.egg-info/PKG-INFO
+src/hfdb_xml_converter.egg-info/SOURCES.txt
+src/hfdb_xml_converter.egg-info/dependency_links.txt
+src/hfdb_xml_converter.egg-info/top_level.txt
\ No newline at end of file
--- a/src/hfdb_xml_converter.egg-info/dependency_links.txt
+++ b/src/hfdb_xml_converter.egg-info/dependency_links.txt
--- a/src/hfdb_xml_converter.egg-info/top_level.txt
+++ b/src/hfdb_xml_converter.egg-info/top_level.txt
+hfdb_xml_converter
--- a/src/hfdb_xml_converter/__init__.py
+++ b/src/hfdb_xml_converter/__init__.py
+from .parser import parse_xml, xml_to_json
\ No newline at end of file
--- a/src/hfdb_xml_converter/parser.py
+++ b/src/hfdb_xml_converter/parser.py
+import xml.etree.ElementTree as ET
+import json
+import os
def parse_xml(xml_file):
    """Parse an HFDB XML export and return one dict per ``VI`` element.

    Only ``VI`` elements that are direct children of the document root and
    live in the namespace ``http://ard.de/sad/hfdb/vollinfo`` are processed.

    Each returned dict always contains ``id`` (running number starting at 1)
    and, when the corresponding XML data is present:

    * ``traegertitel`` - stripped text of the ``RHTI`` child of the first
      descendant ``AK`` element with ``voll='j'``.
    * ``tracktitel`` - stripped text of the ``RHTI`` child of the first
      ``AK`` element that is a direct child of the ``VI``.
    * ``keywords`` - stripped texts of all ``KWE/TEXT`` elements below the
      first descendant ``KONF`` element.
    * ``transkript`` - stripped texts of all ``TRANSCRIPT/SPEAKER`` elements
      below that ``KONF`` element, joined with single spaces.
    * ``link`` - URL assembled from the attributes of the first descendant
      ``PLUGPARA`` element.

    When a piece of data is missing, a (German) diagnostic message is printed
    to stdout and the corresponding key is left out of the dict.

    Note: attributes missing on ``PLUGPARA`` are not validated; they show up
    as the text ``None`` inside the generated link.

    Args:
        xml_file: File name or file object, passed straight to
            ``xml.etree.ElementTree.parse``.

    Returns:
        A list of dicts in document order.

    Raises:
        Whatever ``xml.etree.ElementTree.parse`` raises for unreadable or
        malformed input; nothing is caught here.
    """
    # Namespace prefix mapping used by every find()/findall() call below.
    ns = {'ns': 'http://ard.de/sad/hfdb/vollinfo'}
    root = ET.parse(xml_file).getroot()

    result = []
    for item_id, vi in enumerate(root.findall('ns:VI', ns), start=1):
        item = {'id': item_id}

        # Carrier title ("Traegertitel"): taken from the fully described AK.
        full_ak = vi.find(".//ns:AK[@voll='j']", ns)
        if full_ak is not None:
            rhti = full_ak.find('ns:RHTI', ns)
            if rhti is not None and rhti.text:
                item['traegertitel'] = rhti.text.strip()
            else:
                print("Kein RHTI Element in AK[@voll='j'] gefunden oder leerer Text")
        else:
            print("Kein AK Element mit voll='j' gefunden")

        # Track title ("Tracktitel"): taken from the first direct AK child.
        first_ak = vi.find('ns:AK', ns)
        if first_ak is not None:
            rhti = first_ak.find('ns:RHTI', ns)
            if rhti is not None and rhti.text:
                item['tracktitel'] = rhti.text.strip()
            else:
                print("Kein RHTI Element in AK gefunden oder leerer Text")
        else:
            print("Kein AK Element gefunden")

        # Only the first KONF element is evaluated.
        konf_element = vi.find('.//ns:KONF', ns)
        if konf_element is not None:
            item['keywords'] = [
                kwe.text.strip()
                for kwe in konf_element.findall('.//ns:KWE/ns:TEXT', ns)
                if kwe.text
            ]
            # Join instead of repeated "+=": same result, linear cost.
            item['transkript'] = " ".join(
                speaker.text.strip()
                for speaker in konf_element.findall('.//ns:TRANSCRIPT/ns:SPEAKER', ns)
                if speaker.text
            ).strip()
        else:
            print("Kein KONF Element gefunden")

        # Build the link from the PLUGPARA attributes. Distinct names keep
        # the attribute values apart from the XML elements handled above.
        plugpara = vi.find('.//ns:PLUGPARA', ns)
        if plugpara is not None:
            instance = plugpara.get('instance')
            konf_id = plugpara.get('konf')
            amo_id = plugpara.get('amo')
            ak_id = plugpara.get('ak')
            gk_id = plugpara.get('gk')
            item['link'] = (
                "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve"
                "?context=hfdb2&hfdb-recherche/permalink"
                "?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf"
                f"&ak={ak_id}&ko={konf_id}&amo={amo_id}"
                f"&gkonf={gk_id}&destination={instance}"
            )
        else:
            print("Kein PLUGPARA Element gefunden")

        result.append(item)
    return result
def xml_to_json(input_file, output_dir):
    """Convert an HFDB XML export into a JSON file and return its path.

    The XML file is parsed with ``parse_xml`` and the resulting list is
    written as UTF-8 encoded, indented JSON (non-ASCII characters are kept
    as-is). The output file is named after the input file, with the
    extension replaced by ``.json``, and placed in ``output_dir``.

    Args:
        input_file: Path of the XML file to convert.
        output_dir: Directory for the JSON file. It is created if it does
            not exist yet; an empty string means the current directory.

    Returns:
        The path of the written JSON file.
    """
    data = parse_xml(input_file)

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, base_name + '.json')

    # Create the target directory on demand so a fresh output location does
    # not fail with FileNotFoundError. os.makedirs('') would raise, so an
    # empty output_dir (current directory) is skipped.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Die JSON-Datei wurde erstellt: {output_file}")
    return output_file