Skip to content
Snippets Groups Projects
Commit a7707897 authored by Jan Seipel's avatar Jan Seipel
Browse files

Package um Mitwirkende und Urheber + weitere Features erweitert

parent c0f08cd0
No related branches found
No related tags found
No related merge requests found
File added
File added
[metadata]
name = hfdb_xml_converter
version = 0.1
\ No newline at end of file
version = 0.2
\ No newline at end of file
......@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(
name="hfdb_xml_converter",
version="0.1",
version="0.2",
package_dir={"": "src"},
packages=find_packages(where="src"),
install_requires=[],
......
Metadata-Version: 2.1
Name: hfdb_xml_converter
Version: 0.1
Version: 0.2
Summary: Ein Paket zum Parsen der XML-Datei einer HFDB Merkliste und Konvertieren in JSON
Author: Jan Seipel
Author-email: jan.seipel@swr.de
......@@ -12,6 +12,8 @@ XML-Datei einer Merkliste auf Korpusebene auslesen und in ein JSON-Format übers
## Installation
pip install git+https://gitlab.ard.de/ida/hfdb_xml_converter.git
## Verwendung
1. XML-Datei einer HFDB Merkliste aus dem HFDB-Rich-Client herunterladen (Datei -> Export -> XML)
......
......@@ -7,4 +7,5 @@ src/hfdb_xml_converter/parser.py
src/hfdb_xml_converter.egg-info/PKG-INFO
src/hfdb_xml_converter.egg-info/SOURCES.txt
src/hfdb_xml_converter.egg-info/dependency_links.txt
src/hfdb_xml_converter.egg-info/top_level.txt
\ No newline at end of file
src/hfdb_xml_converter.egg-info/top_level.txt
test/test.py
\ No newline at end of file
No preview for this file type
......@@ -2,7 +2,159 @@ import xml.etree.ElementTree as ET
import json
import os
def parse_xml(xml_file):
def id(obj, ns):  # noqa: A001 - name shadows the builtin but is kept for existing callers
    """Build the record identifier ``"<ANR>#<TRK>"`` for one element.

    Args:
        obj: ElementTree element that is searched recursively for the
            ``ANR`` and ``TRK`` elements.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix used in
            the search paths.

    Returns:
        The stripped ANR text, a ``#`` and the stripped TRK text left-padded
        with zeros to three characters. If no usable TRK text exists, only
        the ANR text is returned. ``None`` (after printing a notice) when no
        usable ANR text exists.
    """
    anr = obj.find(".//ns:ANR", ns)
    # ``text`` is None for an empty element, so normalise before stripping.
    anr_text = (anr.text or "").strip() if anr is not None else ""
    if not anr_text:
        print("Keine Archivnummer gefunden")
        return None

    trk = obj.find(".//ns:TRK", ns)
    trk_text = (trk.text or "").strip() if trk is not None else ""
    if not trk_text:
        # Previously a missing TRK raised AttributeError; fall back to the
        # archive number alone instead of aborting the whole conversion.
        return anr_text
    return f"{anr_text}#{trk_text.zfill(3)}"
def traegertitel(obj, ns):
    """Return the RHTI text of the first AK descendant marked ``voll='j'``.

    Args:
        obj: ElementTree element that is searched recursively for an ``AK``
            element whose ``voll`` attribute equals ``'j'``.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        The stripped text of the direct ``RHTI`` child of that AK element,
        or ``None`` (after printing a notice) when the AK element, the RHTI
        child or any non-blank text is missing.
    """
    ak_full = obj.find(".//ns:AK[@voll='j']", ns)
    if ak_full is None:
        print("Kein AK Element mit voll='j' gefunden")
        return None

    rhti = ak_full.find('ns:RHTI', ns)
    # Strip first: whitespace-only text must count as "empty text" too.
    title = (rhti.text or "").strip() if rhti is not None else ""
    if not title:
        print("Kein RHTI Element in AK[@voll='j'] gefunden oder leerer Text")
        return None
    return title
def tracktitel(obj, ns):
    """Return the RHTI text of the first AK element directly below ``obj``.

    Args:
        obj: ElementTree element whose direct ``AK`` children are searched.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        The stripped text of the direct ``RHTI`` child of the first AK
        child, or ``None`` (after printing a notice) when the AK element,
        the RHTI child or any non-blank text is missing.
    """
    ak = obj.find('ns:AK', ns)
    if ak is None:
        print("Kein AK Element gefunden")
        return None

    rhti = ak.find('ns:RHTI', ns)
    # Strip first: whitespace-only text must count as "empty text" too.
    title = (rhti.text or "").strip() if rhti is not None else ""
    if not title:
        print("Kein RHTI Element in AK gefunden oder leerer Text")
        return None
    return title
def konfkorpus(obj, ns):
    """Return the first KONF descendant of ``obj``.

    Per the original note, the first KONF element represents the corpus.

    Args:
        obj: ElementTree element that is searched recursively.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        The first matching element, or ``None`` (after printing a notice)
        when there is no KONF element.
    """
    first_konf = obj.find('.//ns:KONF', ns)
    if first_konf is None:
        print("kein Konfektionierungselement gefunden")
    return first_konf
def keywords(obj, ns, flat=False, sep="#"):
    """Collect the keyword texts stored below the first KONF element.

    According to the original note these are the results of the MDH
    keyword extraction.

    Args:
        obj: ElementTree element handed to ``konfkorpus`` to locate KONF.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.
        flat: When true, return one string instead of a list.
        sep: Separator used in the flat form; it is surrounded by single
            spaces when joining.

    Returns:
        A list of stripped ``KWE/TEXT`` values, or those values joined with
        ``" <sep> "`` when ``flat`` is true. ``None`` when no KONF element
        exists.
    """
    konf = konfkorpus(obj, ns)
    if konf is None:
        # Made explicit: the function previously fell off the end here.
        return None

    stripped = (
        (text_elem.text or "").strip()
        for text_elem in konf.findall('.//ns:KWE/ns:TEXT', ns)
    )
    # Drop blank entries so whitespace-only TEXT nodes do not yield "".
    found = [text for text in stripped if text]

    if flat:
        return f" {sep} ".join(found)
    return found
def transcript(obj, ns, add_speakers=False):
    """Concatenate the SPEAKER texts stored below the first KONF element.

    According to the original note these are the results of the MDH mining
    transcript.

    Args:
        obj: ElementTree element handed to ``konfkorpus`` to locate KONF.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.
        add_speakers: When true, prefix every segment with
            ``"SPEAKER <id>: "`` using the element's ``id`` attribute.

    Returns:
        All non-blank ``TRANSCRIPT/SPEAKER`` texts, stripped and joined by
        single spaces. ``None`` when no KONF element exists.
    """
    konf = konfkorpus(obj, ns)
    if konf is None:
        # Made explicit: the function previously fell off the end here.
        return None

    segments = []
    for speaker in konf.findall('.//ns:TRANSCRIPT/ns:SPEAKER', ns):
        text = (speaker.text or "").strip()
        if not text:
            continue
        # ``get`` instead of ``attrib[...]``: a SPEAKER element without an
        # id used to raise KeyError; such segments are emitted unlabelled.
        speaker_id = speaker.get('id')
        if add_speakers and speaker_id is not None:
            segments.append(f"SPEAKER {speaker_id}: {text}")
        else:
            segments.append(text)
    return " ".join(segments)
def personen(elem, ns):
    """Extract the person fields from one contributor or author element.

    Args:
        elem: ElementTree element that is searched recursively for the
            ``NAM``, ``VNAM``, ``ANM`` and ``FN`` elements.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        A dict with the keys ``name``, ``vorname``, ``anmerkung`` and
        ``funktion``. Each value is the stripped text of the first matching
        element, or ``None`` when the element is missing or has no
        non-blank text.
    """
    paths = {
        'name': './/ns:NAM',
        'vorname': './/ns:VNAM',
        'anmerkung': './/ns:ANM',
        'funktion': './/ns:FN',
    }
    person = {}
    for key, path in paths.items():
        node = elem.find(path, ns)
        # Strip like the other extractors in this module do; blank text is
        # reported as None, the same as a missing element.
        text = (node.text or "").strip() if node is not None else ""
        person[key] = text or None
    return person
def mitwirkende(obj, ns):
    """Extract the person entries of the contributors (BSG) field.

    Only the first AK descendant of ``obj`` is inspected; the original note
    describes that element as the corpus.

    Args:
        obj: ElementTree element that is searched recursively for AK.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        A list with one ``personen`` dict per ``BSG`` element found below
        the first AK element. The list is empty (and a notice is printed)
        when there is no AK element or no BSG element.
    """
    ak_korpus = obj.find('.//ns:AK', ns)
    # ``find`` returns None when nothing matches; guard before ``findall``.
    entries = ak_korpus.findall('.//ns:BSG', ns) if ak_korpus is not None else []
    if not entries:
        # ``findall`` returns an empty list, never None, so the former
        # ``is not None`` test could not reach this notice.
        print("Keine Mitwirkenden gefunden")
        return []
    return [personen(entry, ns) for entry in entries]
def urheber(obj, ns):
    """Extract the person entries of the authors (URH) field.

    Only the first AK descendant of ``obj`` is inspected; the original note
    describes that element as the corpus.

    Args:
        obj: ElementTree element that is searched recursively for AK.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        A list with one ``personen`` dict per ``URH`` element found below
        the first AK element. The list is empty (and a notice is printed)
        when there is no AK element or no URH element.
    """
    ak_korpus = obj.find('.//ns:AK', ns)
    # ``find`` returns None when nothing matches; guard before ``findall``.
    entries = ak_korpus.findall('.//ns:URH', ns) if ak_korpus is not None else []
    if not entries:
        # ``findall`` returns an empty list, never None, so the former
        # ``is not None`` test could not reach this notice. The text was a
        # copy of the contributors message and now names the right field.
        print("Keine Urheber gefunden")
        return []
    return [personen(entry, ns) for entry in entries]
def permalink(obj, ns):
    """Build the link-resolver URL from the first PLUGPARA element.

    Args:
        obj: ElementTree element that is searched recursively for PLUGPARA.
        ns: Prefix-to-URI mapping that defines the ``ns`` prefix.

    Returns:
        The URL assembled from the ``instance``, ``konf``, ``amo``, ``ak``
        and ``gk`` attributes, or ``None`` (after printing a notice) when
        the PLUGPARA element or its ``instance`` attribute is missing.
    """
    plugpara = obj.find('.//ns:PLUGPARA', ns)
    if plugpara is None:
        print("Fehler beim Auffinden des Plugpara-Elemente. Kein Permalink erstellt")
        return None

    instance = plugpara.get('instance')
    if not instance:
        # ``instance[-1]`` below would raise on None or "" otherwise.
        print("PLUGPARA-Element ohne instance-Attribut. Kein Permalink erstellt")
        return None

    # The last character of the instance name selects the resolver host
    # and the context parameter.
    instance_num = instance[-1]
    konf = plugpara.get('konf')
    amo = plugpara.get('amo')
    ak = plugpara.get('ak')
    gk = plugpara.get('gk')
    # NOTE(review): a missing konf/amo/ak/gk attribute is still formatted as
    # the text "None", exactly as before - confirm whether such links
    # should be suppressed instead.
    return (
        f"https://linkresolver{instance_num}.ivz.cn.ard.de/linkresolver/resolve"
        f"?context=hfdb{instance_num}"
        f"&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf"
        f"&ak={ak}&ko={konf}&amo={amo}&gkonf={gk}&destination={instance}"
    )
def parse_xml(xml_file, **kwargs):
config = {
'flat': False,
'sep': "#",
'add_speakers': False
}
config.update(kwargs)
tree = ET.parse(xml_file)
root = tree.getroot()
......@@ -10,74 +162,42 @@ def parse_xml(xml_file):
ns = {'ns': 'http://ard.de/sad/hfdb/vollinfo'}
result = []
id_counter = 1
for index, vi in enumerate(root.findall('ns:VI', ns)):
item = {'id': id_counter}
id_counter += 1
# Laufende Nummerierung
item = {'lfd': index + 1}
# Archivnummer
item['id'] = id(vi, ns)
# Traegertitel
ak_full = vi.find(".//ns:AK[@voll='j']", ns)
if ak_full is not None:
rhti = ak_full.find('ns:RHTI', ns)
if rhti is not None and rhti.text:
item['traegertitel'] = rhti.text.strip()
else:
print("Kein RHTI Element in AK[@voll='j'] gefunden oder leerer Text")
else:
print("Kein AK Element mit voll='j' gefunden")
item['traegertitel'] = traegertitel(vi, ns)
# Tracktitel
ak = vi.find('ns:AK', ns)
if ak is not None:
rhti = ak.find('ns:RHTI', ns)
if rhti is not None and rhti.text:
item['tracktitel'] = rhti.text.strip()
else:
print("Kein RHTI Element in AK gefunden oder leerer Text")
else:
print("Kein AK Element gefunden")
item['tracktitel'] = tracktitel(vi, ns)
# Nur das erste KONF-Element verarbeiten
konf = vi.find('.//ns:KONF', ns)
if konf is not None:
# Keywords
keywords = []
for kwe in konf.findall('.//ns:KWE/ns:TEXT', ns):
if kwe.text:
keywords.append(kwe.text.strip())
item['keywords'] = keywords
# Transkript
transcript = ""
for speaker in konf.findall('.//ns:TRANSCRIPT/ns:SPEAKER', ns):
if speaker.text:
transcript += speaker.text.strip() + " "
item['transkript'] = transcript.strip()
else:
print("Kein KONF Element gefunden")
# Link aus PLUGPARA erstellen
plugpara = vi.find('.//ns:PLUGPARA', ns)
if plugpara is not None:
instance = plugpara.get('instance')
instance_num = instance[-1]
konf = plugpara.get('konf')
amo = plugpara.get('amo')
ak = plugpara.get('ak')
gk = plugpara.get('gk')
link = f"https://linkresolver{instance_num}.ivz.cn.ard.de/linkresolver/resolve?context=hfdb{instance_num}&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf&ak={ak}&ko={konf}&amo={amo}&gkonf={gk}&destination={instance}"
item['link'] = link
else:
print("Kein PLUGPARA Element gefunden")
# Keywords
item['keywords'] = keywords(vi, ns, flat=config['flat'], sep=config['sep'])
# Transkript
item['transcript'] = transcript(vi, ns, add_speakers=config['add_speakers'])
# Mitwirkende
item['mitwirkende'] = mitwirkende(vi, ns)
# Urheber
item['urheber'] = urheber(vi, ns)
# Permalink
item['permalink'] = permalink(vi, ns)
result.append(item)
return result
def xml_to_json(input_file, output_dir):
data = parse_xml(input_file)
def xml_to_json(input_file, output_dir, **kwargs):
data = parse_xml(input_file, **kwargs)
output_file = os.path.join(output_dir, os.path.splitext(os.path.basename(input_file))[0] + '.json')
with open(output_file, 'w', encoding='utf-8') as f:
......
This diff is collapsed.
......@@ -7,4 +7,4 @@ from src.hfdb_xml_converter.parser import xml_to_json
filepath = "/root/hfdb_xml_converter/hfdb_xml_converter/test/bericht.xml"
output_dir = "/root/hfdb_xml_converter/hfdb_xml_converter/test/"
xml_to_json(filepath,output_dir)
\ No newline at end of file
xml_to_json(filepath, output_dir)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment