From b3cc6b0d5e6f4f4f235286e0de9585da1b39c98c Mon Sep 17 00:00:00 2001 From: Jan Seipel <jan.seipel@swr.de> Date: Fri, 27 Dec 2024 22:47:13 +0100 Subject: [PATCH] Hochladen des gebauten python pakets --- dist/hfdb_xml_converter-0.1-py3-none-any.whl | Bin 0 -> 3331 bytes dist/hfdb_xml_converter-0.1.tar.gz | Bin 0 -> 2977 bytes pyproject.toml | 3 + setup.cfg | 3 + setup.py | 14 +++ src/hfdb_xml_converter.egg-info/PKG-INFO | 57 ++++++++++++ src/hfdb_xml_converter.egg-info/SOURCES.txt | 10 ++ .../dependency_links.txt | 1 + src/hfdb_xml_converter.egg-info/top_level.txt | 1 + src/hfdb_xml_converter/__init__.py | 1 + src/hfdb_xml_converter/parser.py | 86 ++++++++++++++++++ 11 files changed, 176 insertions(+) create mode 100644 dist/hfdb_xml_converter-0.1-py3-none-any.whl create mode 100644 dist/hfdb_xml_converter-0.1.tar.gz create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 src/hfdb_xml_converter.egg-info/PKG-INFO create mode 100644 src/hfdb_xml_converter.egg-info/SOURCES.txt create mode 100644 src/hfdb_xml_converter.egg-info/dependency_links.txt create mode 100644 src/hfdb_xml_converter.egg-info/top_level.txt create mode 100644 src/hfdb_xml_converter/__init__.py create mode 100644 src/hfdb_xml_converter/parser.py diff --git a/dist/hfdb_xml_converter-0.1-py3-none-any.whl b/dist/hfdb_xml_converter-0.1-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..1d80db8d34da817d439a54f203a6e55900719273 GIT binary patch literal 3331 zcma);c{J4f8^;F`W2wdvHL@?m3_{9M*0F@_`<6n6#$bkw7&K%nlw>I)OO*Ypu}m~s zvc#ZC+K{De6EZbf`%U+rd+WOW&guI(&-t9sAK%yWob&m-pXY65#<l|t001}vwv{)W z#V2afvdoh_GY&Ar&)55uXE^Gt=V^3cm`@1SCq&896B&radU`4b;mj0zDE0#iCNTLE zaGS2D@Gd2JYrx8cLI?&6jxmfh#YZ{>;|~C={z>7)OrdkR%3o#&05HG}06>17BFHNQ z;}gP6aNhxio;b35tG~LY%+LlN*NoQ`N1UJO8Ku>px_ABjmAk-3Z`MGNR-MQz_x9*k z;f%u`E}z@rpz%v*@^QG%{zRDLtGX;0?<4=1lUMaQhWdm18oV-a#tZiOm##i{$vGi= zA$!_wzf(bN+86?6(*Ow-!gD6a`-vRM^nFe9FM)m3j8?#=q;27!Y`WD5Qyxu4auEU& 
z<G~IE=vP-cXb<w$MiX%X0V{(+U7=t@Wi{=6Rg*Kp4c16`?+TJTTR9mFT_r^aQ6yYo z_Tq5AYnyL^>zt4>-aC|{9_(p2C6>aYMjW?|W8nZTt9>3^pIM}Mh!PrM&qC7$RrcOY zRBq@bR}K2z#EVsDl04jM8sFEjqV~WlJG7nK5xEjyjT+=O0+k`Bz>~9^)z?rnM{&E{ z_gYQDs%UXuAns$fpxhvL`(Xz$|8inN=|bSQ?5tIhGRfjU{N5c@l$7Oxkwy&YcjXxQ zD4*%01y5c{C_ALQ(V?@I$J&;;aU(ZBg42b8>6v-Q%|*Z$D#aAjjl1pQN=~q-ZS@Vh z5|=S^MVEE*hi7tXDlSxAan((jf@hqbbE&|ksx0zYE*8ur)Y)F#Bs8o>bK!0foC}@f z7=nJMV=Y?ykMYX4@#gz%a%PuR@Pec#)eY7$2;Vzu8gCtwnoin&{ZfJT=4&^Fo~_EC zGk9U~iN{VtKd>rG<zlo9{qU^~t+9lVi!Ipzk9QAf+T%FBY>yK6gXgKbRr}@lzu@j} zyp}rDS(PQDr+vQ0%akN_a01&bTYdu)&boG={((68@-3Qp3;mvsJ*O&cDN96yZ^xZ4 z5~_?@`huEwoIWY)lod*$whu#_?y8x{8c)#_OItKwJgMB;l<<BW?lsbv)rxNw9!q89 z-7Kvv?R$1AzS<uDsv&a_xiYL?tzMJf8PZt%BCNz*>CD(Yby8axG_Ai-kt>ZJUV4IR z6;_dXQ(oFw^PX+q^V)v0!SN0r=Qky$<nc0HRtURY@;S1-nEDu@gr{e<@Zn+DUeT=M zHVJWRPHvxP&J}nya$Fws;?tb;{xXQ~ZS!9c>Rl9b7`S)qekG$!`)GcgYyLt+5|Ai0 z^0@{g*F&u$O6WDXzYE*Z3G7DFxrhQgn=v#~@rKUg4lUQ(z-oruz4h7hCPWu9z~$LV zZn#<ArjyCDe0%O(p(yc{$4xJ*DRy47;-Dt&_B}*uN1xD3zr~~11f_J3`UKsqpC32n znmv4SGoo#(T&!i|_)-_I2}I`gqb!Eb(Q#`AYceIk=M!up7y5J%-AGei6i(ysDoKuW zi}&!FBrA<d^F9f-^?VGxEzuj*Q%*UhaxnfP`(ZQj$Ci-UndViSS4(%NLn+7KJQRGi zMOxR~?>5;E9l;bzmhrWU)S?th-A!$r*M?7g(D~6{I+^{&=p#)D%LV`x?*ss#zw9q? 
zWyOPv-bf4<jtulgD_J1y^bGav^q$-G1->@g-LeAZ02WBan3`S_&|sxfQ&CdLX(4M@ zFefVxk=o9i)!hDZsLqa#S`%tn?a(xsljVifCwv)TOM090^5x=<O<$QAggl8eT5rC1 zP%^Q|BS;zu(R*O>;g4qhK2Q;QFruD(?5>&hv%7L}?rG!t{tbc}5hHf&l@$iOP?xWv zpH7D>Y9H4vM}B+%X@f_`4<nstT!sbes5MHvXd2)~Eyg_A?_^RgiIBAly^`w;?;nbH z;w(2>a-QjWp1Sz9#|X;tVFsjNW;yfth0%HHu??g<h&SzAi-GP6wF0|a8+XZZ>Ch0H zi8Y+F7_N|jp_0x4bHs{dA2qOxc<giK+_j5=kq52u&Lsw(XGPy%(T9=;{MiJ8<25Vk zDx496V**JZi(CZnYix*Rz#=Q9Y6(0jgmxteT8JAi=4>UTG!HpZ6nZwa6D9rJ;^{#T zOXcT+T(W?<8BY!595cpWUw@hrnQD?M#u*qG*t6zTQ&SpV93hsnW~3gWy1tNohw3wZ zhP!-s=aSl7O_yflV2W_HL~_)=>>iAjJc|F>0^<wq-B9k1yc<zhwqA;_?}5~5e|Qn) zi6)Gn1}yE50>E<~7!7FotB(TR>L#@1Rfxw5x>k|n=1o$n+(#@zinFLq_%+pfJB)+6 zc`*i~a98SHz=M=qHo>9u=|{W26}{1@a_lTkRf(>Edl%_?3ebJQl*XX_=2e&ql7wsO zkZ1+}(*+}Me{|CAGLI-t<pISrQOX43j;<tW)=4j_Y1j@Zl5K!I@ow~Lg`9V4c~paJ z$Q%BCw1M1bzUHf%A0l%YCHq4)NyM2;5?mcuNN)|3QsXSjGrvvDpWRyx`Z%kl<Gb2{ z+DZ`QpJQ3ml-`?}_VCkuNMr!Ga_@UPw3TA*!!g%ZMlI;Igs3Q^xedSQJ<{J7K9IJ) zoDx1D7%$AYxT(kR9zZE+U_WRD-kBj$Lsip|bmer9lp|YsdgkZ^dWLr9p+3c35_m+F z8pC(PRF3<Iz5nWxu>dWT+^3^gH4jn9O;77x@HAb`ZW_&&a$Ody*~#M0e}fR=lI65~ zq<29FOpPaG)_g7_P=UGGu+TUK^0?o1e9~pE1<or{Q{N11mdB^NyO(_6<>4dIgS1`5 zt8*Ps2wtQNyUF=zR~xeIffn+ue@pik%hez^res~2A@e)Zbu>XB%-P8Yfo;k>>;vw1 z21a;PxC-H4Sxa1@;yLbL>sch(oezYpw-7gD<P}J?(wsYLLu^1Lulz>wwLQM~m#6U- zL^VFTIyX)o7i#=gC#Oxz4?6briAGAUesaEH^uOz4%M*bJFze!HhT`wkheZc@p7jay zIje{b$M%|-nI>0v^(9&MQxweL@}}?!)}Or5_ujP=1E@UA0swqv#&;X`U;DMKciA8e zENu+qDcyk=bOAB%nklSZGQEX62Wn&3dvBLzarIgmoh(q6pwGTeG0jk4W^d7ncAkRX z%-!b;7q+&vZjIZ`Z!SL7lq1|@=mu4iz15$~HT*nFmT$_Q7F?93={7c`Z{Mfz2{IJ( z%3>exq<66O7NWUW)aqMGdmDshn06@boL`IZ8Z8O+$4EL_Y6C1|c2I{_Uw1{7#b^St zDtc{NBer*0*V#NdZ1zIeh|rZ~Ig{&~vGu2Yz;+IgSDez^G0z`9GJ37rJ<<Ytb1A)E zYZPZR;MlXGf0kwG%lMemPUY!%d5AK^1@YPPEo2nd)Crn5VFW8I67p=wR$9tUOOQLL zodIgMbBp++3?FMJ>xM2LG9FpAf7w+lkd(Bycj%bb9ADzc$S8GZMy{{UE{Y_J(;|Vl z4c5Y9WyZn^X8Zr<mw5qye#NohxBuS7&sO+n)~{UdUnJ&Kiaq^n)<2&2XVS0c*$+}1 z(?$Qcs%<ai@9DN3yIpHPuz1ej!2YSc?ab}^_`y8D^&8B8sbxERyEOi0*T+Wx4*M^m VSedbNeE*Syxjr#9whsJ$`X66atRMgY literal 0 HcmV?d00001 
diff --git a/dist/hfdb_xml_converter-0.1.tar.gz b/dist/hfdb_xml_converter-0.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e70851cd19491f7baff4a36296c1f61a6761d61d GIT binary patch literal 2977 zcmV;S3tsdeiwFoKAa7>^|7d1pVqbV|Y+qw<ZgypIbY*faFfK7JbYXG;?H$`r<H(X{ ze?`k9k)S1x<J=ge>@G7fJF^T70(Q?z2xU#&Nj&jw+ie1a_?z<?`!+vzs@hKMkZ@Z; z&dgFJ0d{wF^{p;l-Cf408r7c<?2Gr@cDPu17BEw|?XPLp8~HP)*G#Kst+0!gB|s8O zTR`<IFB7Z{=1bS-om#uztk;{b+cl%ru9+M4wzfj>uZLmZIqP2dUbhzp=UhlGDyC7Z zKDqq2o6XzhAD(NCcD>%ROtS&HpjL0Unk%fiO!*gKDDT3%FaJ9QgY-Y?f71V?|7Z38 z!R{}W5Bu*vJ(>Ks8jaiJzm9pc`oC$mYb(rLru@H9{lCYh?by<;{LV$}hQTJYjGDG@ z`+Sql^M*FX0w&Rp65qGQ<tE#419o7KxMW`vAD%?a0|w@V{jvA4vTaN5GVTWO{{G$e z8@9*A$a7=KSrRyGH>K@z0cC*BuScKuwXH-BL$S$zwF7p<-H3ZxQic1r>us{nc3{LR z<L7uR42NsmJnjiMlF+)!+c1zkkd@=h2)ZHpg{(qDM@~2nH0{-*erY#bnm5u;`p`%b zB{4ta0lvo`#|~oap}{<Bbmd(rd|R@wf1kl1aruP@2GsV7eF%WXJkQ3asRS6rF%O(1 z7-(9}xNnf4J{U`1C07pJ-mvo4bD?)^$VCGE6dqXafR$4cl^@y8MHC8&Pe6H1vkaDI z`L1LKmoSP!<wy+`OG{!uY%f-wz}g1bPsod*lbcM_{`~XLNEOriBJ>xG9&>$FBPPnQ z`#Hvbx<Qo4Zr}Ad>#&khMJB6`!l2*9Cj)RxT9`;ga$J$(r0>)8tZ*d49*^U0fqXfq zzRq$gYdE`~)T^eQXpCLK4`aIP=$ovzp1w$7^8s?4<Q^pJhuj|F)UY>h+#7~{w$&SP zn8_eeQ~!nw7<1%ow&%w2nXr@IP>tCH{|E**&~xg&(KFE=@yl^2oERvds1!&8M49+7 zXMm5ic!msF+dPpTq>Z>M^VB^)6kqJ2H%&1NLp`15JqY3v@(1?d12ibM2b^Jhu+_cc zI&*+G7>`;1>)!&{3Q*3tBfkC(okN+3WrhNzCvErKV1)FBQbzG+wTkb8$Ds!-WEt-H z7o!&#ws1g{XY#7q!%sbw=?mFG_On*;cSZ0X$N>CQBOFvL_LIDfI_IGW&;4*K_`=-* zU!A@SIn6^(T(w7?M%`@JYa8v=QP^qLt-95+nya=Ss`Ze$(KJ^Fh&st@tU5fFZlHv@ z1F8obQq!T2*XuX<&ANr(ehbE2qSV+vdlP~H#yr)8uFtkSZ}9cMi81926#hMq<9fyu z(2$GS>`6#2;;#2$Z#YIpAaVvefZpWXqrc+`s<WYZAVclMY-|gq>X;*J(3r^TobB7# zIMAiVwqXY*9RikczYmtOny)uD+OHK$S?zkuthY0ks#%R@+nTUcnpeM_rL<F2zR$W2 z|0euDt2OWc)ye-`ivMTL`hShrHNzy&@1^^HhdW!_dpm~jEW`h^TD8po0}W}`(f_lW zCi#EM0P=>&8zOIrydm<2$Q$}Dydhe$k^e{j-?RCD?e+%wf8_r~m$1(K%zM(1q3=Df z|7X_f&8h#_Y*--wt#*yh|CRzLXNl`MmG}~zEB{mz{IA4?6#+OJCpx%oNhHJ2i+=<& zuCFtFJmj$2In~laXLh>8?XUu>q4K&um$pvgMfQJ%?*Dw#_Mh(mko}*x|4N@5z5ZYc 
z`@dlSt(*6MTBb$ze<}9g&n^I+YJu(J2IGSL;Pdp>4`kxCAMJz>23_>=Pc>-|y2{`~ zo_YaNNW|1gdyxGn|Np!5|2NDQ`Tu19Q~MuXKAHUAW&hhC>{I{0-9r1{wCMbIDWDE9 zbMJj}c&Sp>S>FwuZUk<=J>apXsnoJMZ@|SuN1x}O>^LG@F?4ibrYtEVgC~=nD;<w2 zV$svDXENi?OpWF7kPD~=5!X^gH$6V?PBtn!C)vS@t?mbR^!&=OUd-sKi^Cm#?&5HU z<>c3gJNi@KXU^*I!T=UXg^I$62W5SF*Fs-21eoFS+G6x>FTK{>m6s6=%FMN9s9}x# zf3p8i<NvqF|9|QBU-YU^D*um*|7<jy4T}GKG5#Ovf71U1-<tlP&#L}&`G1uDx6Fps zBK=SMf29AHz5my4()}OO|F`M?Zr2T5+3h}VXYjuLKeK7&`+qgi1(wxpv}pftY5RXh zgj-zG$OZL=GrwL8Vjx-hHv-vzvi~#ozxA5zKiU6V?0;$}9(yi$kNt19Ei<?O^%~m$ zy4fcCzYNHO6%8&0=f;l5eICeTcxHCYc8;|SMFlcu&ta@-4)5nHg);I91|Y6&s<SI3 z#X3+rU0USXDi_BRa#ei;s1{z!X^#5!isAQ%;>hmt$~N!gk`seP)v+VqOui}wacPs4 zFw|+YTFru*V%t$HTb)E8m9BG=g2#!6b)KB2$A}JawTc&ow3?>U`XQ`<0*7C$vvXI) zvIdFIg)Mm*TWH|Xr0scSsD1PI52bZz%v#o6MN)EKHO_UQad`#TIYwUT4_NCYefsc< z9T&pmrpflG+fO_Xh@ZQv6=mJ1R-qMJyC*+mZ#$*WrBhusD6gzPE$7B8fC1kR1D-cm zjml8EKsCclNn`I0-ydiFn?VPXsziYvvjayZ8h9KzOMNmZC#7Q0U+ENToR%<RDcq<$ zTMzeQK2ItVZXnD0E=Cw*16jtYI8!qN!3Mma1Q^R3;+d_-xqv^%P@A6DGhauM{?_i4 zI>rf4NKYvj2+dAj7KT`uy27lL)RdJL%+mv==vikcYxXnE&C^cE-E**?2#mI6T;PHH z?x+2Cl}SOcVYWEq(9vL807DOpGnC2|7Mq_Yg?sBny<)pr$mmS+B#ty!Jrh86Ga7Ro zBBY$Fm#hu{*}*@@JAWLfO<6E~eI#(p+!iZ0>?q>Uy$SYYW^XW%;)Y2X#S5cFUAocj z<7q(aOoobcO#|yYOxNeS9!K0Bq0hOv`^Se{`$umNKO7w6+tI<!*6z;X9Rrb(yJIj4 z4Fc_3#GecY`+@09*A_}h)<rccIi)!fxtoiTuNLuffqy@`!AJ!qAAJ1nmxHatt(22J zufVjCmkhwH+VhM%DKyW#(=xX*u)0|j^wc*7GGsK)<<cahG|MiT=*;3`;tf3859iPz z?IzY}9%*y~XE2X5xQR3Kf&t4=r2owYMPJ#YYYd3K!h~xT4t)i`uCrL^s{v#e<Djo5 zb8x+gJ+TRjG4bJntT<(_hp5$b^)%5USm|1EX(ba8WH{-v@B!bZNLz*%&v8zt%g7u% ztzq`t7hw#`NjWq=hi*{LiLFmF;>dGxzRHt?Gdt#ZRjo*P?Zi9<Q7;+TzEYtlV^<ED zx|3C?cAb^RAWl5!g^n8xI;BMRD;uRXTpaiF@kLsOllV~?`?SvbSR_sa@7i(Cbvy5D zn2vR|z6@l?D%6_JN?+f0dHUs)^n%$q5e}aSzuaV3g*IJhlavj8A?MMaJDva2`9H;f z(f-ea_J0gM7*x=XRUa+?k30Vd@i*!GpZ5Pr|1W$0AN4=&|C9c|NB=+S{{uYk`~P*& z|8>i%)BgW5fWH4vU%aF*UeXsY>5G^2#Y_6)C4K*&{6F&l$p0h%?;ii}=+ke9Z+DIi zc_AMr|99R0ZZz6A{J#dM0P_Eq0n?8UXtSUE)TUnm(<a{n(e4il(C)i8s6BLJL3;!T 
"""Packaging script for the ``hfdb_xml_converter`` distribution.

Uses the ``src`` layout: importable packages live below ``src/``.
"""
from pathlib import Path

from setuptools import setup, find_packages

# Locate the README next to this script instead of relying on the current
# working directory, and decode it explicitly as UTF-8: the text contains
# non-ASCII characters (German umlauts), so the platform default encoding
# would mis-decode it or raise on systems whose locale is not UTF-8.
# ``read_text`` also closes the file handle, which a bare ``open().read()``
# leaves to the garbage collector.
_README = Path(__file__).resolve().parent / "README.md"

setup(
    name="hfdb_xml_converter",
    version="0.1",
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    install_requires=[],
    author="Jan Seipel",
    author_email="jan.seipel@swr.de",
    description="Ein Paket zum Parsen der XML-Datei einer HFDB Merkliste und Konvertieren in JSON",
    long_description=_README.read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
)
Datei mit Python-Script laden und als JSON ausgeben lassen: + +```python +from hfdb_xml_converter import xml_to_json + +input_file = './data/konf_data.xml' +output_dir = './output' +output_file = xml_to_json(input_file, output_dir) + +# Output + +[ + { + "id": 1, + "traegertitel": "Reaktion Bischof Ackermann auf Bericht zu Missbrauch und Bischof Stein", + "tracktitel": "Reaktion Bischof Ackermann auf Bericht zu Missbrauch und Bischof Stein", + "keywords": [ + "Bistumsgeschichte", + "Deutlichkeit", + "Mehrzahl", + ... + ], + "transkript": "Ich sage schon auch, dass es für mich bedrückend ist, dass ...", + "link": "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf&ak=43073187&ko=53232625&amo=17350850&gkonf=53232624&destination=swrhfdb1" + }, + { + "id": 2, + "traegertitel": "Unwetter am Bodensee und im Allgäu. Bericht aus Meckenbeuren", + "tracktitel": "Unwetter am Bodensee und im Allgäu. Bericht aus Meckenbeuren", + "keywords": [ + "Hochwasser", + "Pegel", + "Feuerwehr", + ...
"""Convert an HFDB watch-list ("Merkliste") XML export into JSON records."""
import json
import os
import xml.etree.ElementTree as ET

# Namespace that qualifies every element looked up in the export.
_NS = {'ns': 'http://ard.de/sad/hfdb/vollinfo'}

# PLUGPARA attributes that are interpolated into the permalink.
_PLUGPARA_ATTRS = ('instance', 'konf', 'amo', 'ak', 'gk')

_LINK_TEMPLATE = (
    "https://linkresolver2.ivz.cn.ard.de/linkresolver/resolve?context=hfdb2"
    "&hfdb-recherche/permalink?typ=vollinfoAnzeige&vollinfoArt=vollinfokonf"
    "&ak={ak}&ko={konf}&amo={amo}&gkonf={gk}&destination={instance}"
)


def _read_title(ak, missing_ak_msg, missing_rhti_msg):
    """Return the stripped RHTI text below *ak*, or None after printing why not."""
    if ak is None:
        print(missing_ak_msg)
        return None
    rhti = ak.find('ns:RHTI', _NS)
    if rhti is None or not rhti.text:
        print(missing_rhti_msg)
        return None
    return rhti.text.strip()


def _build_link(plugpara):
    """Return the permalink for *plugpara*, or None if an attribute is absent."""
    values = {name: plugpara.get(name) for name in _PLUGPARA_ATTRS}
    missing = [name for name, value in values.items() if value is None]
    if missing:
        # Formatting anyway would embed the literal text "None" in the URL
        # and yield a link that looks valid but cannot be resolved.
        print(f"PLUGPARA Element unvollständig, fehlende Attribute: {', '.join(missing)}")
        return None
    return _LINK_TEMPLATE.format(**values)


def parse_xml(xml_file):
    """Parse an HFDB XML export into a list of plain dictionaries.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        One dict per ``VI`` child of the root element, in document order.
        ``id`` (1-based running number) is always present.  The keys
        ``traegertitel``, ``tracktitel``, ``keywords``, ``transkript`` and
        ``link`` are only set when the corresponding elements exist; for
        every omitted part a notice is printed to stdout.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed.
        OSError: If *xml_file* is a path that cannot be opened.
    """
    root = ET.parse(xml_file).getroot()

    result = []
    for item_id, vi in enumerate(root.findall('ns:VI', _NS), start=1):
        item = {'id': item_id}

        # Carrier title: first AK anywhere below the VI flagged voll='j'.
        carrier_title = _read_title(
            vi.find(".//ns:AK[@voll='j']", _NS),
            "Kein AK Element mit voll='j' gefunden",
            "Kein RHTI Element in AK[@voll='j'] gefunden oder leerer Text",
        )
        if carrier_title is not None:
            item['traegertitel'] = carrier_title

        # Track title: first AK that is a direct child of the VI.
        track_title = _read_title(
            vi.find('ns:AK', _NS),
            "Kein AK Element gefunden",
            "Kein RHTI Element in AK gefunden oder leerer Text",
        )
        if track_title is not None:
            item['tracktitel'] = track_title

        # Only the first KONF element of a VI is evaluated.
        konf_elem = vi.find('.//ns:KONF', _NS)
        if konf_elem is not None:
            item['keywords'] = [
                text_elem.text.strip()
                for text_elem in konf_elem.findall('.//ns:KWE/ns:TEXT', _NS)
                if text_elem.text
            ]
            # Speaker passages are joined with single spaces.
            item['transkript'] = " ".join(
                speaker.text.strip()
                for speaker in konf_elem.findall('.//ns:TRANSCRIPT/ns:SPEAKER', _NS)
                if speaker.text
            ).strip()
        else:
            print("Kein KONF Element gefunden")

        # Permalink assembled from the attributes of the first PLUGPARA.
        plugpara = vi.find('.//ns:PLUGPARA', _NS)
        if plugpara is not None:
            link = _build_link(plugpara)
            if link is not None:
                item['link'] = link
        else:
            print("Kein PLUGPARA Element gefunden")

        result.append(item)

    return result


def xml_to_json(input_file, output_dir):
    """Convert *input_file* to JSON and write it into *output_dir*.

    The output file takes the base name of *input_file* with a ``.json``
    extension.  *output_dir* is created if it does not exist yet; an empty
    string means the current working directory.

    Args:
        input_file: Path of the XML export to read.
        output_dir: Directory that receives the JSON file.

    Returns:
        The path of the JSON file that was written.
    """
    data = parse_xml(input_file)

    # Without this the call fails with FileNotFoundError for a fresh
    # output directory.  os.makedirs('') raises, hence the guard.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, base_name + '.json')

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Die JSON-Datei wurde erstellt: {output_file}")
    return output_file