xml 中相同标签彼此重叠时如何删除第一个标签
How to delete the first tag when two identical tags follow each other in XML
朋友们,当我有两个或三个标签 <word>
时,它会影响我对文件的阅读,如下例所示
<word wordid="&lt;bon_Aljanuwb_1" value="إبْن الجَنُوب" synsetid="Aljanuwbiy_n1AR" frequency="1" corpus="manchester20060717" authorshipid="12030" />
<word wordid="&lt;bon__1" value="إبْن البَلَد" synsetid="&lt;ibon_Albalad_n1AR" frequency="" corpus="" authorshipid="12031" />
<word wordid="&lt;bonap_1" value="إبْنَة" synsetid="&lt;ibonap_n2AR" frequency="1" corpus="manchester20060717" authorshipid="12032" />
<word wordid="&lt;bonu__1" value="إبْنُ عُرْس" synsetid="&lt;ibonu_Euros_n1AR" frequency="" corpus="" authorshipid="12033" />
<word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
我想做的是删除所有多余的 <word> 标签,只保留后面紧跟 <form> 标签的那个 <word> 标签,从而得到如下输出:
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
不过,下面几行给出了文件中更完整的片段;正确的结果应当不包含其中的第一行。有什么想法吗?
<word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
<word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036" />
<form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036" />
您可以在 lxml 中使用 XPath 选中所有“紧随其后的第一个兄弟元素也是 word”的 word 元素,然后将其删除。
示例...
XML 输入 (input.xml)
<?xml version="1.0" encoding="UTF-8"?>
<doc>
  <test>
    <word wordid="&lt;bon_Aljanuwb_1" value="إبْن الجَنُوب" synsetid="Aljanuwbiy_n1AR" frequency="1" corpus="manchester20060717" authorshipid="12030" />
    <word wordid="&lt;bon__1" value="إبْن البَلَد" synsetid="&lt;ibon_Albalad_n1AR" frequency="" corpus="" authorshipid="12031" />
    <word wordid="&lt;bonap_1" value="إبْنَة" synsetid="&lt;ibonap_n2AR" frequency="1" corpus="manchester20060717" authorshipid="12032" />
    <word wordid="&lt;bonu__1" value="إبْنُ عُرْس" synsetid="&lt;ibonu_Euros_n1AR" frequency="" corpus="" authorshipid="12033" />
    <word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
  </test>
  <test>
    <word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
    <word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036" />
    <form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036" />
  </test>
</doc>
Python
from lxml import etree
# Parse the whole input document into an element tree.
tree = etree.parse("input.xml")

# Select every <word> whose immediately following sibling element is also a
# <word>. The last <word> of each run (the one directly followed by <form>,
# or by nothing) does not match and is therefore kept.
for to_remove in tree.xpath("//word[following-sibling::*[1][self::word]]"):
    # xpath() has already returned the complete list of matches, so detaching
    # nodes from the tree inside the loop does not disturb the iteration.
    to_remove.getparent().remove(to_remove)

# Serialize the pruned tree as UTF-8 with an XML declaration.
tree.write("output.xml", encoding="utf-8", xml_declaration=True)
XML 输出 (output.xml)
<?xml version='1.0' encoding='UTF-8'?>
<doc>
  <test>
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035"/>
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035"/>
  </test>
  <test>
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035"/>
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035"/>
    <word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036"/>
    <form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036"/>
  </test>
</doc>
朋友们,当我有两个或三个标签 <word>
时,它会影响我对文件的阅读,如下例所示
<word wordid="&lt;bon_Aljanuwb_1" value="إبْن الجَنُوب" synsetid="Aljanuwbiy_n1AR" frequency="1" corpus="manchester20060717" authorshipid="12030" />
<word wordid="&lt;bon__1" value="إبْن البَلَد" synsetid="&lt;ibon_Albalad_n1AR" frequency="" corpus="" authorshipid="12031" />
<word wordid="&lt;bonap_1" value="إبْنَة" synsetid="&lt;ibonap_n2AR" frequency="1" corpus="manchester20060717" authorshipid="12032" />
<word wordid="&lt;bonu__1" value="إبْنُ عُرْس" synsetid="&lt;ibonu_Euros_n1AR" frequency="" corpus="" authorshipid="12033" />
<word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
我想做的是删除所有多余的 <word> 标签,只保留后面紧跟 <form> 标签的那个 <word> 标签,从而得到如下输出:
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
不过,下面几行给出了文件中更完整的片段;正确的结果应当不包含其中的第一行。有什么想法吗?
<word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
<word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036" />
<form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036" />
您可以在 lxml 中使用 XPath 选中所有“紧随其后的第一个兄弟元素也是 word”的 word 元素,然后将其删除。
示例...
XML 输入 (input.xml)
<?xml version="1.0" encoding="UTF-8"?>
<doc>
  <test>
    <word wordid="&lt;bon_Aljanuwb_1" value="إبْن الجَنُوب" synsetid="Aljanuwbiy_n1AR" frequency="1" corpus="manchester20060717" authorshipid="12030" />
    <word wordid="&lt;bon__1" value="إبْن البَلَد" synsetid="&lt;ibon_Albalad_n1AR" frequency="" corpus="" authorshipid="12031" />
    <word wordid="&lt;bonap_1" value="إبْنَة" synsetid="&lt;ibonap_n2AR" frequency="1" corpus="manchester20060717" authorshipid="12032" />
    <word wordid="&lt;bonu__1" value="إبْنُ عُرْس" synsetid="&lt;ibonu_Euros_n1AR" frequency="" corpus="" authorshipid="12033" />
    <word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
  </test>
  <test>
    <word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
    <word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036" />
    <form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036" />
  </test>
</doc>
Python
from lxml import etree
# Parse the whole input document into an element tree.
tree = etree.parse("input.xml")

# Select every <word> whose immediately following sibling element is also a
# <word>. The last <word> of each run (the one directly followed by <form>,
# or by nothing) does not match and is therefore kept.
for to_remove in tree.xpath("//word[following-sibling::*[1][self::word]]"):
    # xpath() has already returned the complete list of matches, so detaching
    # nodes from the tree inside the loop does not disturb the iteration.
    to_remove.getparent().remove(to_remove)

# Serialize the pruned tree as UTF-8 with an XML declaration.
tree.write("output.xml", encoding="utf-8", xml_declaration=True)
XML 输出 (output.xml)
<?xml version='1.0' encoding='UTF-8'?>
<doc>
  <test>
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035"/>
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035"/>
  </test>
  <test>
    <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035"/>
    <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035"/>
    <word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036"/>
    <form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036"/>
  </test>
</doc>