如何将大型 XML 文件转换为 CSV 文件?
How do I convert a large XML file to a CSV file?
由于需要构建知识图谱,我想将一个3.39GB的XML数据文件转换成CSV格式,但是当我尝试Python时,我的电脑运行内存不足。
如下代码:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import os
def xml_to_csv(path):
xml_list = []
for xml_file in glob.glob(path + '/Badges.xml'):
print('**********************************')
print(xml_file)
tree = ET.parse(xml_file)
root = tree.getroot()
# for member in root.findall('row'):
for member in root.findall('row'):
value = (member.attrib.get('Id', ''),
member.attrib.get('UserId', ''),
member.attrib.get('Name', ''),
member.attrib.get('Date', ''),
member.attrib.get('Class', ''),
member.attrib.get('TagBased', ''),
)
xml_list.append(value)
column_name = ['Id', 'UserId', 'Name', 'Date', 'Class','TagBased']
xml_df = pd.DataFrame(xml_list, columns = column_name)
# print('----------------')
# print(xml_df)
return xml_df
if __name__ == "__main__":
xml_path = 'D:/【论文】/【数据集】/1-Whosebug数据集-2008-2021/whosebug.com-Badges'
print(os.path.exists(xml_path))
xml_df = xml_to_csv(xml_path)
print('**********************************')
print(xml_df)
xml_df.to_csv('D:/【论文】/【数据集】/1-Whosebug数据集-2008-2021/whosebug.com-Badges/Badges.csv', index = None)
print('Successfully converted xml to csv.')
XML的流式处理有多种技术。其中之一是 XSLT 3.0,您可以在其中编写
<xsl:mode streamable="yes"/>
<xsl:output method="text"/>
<xsl:template match="row">
<xsl:value-of select="@Id, @UserId, @Name, @Class, @TagBased"
separator=","/>
<xsl:text>
</xsl:text>
</xsl:template>
由于需要构建知识图谱,我想将一个3.39GB的XML数据文件转换成CSV格式,但是当我尝试Python时,我的电脑运行内存不足。
如下代码:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import os
def xml_to_csv(path):
xml_list = []
for xml_file in glob.glob(path + '/Badges.xml'):
print('**********************************')
print(xml_file)
tree = ET.parse(xml_file)
root = tree.getroot()
# for member in root.findall('row'):
for member in root.findall('row'):
value = (member.attrib.get('Id', ''),
member.attrib.get('UserId', ''),
member.attrib.get('Name', ''),
member.attrib.get('Date', ''),
member.attrib.get('Class', ''),
member.attrib.get('TagBased', ''),
)
xml_list.append(value)
column_name = ['Id', 'UserId', 'Name', 'Date', 'Class','TagBased']
xml_df = pd.DataFrame(xml_list, columns = column_name)
# print('----------------')
# print(xml_df)
return xml_df
if __name__ == "__main__":
xml_path = 'D:/【论文】/【数据集】/1-Whosebug数据集-2008-2021/whosebug.com-Badges'
print(os.path.exists(xml_path))
xml_df = xml_to_csv(xml_path)
print('**********************************')
print(xml_df)
xml_df.to_csv('D:/【论文】/【数据集】/1-Whosebug数据集-2008-2021/whosebug.com-Badges/Badges.csv', index = None)
print('Successfully converted xml to csv.')
XML的流式处理有多种技术。其中之一是 XSLT 3.0,您可以在其中编写
<xsl:mode streamable="yes"/>
<xsl:output method="text"/>
<xsl:template match="row">
<xsl:value-of select="@Id, @UserId, @Name, @Class, @TagBased"
separator=","/>
<xsl:text>
</xsl:text>
</xsl:template>