Python lxml 访问孙标签
Python lxml accessing grandchild tag
我有一个 xml 文件(下面有示例)。我怎么也无法访问孙子级标签,下面也附上了我的尝试。我一直在参考 lxml 的 xpath structures 文档和 this 答案,试图找到正确的写法。
有人能指出我正确的方向吗?我输出的 csv 文件里只有表头(headers),没有数据行。
谢谢
样本XML
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TrainingCenterDatabase xmlns="http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2" xmlns:ns2="http://www.garmin.com/xmlschemas/UserProfile/v2" xmlns:ns3="http://www.garmin.com/xmlschemas/ActivityExtension/v2" xmlns:ns4="http://www.garmin.com/xmlschemas/ProfileExtension/v1" xmlns:ns5="http://www.garmin.com/xmlschemas/ActivityGoals/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2 http://www.garmin.com/xmlschemas/TrainingCenterDatabasev2.xsd">
<Trackpoint>
<Time>2018-04-13T13:06:10Z</Time>
<DistanceMeters>80.14</DistanceMeters>
<Cadence>5</Cadence>
<Extensions>
<TPX xmlns="http://www.garmin.com/xmlschemas/ActivityExtension/v2">
<Speed>2.01</Speed>
<Watts>2</Watts>
</TPX>
</Extensions>
<HeartRateBpm>
<Value>96</Value>
</HeartRateBpm>
</Trackpoint>
我尝试的解决方案:
import lxml.etree as et
import csv

# Parse the TCX file; stripping comments so iteration only sees elements.
tree = et.parse('sample.tcx', parser=et.XMLParser(remove_comments=True))
root = tree.getroot()

# The Speed/Watts extension elements live in the ActivityExtension namespace.
NS2 = 'http://www.garmin.com/xmlschemas/ActivityExtension/v2'

header1 = ('Speed', 'Watts')
# newline='' prevents csv.writer from emitting blank rows on Windows.
with open('output1.csv', 'w', newline='') as g:
    writer = csv.writer(g)
    writer.writerow(header1)
    # BUG FIX: iter() matches fully-qualified *tag names*, not XPath paths.
    # '{namespace}TPX' finds every TPX element at any depth.
    for item in root.iter('{%s}TPX' % NS2):
        # Children of TPX inherit the same namespace, so find() needs it too.
        speed = item.find('{%s}Speed' % NS2).text
        watts = item.find('{%s}Watts' % NS2).text
        writer.writerow((speed, watts))
找到了 BeautifulSoup 的解决方案,它并不理想,因为我必须将所有元素都转换为字符串,然后去除字符,但现在就可以了。
from bs4 import BeautifulSoup as Soup

# Collect every <Time> element from the TCX file.
time = []
# BUG FIX: use a context manager so the file handle is always closed
# (the original open() was never closed).
with open('sample.tcx', 'r') as data:
    soup = Soup(data, 'lxml-xml')
for i in soup.find_all('Time'):
    time.append(i)
对于任何感兴趣的人,这就是我设法解析和清理 tcx 文件格式的方法。
from bs4 import BeautifulSoup as Soup
import pandas as pd

# Parse the TCX file once; the context manager closes the handle
# (the original left the file open and also called g.close without
# parentheses, so one output file was never closed at all).
with open('jc180512.tcx', 'r') as data:
    soup = Soup(data, 'lxml-xml')

# BUG FIX: take each tag's *text* directly instead of stringifying whole
# elements and "cleaning" them afterwards with lstrip/rstrip.  Those
# methods strip character *sets*, not prefixes/suffixes, so e.g.
# rstrip('</Time>') can silently eat trailing digits from real values.
time = [tag.text for tag in soup.find_all('Time')]
dist = [tag.text for tag in soup.find_all('DistanceMeters')]
caden = [tag.text for tag in soup.find_all('Cadence')]
speed = [tag.text for tag in soup.find_all('Speed')]
watts = [tag.text for tag in soup.find_all('Watts')]
hrbpm = [tag.text for tag in soup.find_all('Value')]

# Heart rate is not measured until the 6th cycle, so pad the front
# of the column with zeros to keep the rows aligned.
heartrate = ['0'] * 6 + hrbpm

# Normalise the ISO-8601 timestamps: drop the trailing 'Z' and replace the
# 'T' separator with a space so date and time become separate CSV fields
# in the final split below.
time = [t.rstrip('Z').replace('T', ' ') for t in time]

# Build the table in memory — no intermediate test.csv / test0.csv needed.
# Series of unequal length are padded with NaN by concat, then filled.
all_data = pd.concat(
    [pd.Series(col) for col in (time, dist, caden, speed, watts, heartrate)],
    axis=1,
)
all_data = all_data.fillna(0).astype(str)
all_data.to_csv('final.csv', index=False)

# Split the "date time" field into two columns in the final output.
with open('final.csv') as inf, open('output_1.csv', 'w') as outf:
    for line in inf:
        outf.write(','.join(line.split(' ')))
我有一个 xml 文件(下面有示例)。我怎么也无法访问孙子级标签,下面也附上了我的尝试。我一直在参考 lxml 的 xpath structures 文档和 this 答案,试图找到正确的写法。
有人能指出我正确的方向吗?我输出的 csv 文件里只有表头(headers),没有数据行。
谢谢
样本XML
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TrainingCenterDatabase xmlns="http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2" xmlns:ns2="http://www.garmin.com/xmlschemas/UserProfile/v2" xmlns:ns3="http://www.garmin.com/xmlschemas/ActivityExtension/v2" xmlns:ns4="http://www.garmin.com/xmlschemas/ProfileExtension/v1" xmlns:ns5="http://www.garmin.com/xmlschemas/ActivityGoals/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2 http://www.garmin.com/xmlschemas/TrainingCenterDatabasev2.xsd">
<Trackpoint>
<Time>2018-04-13T13:06:10Z</Time>
<DistanceMeters>80.14</DistanceMeters>
<Cadence>5</Cadence>
<Extensions>
<TPX xmlns="http://www.garmin.com/xmlschemas/ActivityExtension/v2">
<Speed>2.01</Speed>
<Watts>2</Watts>
</TPX>
</Extensions>
<HeartRateBpm>
<Value>96</Value>
</HeartRateBpm>
</Trackpoint>
我尝试的解决方案:
import lxml.etree as et
import csv

# Parse the TCX file; stripping comments so iteration only sees elements.
tree = et.parse('sample.tcx', parser=et.XMLParser(remove_comments=True))
root = tree.getroot()

# The Speed/Watts extension elements live in the ActivityExtension namespace.
NS2 = 'http://www.garmin.com/xmlschemas/ActivityExtension/v2'

header1 = ('Speed', 'Watts')
# newline='' prevents csv.writer from emitting blank rows on Windows.
with open('output1.csv', 'w', newline='') as g:
    writer = csv.writer(g)
    writer.writerow(header1)
    # BUG FIX: iter() matches fully-qualified *tag names*, not XPath paths.
    # '{namespace}TPX' finds every TPX element at any depth.
    for item in root.iter('{%s}TPX' % NS2):
        # Children of TPX inherit the same namespace, so find() needs it too.
        speed = item.find('{%s}Speed' % NS2).text
        watts = item.find('{%s}Watts' % NS2).text
        writer.writerow((speed, watts))
找到了 BeautifulSoup 的解决方案,它并不理想,因为我必须将所有元素都转换为字符串,然后去除字符,但现在就可以了。
from bs4 import BeautifulSoup as Soup

# Collect every <Time> element from the TCX file.
time = []
# BUG FIX: use a context manager so the file handle is always closed
# (the original open() was never closed).
with open('sample.tcx', 'r') as data:
    soup = Soup(data, 'lxml-xml')
for i in soup.find_all('Time'):
    time.append(i)
对于任何感兴趣的人,这就是我设法解析和清理 tcx 文件格式的方法。
from bs4 import BeautifulSoup as Soup
import pandas as pd

# Parse the TCX file once; the context manager closes the handle
# (the original left the file open and also called g.close without
# parentheses, so one output file was never closed at all).
with open('jc180512.tcx', 'r') as data:
    soup = Soup(data, 'lxml-xml')

# BUG FIX: take each tag's *text* directly instead of stringifying whole
# elements and "cleaning" them afterwards with lstrip/rstrip.  Those
# methods strip character *sets*, not prefixes/suffixes, so e.g.
# rstrip('</Time>') can silently eat trailing digits from real values.
time = [tag.text for tag in soup.find_all('Time')]
dist = [tag.text for tag in soup.find_all('DistanceMeters')]
caden = [tag.text for tag in soup.find_all('Cadence')]
speed = [tag.text for tag in soup.find_all('Speed')]
watts = [tag.text for tag in soup.find_all('Watts')]
hrbpm = [tag.text for tag in soup.find_all('Value')]

# Heart rate is not measured until the 6th cycle, so pad the front
# of the column with zeros to keep the rows aligned.
heartrate = ['0'] * 6 + hrbpm

# Normalise the ISO-8601 timestamps: drop the trailing 'Z' and replace the
# 'T' separator with a space so date and time become separate CSV fields
# in the final split below.
time = [t.rstrip('Z').replace('T', ' ') for t in time]

# Build the table in memory — no intermediate test.csv / test0.csv needed.
# Series of unequal length are padded with NaN by concat, then filled.
all_data = pd.concat(
    [pd.Series(col) for col in (time, dist, caden, speed, watts, heartrate)],
    axis=1,
)
all_data = all_data.fillna(0).astype(str)
all_data.to_csv('final.csv', index=False)

# Split the "date time" field into two columns in the final output.
with open('final.csv') as inf, open('output_1.csv', 'w') as outf:
    for line in inf:
        outf.write(','.join(line.split(' ')))