XPath:选取两个连续的元素并将它们转换为一个数据框

XPath, select two consecutive elements and transform them into one data frame

总的来说,我对 XPath 和 R 都还很陌生,我正在尝试使用 XPath 将 XML 文件转换为 R 中的数据框。在一些帮助下,我已经设法转换了 XML 中的大部分信息。但是,现在我正在尝试获取两个连续的元素并将它们合并到一个数据框中。不知何故,我似乎总是做不对。

这是 xml 数据的摘录:

</customer-bootstrap-data>
  <customer-bootstrap-data id="970911" customerName="HighIncome-1_4" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.1124173640233721,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.37606842556525066,-0.0,-0.0,-0.038684343289247636,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.8490012729862713,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data>
  <customer-bootstrap-data id="970912" customerName="HighIncome-2_17" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.21395291779884928,-0.0,-0.0,-1.3581716633726693,-0.0,-0.0,-2.8140822306420716,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0221045637055397,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-3.3,-3.223543705462774,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.153329511039221,-0.0,-0.0,-0.0,-0.0,-0.0,-0.820425411761537,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.7054631085029754,-0.0,-0.7130641168720118,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-2.5003661751788435,-0.0,-0.0,-3.3,-3.3,-0.6606989045692728,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.20818145620010853,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0493154269844851,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.0,-1.041919182358086,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.5334016276259916,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data>

我可以使用以下代码为第一个元素生成一个表格:

# Collect the attributes of every <customer-bootstrap-data> node, transpose the
# result so that each node becomes one row, and store it as a data frame.
customerBoot <- data.frame(
  t(xpathSApply(doc = xml, path = "//customer-bootstrap-data", xmlAttrs))
)

包含 id、customerName 和 powerType。但我希望每个客户 ID 也包含净使用量。

以下代码选择了我想要的所有信息,只是不允许我将其转换为数据框。

# Select every <customer-bootstrap-data> node (including its children) as a
# node set rather than a data frame.
customerBoot <- getNodeSet(doc = xml, path = "//customer-bootstrap-data")

有任何想法吗?我正在寻找一个快速的解决方案。

谢谢!

如果这种做法存在速度问题,请告诉我(以这种方式遍历巨大的 XML 文档有时会很慢):

library(XML)
library(purrr)

fil <- '<dat><customer-bootstrap-data id="970911" customerName="HighIncome-1_4" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.1124173640233721,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.37606842556525066,-0.0,-0.0,-0.038684343289247636,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.8490012729862713,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data>
  <customer-bootstrap-data id="970912" customerName="HighIncome-2_17" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.21395291779884928,-0.0,-0.0,-1.3581716633726693,-0.0,-0.0,-2.8140822306420716,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0221045637055397,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-3.3,-3.223543705462774,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.153329511039221,-0.0,-0.0,-0.0,-0.0,-0.0,-0.820425411761537,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.7054631085029754,-0.0,-0.7130641168720118,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-2.5003661751788435,-0.0,-0.0,-3.3,-3.3,-0.6606989045692728,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.20818145620010853,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0493154269844851,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.0,-1.041919182358086,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.5334016276259916,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data></dat>'

doc <- xmlParse(fil)

# One row per <customer-bootstrap-data> node, one column per attribute.
customerBoot <- xpathSApply(doc = doc, path = "//customer-bootstrap-data", xmlAttrs)
customerBoot <- data.frame(t(customerBoot), stringsAsFactors = FALSE)

# For each id, grab the matching netUsage node and parse its comma-separated
# text into a numeric vector. The original used purrr::by_row(), which has been
# moved out of purrr (into purrrlyr), so plain base R is used here instead.
usage <- lapply(customerBoot$id, function(id) {
  path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", id)
  vals <- strsplit(xpathSApply(doc = doc, path = path, xmlValue), ",")[[1]]
  as.numeric(vals)
})

# Spread the values into wide columns X1..Xn. Indexing past the end of a
# shorter vector pads it with NA, so ragged rows still bind into one matrix.
n_max <- max(lengths(usage))
usage <- do.call(rbind, lapply(usage, function(v) v[seq_len(n_max)]))
colnames(usage) <- sprintf("X%d", seq_len(n_max))
customerBoot <- cbind(customerBoot, usage)

# limiting the "str()" equivalent "glimpse()" to 15 columns since there are >300 of them:

dplyr::glimpse(customerBoot[, 1:15])

## Observations: 2
## Variables: 15
## $ id           (chr) "970911", "970912"
## $ customerName (chr) "HighIncome-1_4", "HighIncome-2_17"
## $ powerType    (chr) "ELECTRIC_VEHICLE", "ELECTRIC_VEHICLE"
## $ X1           (dbl) 0, 0
## $ X2           (dbl) 0, 0
## $ X3           (dbl) 0, 0
## $ X4           (dbl) 0, 0
## $ X5           (dbl) 0, 0
## $ X6           (dbl) 0, 0
## $ X7           (dbl) 0, 0
## $ X8           (dbl) 0.0000000, -0.2139529
## $ X9           (dbl) 0, 0
## $ X10          (dbl) 0, 0
## $ X11          (dbl) 0.000000, -1.358172
## $ X12          (dbl) 0, 0

xml2的替代方法:

library(purrr)
library(xml2)

doc <- read_xml(fil)

# Take the attribute vector of every <customer-bootstrap-data> node and turn
# each one into a single-row data frame (attributes followed by X1..Xn usage
# values); map_df() then row-binds those rows into one data frame.
xml_find_all(doc, "//customer-bootstrap-data") %>%
  xml_attrs() %>%
  map_df(function(x) {
    path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", x["id"])
    # xml_find_first() is the replacement for the deprecated xml_find_one().
    vals <- strsplit(xml_text(xml_find_first(doc, path)), ",")[[1]]
    # seq_along() keeps the name vector the same length as vals even when
    # netUsage is empty (1:length(vals) would produce c(1, 0)).
    vals <- setNames(as.numeric(vals), sprintf("X%d", seq_along(vals)))
    rbind.data.frame(c(as.list(x), as.list(vals)), stringsAsFactors = FALSE)
  })

更新:

netUsage 长度不一致时,这是一种方法:

# Build one named list per customer row and let rbindlist() pad the shorter
# rows with NA (fill = TRUE), so netUsage vectors of differing length still
# fit into a single table.
# NOTE(review): apply() hands each row over as a character vector; this assumes
# customerBoot holds only the attribute columns at this point -- TODO confirm.
data.table::rbindlist(apply(customerBoot, 1, function(x) {

  path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", x["id"])
  vals <- as.numeric(strsplit(xpathSApply(doc = doc, path = path, xmlValue), ",")[[1]])
  # seq_along() keeps names and values the same length even when netUsage is
  # empty; 1:length(vals) would yield c(1, 0) and make the naming step fail.
  names(vals) <- sprintf("X%d", seq_along(vals))
  c(as.list(x), as.list(vals))

}), fill = TRUE)