使用循环从R中的列表中提取数据

Using loop to extract data from list in R

我是 R 的新手。我昨天抓取了一个需要登录的网站,页面是 xml 格式,如下所示。

<result status="success">
  <code>1</code>
  <note>success</note>
  <teacherList>
    <teacher id="D95">
      <name>Mary</name>
      <department id="420">
        <name>Math</name>
      </department>
      <department id="421">
        <name>Statistics</name>
      </department>
    </teacher>
    <teacher id="D73">
      <name>Adam</name>
      <department id="412">
        <name>English</name>
      </department>
    </teacher>
  </teacherList>
</result> 

最近我刚刚将 XML 转换为列表。

library(XML)
library(rvest)
library(plyr)
library(dplyr)
library(httr)
library(pipeR)
library(xml2)

# Log in through the first HTML form on the page, then turn the XML response
# into a nested R list.
# NOTE(review): html_session(), set_values() and submit_form() are the
# pre-1.0 rvest names; rvest >= 1.0 calls them session(), html_form_set()
# and session_submit() -- confirm against the installed version.
url.address <- "http://xxxxxxxxxxxxxxxxx"
session <- html_session(url.address)

# Take the first form found on the page.
form <- html_form(read_html(url.address))[[1]]

# Fill in the userid and Password fields (placeholder values here).
filled_form <- set_values(form, userid = "id", Password = "password")

s <- submit_form(session, filled_form)
z <- read_xml(s$response) # parse the response as XML
z1 <- as_list(z)          # convert the XML document to a nested list
z2 <- z1$teacherList      # keep only the teacherList element

现在我需要从列表中提取数据并将其作为数据框。顺便说一句,有些人属于2个部门,但有些人只属于1个。列表z2的一部分如下所示:

z2[[1]]

$name
$name[[1]]
[1] "Mary"


$department
$department$name
$department$name[[1]]
[1] "Math"


attr(,"id")
[1] "420"

$department
$department$name
$department$name[[1]]
[1] "statistics"


attr(,"id")
[1] "421"

attr(,"id")
[1] "D95"

我一一提取的时候,时间太长了:

attr(z2[[1]],"id")

"D95"

z2[[1]][[1]][[1]] 

"Mary"

z2[[1]][[2]][[1]][[1]] 

"Math"

attr(z2[[1]][[2]], "id") 

"420"

z2[[1]][[3]][[1]][[1]] 

"statistics"

attr(z2[[1]][[3]], "id")

"421"

attr(z2[[2]],"id")

"D73"

z2[[2]][[1]][[1]] 

"Adam"

z2[[2]][[2]][[1]][[1]]

"English"

attr(z2[[2]][[2]],"id")

"412"

所以我试着写了一个循环:

# Attempt to collect, for teacher x and department slot y, the teacher id,
# teacher name, department name and department id.
# x indexes the teachers in z2; y indexes positions 2 and 3 inside a teacher,
# i.e. the department entries that follow the name at position 1.
for (x in 1:2){
  for (y in 2:3){
  # Teacher id (attribute on the teacher entry) and teacher name.
  a <- attr(z2[[x]],"id")
  b <- z2[[x]][[1]][[1]]
  # Department name and department id. A teacher with a single department has
  # no third element, so z2[[x]][[3]] raises a subscript error (x = 2, y = 3).
  d <- z2[[x]][[y]][[1]][[1]]
  e <- attr(z2[[x]][[y]],"id")
  # print() echoes each value and returns it; g is overwritten on every
  # iteration, so no rows are accumulated across the loop.
  g <- cbind(print(a),print(b),print(d),print(e))
  }}

但它根本不起作用,因为有些人只属于一个部门。我预期的结果:

如有任何建议,我们将不胜感激!

dput(head(z2, 10))

structure(list(teacher = structure(list(name = list("Mary"), 
    department = structure(list(name = list("Math")), .Names = "name", id = "420"), 
    department = structure(list(name = list("statistics")), .Names = "name", id = "421")), .Names = c("name", 
"department", "department"), id = "D95"), teacher = structure(list(
    name = list("Adam"), department = structure(list(name = list(
        "English")), .Names = "name", id = "412")), .Names = c("name", 
"department"), id = "D73"), teacher = structure(list(name = list(
    "Kevin"), department = structure(list(name = list("Chinese")), .Names = "name", id = "201")), .Names = c("name", 
"department"), id = "D101"), teacher = structure(list(name = list(
    "Nana"), department = structure(list(name = list("Science")), .Names = "name", id = "205")), .Names = c("name", 
"department"), id = "D58"), teacher = structure(list(name = list(
    "Nelson"), department = structure(list(name = list("Music")), .Names = "name", id = "370")), .Names = c("name", 
"department"), id = "D14"), teacher = structure(list(name = list(
    "Esther"), department = structure(list(name = list("Medicine")), .Names = "name", id = "361")), .Names = c("name", 
"department"), id = "D28"), teacher = structure(list(name = list(
    "Mia"), department = structure(list(name = list("Chemistry")), .Names = "name", id = "326")), .Names = c("name", 
"department"), id = "D17"), teacher = structure(list(name = list(
    "Jack"), department = structure(list(name = list("German")), .Names = "name", id = "306")), .Names = c("name", 
"department"), id = "D80"), teacher = structure(list(name = list(
    "Tom"), department = structure(list(name = list("French")), .Names = "name", id = "360")), .Names = c("name", 
"department"), id = "D53"), teacher = structure(list(name = list(
    "Allen"), department = structure(list(name = list("Spanish")), .Names = "name", id = "322")), .Names = c("name", 
"department"), id = "D18")), .Names = c("teacher", "teacher", 
"teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher", 
"teacher"))

XML 在 R 中通常很容易处理。

使用 library(XML) 和 library(plyr) 可以避免编写循环:

第一步是读取 XML

我将您的示例 XML 保存为名为 Demo.xml 的 .xml 文件。也可以用 xmlParse 直接解析一个 URL。

# Parse the saved XML file into an XML document object.
rawXML <- xmlParse("Demo.xml")

然后将XML转换为列表:

# Convert the parsed document into a nested R list.
xmlList <- xmlToList(rawXML)

然后使用 plyr 将列表转换为数据框:

# Apply data.frame() to each top-level element of the list and stack the
# results; ldply() records the element names in an `.id` column.
df1 <- ldply(xmlList, data.frame)

这是一般流程,如果您提供示例数据,我们可以对其进行优化以匹配您的特定用例。

这是生成的摘要输出。这是您要找的吗?

 str(df1)
'data.frame':   4 obs. of  12 variables:
 $ .id                        : chr  "code" "note" "teacherList" ".attrs"
 $ X..i..                     : Factor w/ 2 levels "1","success": 1 2 NA 2
 $ teacher.name               : Factor w/ 1 level "Mary": NA NA 1 NA
 $ teacher.department.name    : Factor w/ 1 level "Math": NA NA 1 NA
 $ teacher.department..attrs  : Factor w/ 1 level "420": NA NA 1 NA
 $ teacher.department.name.1  : Factor w/ 1 level "Statistics": NA NA 1 NA
 $ teacher.department..attrs.1: Factor w/ 1 level "421": NA NA 1 NA
 $ teacher..attrs             : Factor w/ 1 level "D95": NA NA 1 NA
 $ teacher.name.1             : Factor w/ 1 level "Adam": NA NA 1 NA
 $ teacher.department.name.2  : Factor w/ 1 level "English": NA NA 1 NA
 $ teacher.department..attrs.2: Factor w/ 1 level "412": NA NA 1 NA
 $ teacher..attrs.1           : Factor w/ 1 level "D73": NA NA 1 NA

这个写法有点绕,但我认为它大致符合提问者在帖子早期版本中给出的预期输出。我不得不在 lapply 的函数内部使用 sapply 来提取第二个 ID 变量。

# Build one data frame with a row per (teacher, department) pair from z2.
# Each element of z2 is one teacher entry: a list of child elements that
# carries the teacher id as its "id" attribute.
do.call(
  rbind, # stack the per-teacher data frames produced by lapply()
  lapply(
    unname(z2), # strip the outer "teacher" names before iterating
    function(x) {
      # Flatten the entry into a named vector; the names become "name" and
      # "department.name".
      flat <- unlist(x)
      data.frame(
        name = flat[names(flat) == "name"],            # teacher name
        dept = flat[names(flat) == "department.name"], # department name(s)
        id = attr(x, "id"),                            # id of the teacher entry
        # "id" attribute of each child element; children without one give
        # NULL and disappear in unlist().
        id2 = unlist(sapply(x, attr, "id")),
        row.names = NULL # discard the names inherited from the vectors
      )
    }
  )
)

这将返回:

     name       dept   id id2
1    Mary       Math  D95 420
2    Mary statistics  D95 421
3    Adam    English  D73 412
4   Kevin    Chinese D101 201
5    Nana    Science  D58 205
6  Nelson      Music  D14 370
7  Esther   Medicine  D28 361
8     Mia  Chemistry  D17 326
9    Jack     German  D80 306
10    Tom     French  D53 360
11  Allen    Spanish  D18 322

第二个列表的结构在几个方面与第一个示例不同。第一,少了一层嵌套:新列表的深度比最初示例少一层,相当于您提供的是初始列表中的 z2[[1]]。第二,第二个示例缺少我最初称为 id 的内容(例如 D95、D101 这样的值)。

对原始代码稍作修改后,就可以用它处理这个列表:

# Same extraction for z3, which is one nesting level shallower than z2:
# wrapping it in list() supplies the outer level the function iterates over.
lapply(
  list(z3),
  function(x) {
    # Flatten the entry into a named vector; the names become "name" and
    # "department.name".
    flat <- unlist(x)
    data.frame(
      name = flat[names(flat) == "name"],            # teacher name
      dept = flat[names(flat) == "department.name"], # department name(s)
      # id=attr(x, "id"), # extract one id
      # "id" attribute of each child element; children without one give NULL
      # and disappear in unlist().
      id2 = unlist(sapply(x, attr, "id")),
      row.names = NULL # discard the names inherited from the vectors
    )
  }
)

这些改动对应前面提到的两点:lapply 的第一个参数由 z2 换成了 list(z3),以补回所需的列表深度;此外,函数内部的 id=attr(x, "id"), 一行被注释掉了,因为第二个示例中不存在 id。