将多个元素 XML 值解析为 R 数据帧
Parse multiple element XML values into a R dataframe
我有一个 XML 像:
<Trait ID="4711" Type="Disease">
<!-- each phenotype -->
<Name>
<ElementValue Type="Preferred">Breast-ovarian cancer, familial 1</ElementValue>
<XRef ID="Breast-ovarian+cancer%2C+familial+1/7865" DB="Genetic Alliance"/>
</Name>
<Name>
<ElementValue Type="Alternate">BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
<XRef Type="MIM" ID="604370" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0001" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0002" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0003" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0004" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0005" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0006" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0007" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0008" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0009" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0010" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0011" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0012" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0013" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0014" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0015" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0016" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0017" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0018" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0019" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0020" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0021" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0022" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0023" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0024" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0025" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0026" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0027" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0028" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0029" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0030" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0031" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0032" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0033" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0034" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0035" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0036" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0037" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">OVARIAN CANCER, SUSCEPTIBILITY TO</ElementValue>
<XRef Type="Allelic variant" ID="602667.0001" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
<XRef ID="604370" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">Breast cancer, familial 1</ElementValue>
</Name>
<Name>
<ElementValue Type="Alternate">Breast-ovarian cancer, familial 1 and 2</ElementValue>
<XRef ID="GTR000310494" DB="Laboratory of Genetics,HUSLAB"/>
</Name>
<Name>
<ElementValue Type="Alternate">BRCA1 Gene Mutation</ElementValue>
<XRef ID="GTR000501743" DB="Myriad Genetic Laboratories,Myriad Genetic Laboratories, Inc."/>
</Name>
</Trait>
我想将 XML 解析为如下数据框:
enter image description here
我想使用 r 包 XML,但问题是我的 XREF 属性值多于特征值的名称。
我可以用 for 循环解决这个问题,但这通常不是“R 方式”。我想知道是否有更简单的解决方案? (例如,使用 xpath 查询)。
我正在尝试这样的事情:
x <- do.call(rbind, xpathApply(xml_1, "//TraitSet/Trait[@ID='4711']/Name", function(node) {
trait <- xmlValue(node[["ElementValue"]])
xp <- "//TraitSet/Trait[@ID='4711']/Name/XRef"
DB <- sapply(c("ID","DB"), function(x) xpathSApply(xmltop, '//TraitSet/Trait/Name/XRef', xmlGetAttr, x))
if (is.null(DB)) DB <- NA
data.frame(trait, DB, stringsAsFactors = FALSE)
}))
但是记录乘错了。
非常感谢您的帮助!谢谢
我认为这是一个解决方案 xml2 和一个快速而冗长的 tidy verse 方法。
导入嵌套的 xml 数据时,要获得紧凑的代码并不总是那么容易。
library(xml2)
library(dplyr)
library(tidyr)
library(purrr)
# read the xml file
te <- xml2::read_xml("~/Desktop/test_so.xml")
# to move from node to node
cursor <- xml2::xml_find_all(te, ".//Name")
# i <- 1L
seq_along(cursor) %>%
# map to move along cursor
purrr::map_df(function(i){
print(i)
x <- cursor[i]
# first part is Name and Type (Alternate/Disease/Preferred)
dplyr::tibble(Type = xml2::xml_attr(xml_find_all(x, './/ElementValue'), 'Type'),
Trait = xml2::xml_text(x)) -> temp
# second part is not always here so test if it exists before, then
# if it exists, extract and compact it (nest)
if (!is.na(xml2::xml_text(xml2::xml_find_first(x, './/XRef')))){
Details <- dplyr::tibble(
DB = xml2::xml_find_all(x, './/XRef') %>%
xml2::xml_attr('DB'),
ID = xml2::xml_find_all(x, './/XRef') %>%
xml2::xml_attr('ID'),
Type_ = xml2::xml_find_all(x, './/XRef') %>%
xml2::xml_attr('Type')) %>% tidyr::nest(data = c('DB', 'ID', 'Type_'))
} else {
# if it doesn't exist, fill with an empty df and compact it too (nest)
Details <- dplyr::tibble(DB = NA, ID = NA, Type_ = NA) %>%
tidyr::nest(data = c('DB', 'ID', 'Type_'))
}
# add this new "df" column to temporary object
temp <- temp %>% dplyr::mutate(Details = Details$data)
# return
temp
}) -> te2
result <- te2 %>%
unnest(Details)
# A tibble: 44 × 5
Type Trait DB ID Type_
<chr> <chr> <chr> <chr> <chr>
1 Preferred Breast-ovarian cancer, familial 1 Genetic Alliance Breast-ovarian+cancer%2C+familial+1/7865 NA
2 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 604370 MIM
3 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0001 Allelic variant
4 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0002 Allelic variant
5 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0003 Allelic variant
6 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0004 Allelic variant
7 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0005 Allelic variant
8 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0006 Allelic variant
9 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0007 Allelic variant
10 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0008 Allelic variant
# … with 34 more rows
>
我有一个 XML 像:
<Trait ID="4711" Type="Disease">
<!-- each phenotype -->
<Name>
<ElementValue Type="Preferred">Breast-ovarian cancer, familial 1</ElementValue>
<XRef ID="Breast-ovarian+cancer%2C+familial+1/7865" DB="Genetic Alliance"/>
</Name>
<Name>
<ElementValue Type="Alternate">BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
<XRef Type="MIM" ID="604370" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0001" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0002" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0003" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0004" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0005" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0006" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0007" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0008" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0009" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0010" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0011" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0012" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0013" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0014" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0015" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0016" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0017" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0018" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0019" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0020" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0021" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0022" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0023" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0024" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0025" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0026" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0027" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0028" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0029" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0030" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0031" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0032" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0033" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0034" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0035" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0036" DB="OMIM"/>
<XRef Type="Allelic variant" ID="113705.0037" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">OVARIAN CANCER, SUSCEPTIBILITY TO</ElementValue>
<XRef Type="Allelic variant" ID="602667.0001" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
<XRef ID="604370" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">Breast cancer, familial 1</ElementValue>
</Name>
<Name>
<ElementValue Type="Alternate">Breast-ovarian cancer, familial 1 and 2</ElementValue>
<XRef ID="GTR000310494" DB="Laboratory of Genetics,HUSLAB"/>
</Name>
<Name>
<ElementValue Type="Alternate">BRCA1 Gene Mutation</ElementValue>
<XRef ID="GTR000501743" DB="Myriad Genetic Laboratories,Myriad Genetic Laboratories, Inc."/>
</Name>
</Trait>
我想将 XML 解析为如下数据框:
enter image description here
我想使用 r 包 XML,但问题是我的 XREF 属性值多于特征值的名称。 我可以用 for 循环解决这个问题,但这通常不是“R 方式”。我想知道是否有更简单的解决方案? (例如,使用 xpath 查询)。
我正在尝试这样的事情:
x <- do.call(rbind, xpathApply(xml_1, "//TraitSet/Trait[@ID='4711']/Name", function(node) {
trait <- xmlValue(node[["ElementValue"]])
xp <- "//TraitSet/Trait[@ID='4711']/Name/XRef"
DB <- sapply(c("ID","DB"), function(x) xpathSApply(xmltop, '//TraitSet/Trait/Name/XRef', xmlGetAttr, x))
if (is.null(DB)) DB <- NA
data.frame(trait, DB, stringsAsFactors = FALSE)
}))
但是记录乘错了。
非常感谢您的帮助!谢谢
我认为这是一个解决方案 xml2 和一个快速而冗长的 tidy verse 方法。
导入嵌套的 xml 数据时,要获得紧凑的代码并不总是那么容易。
library(xml2)
library(dplyr)
library(tidyr)
library(purrr)
# read the xml file
te <- xml2::read_xml("~/Desktop/test_so.xml")
# to move from node to node
cursor <- xml2::xml_find_all(te, ".//Name")
# i <- 1L
seq_along(cursor) %>%
# map to move along cursor
purrr::map_df(function(i){
print(i)
x <- cursor[i]
# first part is Name and Type (Alternate/Disease/Preferred)
dplyr::tibble(Type = xml2::xml_attr(xml_find_all(x, './/ElementValue'), 'Type'),
Trait = xml2::xml_text(x)) -> temp
# second part is not always here so test if it exists before, then
# if it exists, extract and compact it (nest)
if (!is.na(xml2::xml_text(xml2::xml_find_first(x, './/XRef')))){
Details <- dplyr::tibble(
DB = xml2::xml_find_all(x, './/XRef') %>%
xml2::xml_attr('DB'),
ID = xml2::xml_find_all(x, './/XRef') %>%
xml2::xml_attr('ID'),
Type_ = xml2::xml_find_all(x, './/XRef') %>%
xml2::xml_attr('Type')) %>% tidyr::nest(data = c('DB', 'ID', 'Type_'))
} else {
# if it doesn't exist, fill with an empty df and compact it too (nest)
Details <- dplyr::tibble(DB = NA, ID = NA, Type_ = NA) %>%
tidyr::nest(data = c('DB', 'ID', 'Type_'))
}
# add this new "df" column to temporary object
temp <- temp %>% dplyr::mutate(Details = Details$data)
# return
temp
}) -> te2
result <- te2 %>%
unnest(Details)
# A tibble: 44 × 5
Type Trait DB ID Type_
<chr> <chr> <chr> <chr> <chr>
1 Preferred Breast-ovarian cancer, familial 1 Genetic Alliance Breast-ovarian+cancer%2C+familial+1/7865 NA
2 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 604370 MIM
3 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0001 Allelic variant
4 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0002 Allelic variant
5 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0003 Allelic variant
6 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0004 Allelic variant
7 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0005 Allelic variant
8 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0006 Allelic variant
9 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0007 Allelic variant
10 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0008 Allelic variant
# … with 34 more rows
>