以 SDMX 格式循环 URL 并将结果合并到 R 数据帧中
Loop URL in SDMX format and combine the result into R dataframe
我正在尝试下载 World Bank's WITS data and convert them into R dataframe. The site's API(参考第 12-13 页)似乎不允许用户一次调用所有“记者”、“合作伙伴”和“产品”,所以可能需要使用他们的 XML 请求格式循环遍历国家列表(作为“记者”或“合作伙伴”):
http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG
(需要通过“记者”(国家/地区缩写)列表遍历“美国”部分)我的目标是遍历国家/地区缩写列表,为每个 运行,然后将它们绑定在一起成为一个更大的数据框。所以我引用了 并使用了下面的代码,但它并没有让我更进一步。我在下面发布了我的代码,如果有人可以看一下并分享一些技巧,我将不胜感激。
# load required packages
library(RCurl)
library(XML)
devtools::install_github("opensdmx/rsdmx")
library(rsdmx)
# if just for one reporter (usa)
myUrl <- "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"
dataset <- readSDMX(myUrl)
stats <- as.data.frame(dataset)
dim(stats)
[1] 5142 8
# looks like this
head(stats)
FREQ REPORTER PARTNER PRODUCTCODE
1 A USA ABW 01-05_Animal
2 A USA ABW 06-15_Vegetable
3 A USA ABW 16-24_FoodProd
## loop through a list of reporters (countries)
library(rvest)
# teams
reporters <- c("aus", "usa", "ukr")
# init
df <- data.frame()
# loop
for(i in reporters){
# find url
myUrl <- paste0("http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/", i,"/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG")
dataset <- readSDMX(myUrl)
stats <- as.data.frame(dataset)
# bind to dataframe
df <- rbind(df, stats)
}
# view captured data
View(df)
# NOTHING!
考虑 R 的内置 utils::download.file()
然后用 XML
解析。因为您的数据是以属性为中心的,在 <Series>
和 <Obs>
节点中没有元素文本,请考虑未记录的 xmlAttrsToDataFrame
,需要三个冒号限定符 :::
.
最后,使用像 sapply
这样的应用函数,避免 for
循环的簿记和通过在同一数据帧上迭代调用 rbind
来低效地增长循环中的对象。下面甚至包含下载和 XML 解析 tryCatch
中的潜在错误,例如 ukr
.
library(XML)
build_df <- function(i) {
url <- paste0(
"http://wits.worldbank.org/API/V1/SDMX/V21/datasource/",
"tradestats-tariff/reporter/", i,
"/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"
)
df <- NULL
tryCatch({
tmp <- tempfile() # DOWNLOAD XML TO TEMP FILE
download.file(url, tmp)
# PARSE XML AND EXTRACT ATTRIBUTES TO DATA FRAME
doc <- XML::xmlParse(tmp)
df <- cbind(
XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Series")),
XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Obs"))
)
unlink(tmp) # DELETE TEMP FILE
}, warning = function(w) print(w)
, error = function(e) print(e)
)
return(df)
}
reporters <- c("aus", "usa", "ukr")
# NAMED LIST OF DATA FRAMES
df_list <- sapply(reporters, build_df)
# COMPILE ALL INTO SINGLE DATA FRAME (CALL rbind ONCE)
final_df <- do.call(rbind, unname(df_list))
输出
df_list
head(df_list$aus)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1 A AUS AFG 50-63_TextCloth AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
2 A AUS AFG manuf AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
3 A AUS AFG Textiles AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
4 A AUS AFG Total AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
5 A AUS AFG UNCTAD-SoP3 AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
6 A AUS AGO 41-43_HidesSkin AHS-WGHTD-AVRG 2000 20 WITS-TRN
head(df_list$usa)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1 A USA ABW 01-05_Animal AHS-WGHTD-AVRG 2000 0 WITS-TRN
2 A USA ABW 06-15_Vegetable AHS-WGHTD-AVRG 2000 0 WITS-TRN
3 A USA ABW 16-24_FoodProd AHS-WGHTD-AVRG 2000 0 WITS-TRN
4 A USA ABW 27-27_Fuels AHS-WGHTD-AVRG 2000 0 WITS-TRN
5 A USA ABW 28-38_Chemicals AHS-WGHTD-AVRG 2000 0 WITS-TRN
6 A USA ABW 39-40_PlastiRub AHS-WGHTD-AVRG 2000 0 WITS-TRN
head(df_list$ukr)
NULL
final_df
head(final_df)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1 A AUS AFG 50-63_TextCloth AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
2 A AUS AFG manuf AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
3 A AUS AFG Textiles AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
4 A AUS AFG Total AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
5 A AUS AFG UNCTAD-SoP3 AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
6 A AUS AGO 41-43_HidesSkin AHS-WGHTD-AVRG 2000 20 WITS-TRN
tail(final_df)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
8966 A USA ZWE Total AHS-WGHTD-AVRG 2000 5.65257483778078 WITS-TRN
8967 A USA ZWE Transp AHS-WGHTD-AVRG 2000 0.249632882835876 WITS-TRN
8968 A USA ZWE UNCTAD-SoP1 AHS-WGHTD-AVRG 2000 29.5531507778593 WITS-TRN
8969 A USA ZWE UNCTAD-SoP2 AHS-WGHTD-AVRG 2000 2.7243509937362 WITS-TRN
8970 A USA ZWE UNCTAD-SoP3 AHS-WGHTD-AVRG 2000 5.12801568054237 WITS-TRN
8971 A USA ZWE UNCTAD-SoP4 AHS-WGHTD-AVRG 2000 0.0082396181212962 WITS-TRN
我正在尝试下载 World Bank's WITS data and convert them into R dataframe. The site's API(参考第 12-13 页)似乎不允许用户一次调用所有“记者”、“合作伙伴”和“产品”,所以可能需要使用他们的 XML 请求格式循环遍历国家列表(作为“记者”或“合作伙伴”):
http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG
(需要通过“记者”(国家/地区缩写)列表遍历“美国”部分)我的目标是遍历国家/地区缩写列表,为每个 运行,然后将它们绑定在一起成为一个更大的数据框。所以我引用了
# load required packages
library(RCurl)
library(XML)
devtools::install_github("opensdmx/rsdmx")
library(rsdmx)
# if just for one reporter (usa)
myUrl <- "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"
dataset <- readSDMX(myUrl)
stats <- as.data.frame(dataset)
dim(stats)
[1] 5142 8
# looks like this
head(stats)
FREQ REPORTER PARTNER PRODUCTCODE
1 A USA ABW 01-05_Animal
2 A USA ABW 06-15_Vegetable
3 A USA ABW 16-24_FoodProd
## loop through a list of reporters (countries)
library(rvest)
# teams
reporters <- c("aus", "usa", "ukr")
# init
df <- data.frame()
# loop
for(i in reporters){
# find url
myUrl <- paste0("http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/", i,"/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG")
dataset <- readSDMX(myUrl)
stats <- as.data.frame(dataset)
# bind to dataframe
df <- rbind(df, stats)
}
# view captured data
View(df)
# NOTHING!
考虑 R 的内置 utils::download.file()
然后用 XML
解析。因为您的数据是以属性为中心的,在 <Series>
和 <Obs>
节点中没有元素文本,请考虑未记录的 xmlAttrsToDataFrame
,需要三个冒号限定符 :::
.
最后,使用像 sapply
这样的应用函数,避免 for
循环的簿记和通过在同一数据帧上迭代调用 rbind
来低效地增长循环中的对象。下面甚至包含下载和 XML 解析 tryCatch
中的潜在错误,例如 ukr
.
library(XML)
build_df <- function(i) {
url <- paste0(
"http://wits.worldbank.org/API/V1/SDMX/V21/datasource/",
"tradestats-tariff/reporter/", i,
"/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"
)
df <- NULL
tryCatch({
tmp <- tempfile() # DOWNLOAD XML TO TEMP FILE
download.file(url, tmp)
# PARSE XML AND EXTRACT ATTRIBUTES TO DATA FRAME
doc <- XML::xmlParse(tmp)
df <- cbind(
XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Series")),
XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Obs"))
)
unlink(tmp) # DELETE TEMP FILE
}, warning = function(w) print(w)
, error = function(e) print(e)
)
return(df)
}
reporters <- c("aus", "usa", "ukr")
# NAMED LIST OF DATA FRAMES
df_list <- sapply(reporters, build_df)
# COMPILE ALL INTO SINGLE DATA FRAME (CALL rbind ONCE)
final_df <- do.call(rbind, unname(df_list))
输出
df_list
head(df_list$aus)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1 A AUS AFG 50-63_TextCloth AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
2 A AUS AFG manuf AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
3 A AUS AFG Textiles AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
4 A AUS AFG Total AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
5 A AUS AFG UNCTAD-SoP3 AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
6 A AUS AGO 41-43_HidesSkin AHS-WGHTD-AVRG 2000 20 WITS-TRN
head(df_list$usa)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1 A USA ABW 01-05_Animal AHS-WGHTD-AVRG 2000 0 WITS-TRN
2 A USA ABW 06-15_Vegetable AHS-WGHTD-AVRG 2000 0 WITS-TRN
3 A USA ABW 16-24_FoodProd AHS-WGHTD-AVRG 2000 0 WITS-TRN
4 A USA ABW 27-27_Fuels AHS-WGHTD-AVRG 2000 0 WITS-TRN
5 A USA ABW 28-38_Chemicals AHS-WGHTD-AVRG 2000 0 WITS-TRN
6 A USA ABW 39-40_PlastiRub AHS-WGHTD-AVRG 2000 0 WITS-TRN
head(df_list$ukr)
NULL
final_df
head(final_df)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1 A AUS AFG 50-63_TextCloth AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
2 A AUS AFG manuf AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
3 A AUS AFG Textiles AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
4 A AUS AFG Total AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
5 A AUS AFG UNCTAD-SoP3 AHS-WGHTD-AVRG 2000 0.46377738685431 WITS-TRN
6 A AUS AGO 41-43_HidesSkin AHS-WGHTD-AVRG 2000 20 WITS-TRN
tail(final_df)
FREQ REPORTER PARTNER PRODUCTCODE INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
8966 A USA ZWE Total AHS-WGHTD-AVRG 2000 5.65257483778078 WITS-TRN
8967 A USA ZWE Transp AHS-WGHTD-AVRG 2000 0.249632882835876 WITS-TRN
8968 A USA ZWE UNCTAD-SoP1 AHS-WGHTD-AVRG 2000 29.5531507778593 WITS-TRN
8969 A USA ZWE UNCTAD-SoP2 AHS-WGHTD-AVRG 2000 2.7243509937362 WITS-TRN
8970 A USA ZWE UNCTAD-SoP3 AHS-WGHTD-AVRG 2000 5.12801568054237 WITS-TRN
8971 A USA ZWE UNCTAD-SoP4 AHS-WGHTD-AVRG 2000 0.0082396181212962 WITS-TRN