将 YAML 文件读取为 R 中的数据框时出错
Error in reading YAML files as data frame in R
我正在尝试使用下面的命令读取位于此处的 YAML 文件,但两条命令都没有得到所需的输出格式(即类似于此处 CSV 文件的格式)。YAML 文件中数据的说明见此处;如需快速了解 YAML 文件的格式,可以直接参考本文末尾给出的格式。
我尝试使用这些命令加载数据,但没有成功。谁能指导我如何将 YAML 文件中的数据正确加载为 R 数据框(data frame),或按照上面指定的输出格式转换为 csv?
# Attempt 1: parse the YAML file and keep whatever yaml.load_file() returns.
cric <- yaml.load_file("911047.yaml")
# Attempt 2: parse a different file and wrap the parsed result directly in
# data.frame(). Note that this overwrites the `cric` created by attempt 1.
cric <- data.frame(yaml.load_file("211028.yaml"))
我在下面给出了数据的高级格式供您快速参考(抱歉,粘贴到这里时原始 YAML 代码格式已经消失,我想不出一种方法来按原样粘贴和重新格式化):
meta:
data_version: 0.6
created: 2013-02-22
revision: 1
info:
city: Southampton
dates:
- 2005-06-13
match_type: T20
outcome:
by:
runs: 100
winner: England
overs: 20
player_of_match:
- KP Pietersen
teams:
- England
- Australia
toss:
decision: bat
winner: England
umpires:
- NJ Llong
- JW Lloyds
venue: The Rose Bowl
innings:
- 1st innings:
team: England
deliveries:
- 0.1:
batsman: ME Trescothick
bowler: B Lee
non_striker: GO Jones
runs:
batsman: 0
extras: 0
total: 0
可以通过包 reshape2 中的 melt 解决
下面的代码会有所帮助
# Convert a single Cricsheet YAML match file into a ball-by-ball CSV.
# yaml provides yaml.load_file(); reshape2 provides melt().
library(yaml)
library(reshape2)

# Forward slashes are used on purpose: inside an R string literal "\U" is an
# invalid escape sequence, so the backslash form of this Windows path does not
# even parse.
data <- yaml.load_file("C:/Users/vsahu/Downloads/mdms/911047.yaml")

# Flatten the nested list into long form. The level columns L1, L2, ... hold
# the key (or list position) at each nesting depth and are used below to pick
# out the individual sections.
x <- melt(data)
y <- data.frame(x)

# "meta" section: drop the level columns that are entirely NA, then widen so
# that each L2 key becomes its own column. drop = FALSE keeps a data frame
# even if only one column survives.
meta <- y[y$L1 == "meta", ]
meta <- meta[, colSums(is.na(meta)) != nrow(meta), drop = FALSE]
data_meta <- reshape(meta, direction = "wide", timevar = "L2", idvar = "L1")

# "info" section: same NA-column clean-up, then drop the constant L1 column.
info <- y[y$L1 == "info", ]
info <- info[, colSums(is.na(info)) != nrow(info), drop = FALSE]
info <- subset(info, select = -c(L1))

# Deliveries: keep only rows below innings -> ... -> deliveries. which() drops
# rows where the comparison is NA instead of returning all-NA rows.
data_innings <- y[which(y$L1 == "innings" & y$L4 == "deliveries"), ]

# Combine the two deepest key levels into one column name, e.g. "runs_total".
data_innings$new <- paste(data_innings$L7, data_innings$L8, sep = "_")
data_innings <- subset(data_innings, select = -c(L7, L8, L4, L1, L5))

# One row per (L2, L3, L6) combination, one column per delivery field.
data_innings <- reshape(
  data_innings,
  idvar = c("L2", "L3", "L6"),
  direction = "wide",
  timevar = "new"
)
write.csv(data_innings, "data_innings.csv", row.names = FALSE)
我已经编辑了上面 Vaibhav 的回答以创建一个函数来读取指定目录中的所有 yaml 文件并将其转换为 csv。它负责处理由重塑引起的多行匹配错误。
#' Collapse a vector into a single "/"-separated string
#'
#' @param x Vector of values to join.
#' @return A length-one character vector containing the elements of `x`
#'   joined by "/".
aggr_fielder <- function(x) {
  joined <- paste(x, collapse = "/")
  joined
}
#' Convert every Cricsheet YAML file in a directory to CSV
#'
#' For each `.yaml` file found in `source`, two CSV files are written: one
#' with the deliveries (one row per L2/L3/L6 combination, one column per
#' delivery field) and one, prefixed with "info-", with the match information.
#' Output file names are built from the match dates and team names.
#'
#' @param source Directory that is scanned for files ending in `.yaml`.
#' @param destination Prefix pasted directly in front of every output file
#'   name. No path separator is inserted, so pass e.g. `"out/"` (with the
#'   trailing slash) to write into a directory.
#' @return `NULL`, invisibly; the function is called for its side effect of
#'   writing CSV files.
convertCricsheetData <- function(source = ".", destination = "") {
  if (!dir.exists(source)) {
    stop("Source directory does not exist: ", source, call. = FALSE)
  }

  # `pattern` is a regular expression: escape the dot and anchor at the end so
  # that only real ".yaml" files match (not e.g. "myyaml.txt").
  all.files <- list.files(
    path = source,
    pattern = "\\.yaml$",
    full.names = TRUE
  )
  if (length(all.files) == 0L) {
    # Nothing to convert; return before touching any package.
    message("No .yaml files found in ", source)
    return(invisible(NULL))
  }

  # library() fails loudly when a package is missing, unlike require().
  # Attach order is kept as before (data.table last).
  library(yaml)
  library(reshape2)
  library(data.table)

  # seq_along() is safe for any length, whereas 1:length(x) yields c(1, 0)
  # for an empty vector.
  for (i in seq_along(all.files)) {
    data <- yaml.load_file(all.files[i])
    # Long form: the level columns L1, L2, ... hold the key at each depth.
    y <- data.table(melt(data))

    # Match information: drop all-NA level columns and the constant L1.
    info <- y[y$L1 == "info", ]
    info <- info[, colSums(is.na(info)) != nrow(info), with = FALSE]
    info[, L1 := NULL]
    info[, match_no := i]

    # Deliveries: combine the two deepest key levels into one column name and
    # spread them out, one row per (L2, L3, L6) combination.
    data_innings <- y[(y$L1 == "innings") & (y$L4 == "deliveries"), ]
    data_innings[, new := paste(data_innings$L7, data_innings$L8, sep = "_")]
    data_innings[, c("L7", "L8", "L4", "L1", "L5") := NULL]
    # aggr_fielder joins several values that land in the same cell with "/",
    # so duplicated keys do not break the reshape.
    data_innings <- dcast(
      data_innings,
      L2 + L3 + L6 ~ new,
      fun.aggregate = aggr_fielder,
      fill = NA
    )
    data_innings[, match_no := i]

    # File names are "<dates>-<teams>.csv" and "info-<dates>-<teams>.csv".
    name_parts <- c(
      info[info$L2 == "dates", ]$value,
      info[info$L2 == "teams", ]$value
    )
    innings_file <- paste0(
      destination, paste(name_parts, collapse = "-"), ".csv"
    )
    info_file <- paste0(
      destination, paste(c("info", name_parts), collapse = "-"), ".csv"
    )
    write.csv(data_innings, innings_file, row.names = FALSE)
    write.csv(info, info_file, row.names = FALSE)
  }

  invisible(NULL)
}
我正在尝试使用下面的命令读取位于此处的 YAML 文件,但两条命令都没有得到所需的输出格式(即类似于此处 CSV 文件的格式)。YAML 文件中数据的说明见此处;如需快速了解 YAML 文件的格式,可以直接参考本文末尾给出的格式。
我尝试使用这些命令加载数据,但没有成功。谁能指导我如何将 YAML 文件中的数据正确加载为 R 数据框(data frame),或按照上面指定的输出格式转换为 csv?
# Attempt 1: parse the YAML file and keep whatever yaml.load_file() returns.
cric <- yaml.load_file("911047.yaml")
# Attempt 2: parse a different file and wrap the parsed result directly in
# data.frame(). Note that this overwrites the `cric` created by attempt 1.
cric <- data.frame(yaml.load_file("211028.yaml"))
我在下面给出了数据的高级格式供您快速参考(抱歉,粘贴到这里时原始 YAML 代码格式已经消失,我想不出一种方法来按原样粘贴和重新格式化):
meta:
data_version: 0.6
created: 2013-02-22
revision: 1
info:
city: Southampton
dates:
- 2005-06-13
match_type: T20
outcome:
by:
runs: 100
winner: England
overs: 20
player_of_match:
- KP Pietersen
teams:
- England
- Australia
toss:
decision: bat
winner: England
umpires:
- NJ Llong
- JW Lloyds
venue: The Rose Bowl
innings:
- 1st innings:
team: England
deliveries:
- 0.1:
batsman: ME Trescothick
bowler: B Lee
non_striker: GO Jones
runs:
batsman: 0
extras: 0
total: 0
可以通过包 reshape2 中的 melt 解决
下面的代码会有所帮助
# Convert a single Cricsheet YAML match file into a ball-by-ball CSV.
# yaml provides yaml.load_file(); reshape2 provides melt().
library(yaml)
library(reshape2)

# Forward slashes are used on purpose: inside an R string literal "\U" is an
# invalid escape sequence, so the backslash form of this Windows path does not
# even parse.
data <- yaml.load_file("C:/Users/vsahu/Downloads/mdms/911047.yaml")

# Flatten the nested list into long form. The level columns L1, L2, ... hold
# the key (or list position) at each nesting depth and are used below to pick
# out the individual sections.
x <- melt(data)
y <- data.frame(x)

# "meta" section: drop the level columns that are entirely NA, then widen so
# that each L2 key becomes its own column. drop = FALSE keeps a data frame
# even if only one column survives.
meta <- y[y$L1 == "meta", ]
meta <- meta[, colSums(is.na(meta)) != nrow(meta), drop = FALSE]
data_meta <- reshape(meta, direction = "wide", timevar = "L2", idvar = "L1")

# "info" section: same NA-column clean-up, then drop the constant L1 column.
info <- y[y$L1 == "info", ]
info <- info[, colSums(is.na(info)) != nrow(info), drop = FALSE]
info <- subset(info, select = -c(L1))

# Deliveries: keep only rows below innings -> ... -> deliveries. which() drops
# rows where the comparison is NA instead of returning all-NA rows.
data_innings <- y[which(y$L1 == "innings" & y$L4 == "deliveries"), ]

# Combine the two deepest key levels into one column name, e.g. "runs_total".
data_innings$new <- paste(data_innings$L7, data_innings$L8, sep = "_")
data_innings <- subset(data_innings, select = -c(L7, L8, L4, L1, L5))

# One row per (L2, L3, L6) combination, one column per delivery field.
data_innings <- reshape(
  data_innings,
  idvar = c("L2", "L3", "L6"),
  direction = "wide",
  timevar = "new"
)
write.csv(data_innings, "data_innings.csv", row.names = FALSE)
我已经编辑了上面 Vaibhav 的回答以创建一个函数来读取指定目录中的所有 yaml 文件并将其转换为 csv。它负责处理由重塑引起的多行匹配错误。
#' Collapse a vector into a single "/"-separated string
#'
#' @param x Vector of values to join.
#' @return A length-one character vector containing the elements of `x`
#'   joined by "/".
aggr_fielder <- function(x) {
  joined <- paste(x, collapse = "/")
  joined
}
#' Convert every Cricsheet YAML file in a directory to CSV
#'
#' For each `.yaml` file found in `source`, two CSV files are written: one
#' with the deliveries (one row per L2/L3/L6 combination, one column per
#' delivery field) and one, prefixed with "info-", with the match information.
#' Output file names are built from the match dates and team names.
#'
#' @param source Directory that is scanned for files ending in `.yaml`.
#' @param destination Prefix pasted directly in front of every output file
#'   name. No path separator is inserted, so pass e.g. `"out/"` (with the
#'   trailing slash) to write into a directory.
#' @return `NULL`, invisibly; the function is called for its side effect of
#'   writing CSV files.
convertCricsheetData <- function(source = ".", destination = "") {
  if (!dir.exists(source)) {
    stop("Source directory does not exist: ", source, call. = FALSE)
  }

  # `pattern` is a regular expression: escape the dot and anchor at the end so
  # that only real ".yaml" files match (not e.g. "myyaml.txt").
  all.files <- list.files(
    path = source,
    pattern = "\\.yaml$",
    full.names = TRUE
  )
  if (length(all.files) == 0L) {
    # Nothing to convert; return before touching any package.
    message("No .yaml files found in ", source)
    return(invisible(NULL))
  }

  # library() fails loudly when a package is missing, unlike require().
  # Attach order is kept as before (data.table last).
  library(yaml)
  library(reshape2)
  library(data.table)

  # seq_along() is safe for any length, whereas 1:length(x) yields c(1, 0)
  # for an empty vector.
  for (i in seq_along(all.files)) {
    data <- yaml.load_file(all.files[i])
    # Long form: the level columns L1, L2, ... hold the key at each depth.
    y <- data.table(melt(data))

    # Match information: drop all-NA level columns and the constant L1.
    info <- y[y$L1 == "info", ]
    info <- info[, colSums(is.na(info)) != nrow(info), with = FALSE]
    info[, L1 := NULL]
    info[, match_no := i]

    # Deliveries: combine the two deepest key levels into one column name and
    # spread them out, one row per (L2, L3, L6) combination.
    data_innings <- y[(y$L1 == "innings") & (y$L4 == "deliveries"), ]
    data_innings[, new := paste(data_innings$L7, data_innings$L8, sep = "_")]
    data_innings[, c("L7", "L8", "L4", "L1", "L5") := NULL]
    # aggr_fielder joins several values that land in the same cell with "/",
    # so duplicated keys do not break the reshape.
    data_innings <- dcast(
      data_innings,
      L2 + L3 + L6 ~ new,
      fun.aggregate = aggr_fielder,
      fill = NA
    )
    data_innings[, match_no := i]

    # File names are "<dates>-<teams>.csv" and "info-<dates>-<teams>.csv".
    name_parts <- c(
      info[info$L2 == "dates", ]$value,
      info[info$L2 == "teams", ]$value
    )
    innings_file <- paste0(
      destination, paste(name_parts, collapse = "-"), ".csv"
    )
    info_file <- paste0(
      destination, paste(c("info", name_parts), collapse = "-"), ".csv"
    )
    write.csv(data_innings, innings_file, row.names = FALSE)
    write.csv(info, info_file, row.names = FALSE)
  }

  invisible(NULL)
}