从 R 中不常见的 JSON 结构中提取所有信息
Extracting all the information from an uncommon JSON structure in R
在之前的一个帖子中，我问过一种不常见的数据结构（至少在 R 中不常见）。
我有一个结构如下的 txt 文件:
identifier ### part A ### part B
我的真实数据的第一行的简化是这样
1 ### [{"X": "1", "Y": "2", "Z": "3"}, {"X": "4", "Y": "5", "Z": "6"}] ### [{"X": "7", "Y": "8", "Z": "9"}, {"X": "10", "Y": "11", "Z": "12"}, {"X": "13", "Y": "14", "Z": "15"}]
此结构来自公开数据。
我用过这个
setwd("/myfolder")
library(stringi)
library(purrr)
library(jsonlite)
# Read the raw file. Each non-empty line is expected to look like
#   identifier ### part A ### part B
# where part A and part B are each a JSON array of objects.
raw <- readLines("myfile.txt")
raw <- raw[nzchar(trimws(raw))]

# Split every line into its three "###"-separated fields and trim the padding
# around each field. Splitting on the literal separator avoids regex escaping
# altogether and works whether or not the separators are surrounded by spaces.
fields <- map(stri_split_fixed(raw, "###", n = 3L), stri_trim_both)

# Fail early on malformed lines instead of handing NA to the JSON parser.
stopifnot(all(lengths(fields) == 3L))

# Part B is the third field: parse each JSON array into a data frame and
# row-bind the per-line results.
PartB <- map_df(fields, function(f) fromJSON(f[[3]]))
save(PartB, file = "PartB.RData")

# Part A is the second field, handled the same way.
PartA <- map_df(fields, function(f) fromJSON(f[[2]]))
save(PartA, file = "PartA.RData")
这会创建两个数据框
PartA
X Y Z
1 2 3
4 5 6
PartB
X Y Z
7 8 9
10 11 12
13 14 15
我想得到类似下面这样的结果：
PartA
identifier part X Y Z
1 A 1 2 3
1 A 4 5 6
PartB
identifier part X Y Z
1 B 7 8 9
1 B 10 11 12
1 B 13 14 15
任何想法都会有所帮助。非常感谢。
试试这个:
library(stringr)
library(tidyjson)
library(purrr)
library(dplyr)
# Two sample records. Each holds two "###"-separated JSON arrays (part A,
# then part B). The leading identifier is omitted in these samples, so the
# position of each record is used as its identifier in the call below.
line1 <- '### [{"X": "1", "Y": "2", "Z": "3"}, {"X": "4", "Y": "5", "Z": "6"}] ### [{"X": "7", "Y": "8", "Z": "9"}, {"X": "10", "Y": "11", "Z": "12"}, {"X": "13", "Y": "14", "Z": "15"}]'
line2 <- '### [{"X": "2", "Y": "3", "Z": "4"}, {"X": "5", "Y": "6", "Z": "7"}] ### [{"X": "8", "Y": "9", "Z": "10"}, {"X": "11", "Y": "12", "Z": "13"}, {"X": "14", "Y": "15", "Z": "16"}]'
raw <- c(line1, line2)

#' Turn one "###"-delimited record into a tidy table.
#'
#' @param line A single string of the form `[identifier] ### <JSON array> ###
#'   <JSON array> ...`. The text before the first `###` (if any) is ignored.
#' @param id Value written to the `identifier` column of every returned row.
#' @return A tibble with columns `identifier`, `part`, `X`, `Y`, `Z`, one row
#'   per JSON object. `part` is "A" for the first array, "B" for the second,
#'   and so on.
cleanup_line <- function(line, id) {
  line %>%
    # Drop everything up to and including the first separator, so both
    # "### [...]" and "1 ### [...]" leave only the JSON arrays.
    sub("^[^#]*###\\s*", "", .) %>%
    # One string per part; `fixed()` treats the separator as literal text.
    str_split(fixed("###")) %>%
    flatten_chr() %>%
    # Each part becomes one JSON document; `document.id` is its position.
    gather_array() %>%
    spread_all() %>%
    # Upper-case labels to match the requested "A"/"B" part names.
    mutate(identifier = id, part = LETTERS[document.id]) %>%
    select(identifier, part, X, Y, Z) %>%
    as_tibble()
}

# Apply the parser to every record, using the record position as identifier,
# and row-bind the results.
map2_df(raw, seq_along(raw), cleanup_line)
你需要 tidyjson 的开发版本才能使用 spread_all；否则，你可以使用 CRAN 版本中写法更冗长的 spread_values。
在之前的一个帖子中，我问过一种不常见的数据结构（至少在 R 中不常见）。
我有一个结构如下的 txt 文件:
identifier ### part A ### part B
我的真实数据的第一行的简化是这样
1 ### [{"X": "1", "Y": "2", "Z": "3"}, {"X": "4", "Y": "5", "Z": "6"}] ### [{"X": "7", "Y": "8", "Z": "9"}, {"X": "10", "Y": "11", "Z": "12"}, {"X": "13", "Y": "14", "Z": "15"}]
此结构来自公开数据。
我用过这个
setwd("/myfolder")
library(stringi)
library(purrr)
library(jsonlite)
# Read the raw file. Each non-empty line is expected to look like
#   identifier ### part A ### part B
# where part A and part B are each a JSON array of objects.
raw <- readLines("myfile.txt")
raw <- raw[nzchar(trimws(raw))]

# Split every line into its three "###"-separated fields and trim the padding
# around each field. Splitting on the literal separator avoids regex escaping
# altogether and works whether or not the separators are surrounded by spaces.
fields <- map(stri_split_fixed(raw, "###", n = 3L), stri_trim_both)

# Fail early on malformed lines instead of handing NA to the JSON parser.
stopifnot(all(lengths(fields) == 3L))

# Part B is the third field: parse each JSON array into a data frame and
# row-bind the per-line results.
PartB <- map_df(fields, function(f) fromJSON(f[[3]]))
save(PartB, file = "PartB.RData")

# Part A is the second field, handled the same way.
PartA <- map_df(fields, function(f) fromJSON(f[[2]]))
save(PartA, file = "PartA.RData")
这会创建两个数据框
PartA
X Y Z
1 2 3
4 5 6
PartB
X Y Z
7 8 9
10 11 12
13 14 15
我想得到类似下面这样的结果：
PartA
identifier part X Y Z
1 A 1 2 3
1 A 4 5 6
PartB
identifier part X Y Z
1 B 7 8 9
1 B 10 11 12
1 B 13 14 15
任何想法都会有所帮助。非常感谢。
试试这个:
library(stringr)
library(tidyjson)
library(purrr)
library(dplyr)
# Two sample records. Each holds two "###"-separated JSON arrays (part A,
# then part B). The leading identifier is omitted in these samples, so the
# position of each record is used as its identifier in the call below.
line1 <- '### [{"X": "1", "Y": "2", "Z": "3"}, {"X": "4", "Y": "5", "Z": "6"}] ### [{"X": "7", "Y": "8", "Z": "9"}, {"X": "10", "Y": "11", "Z": "12"}, {"X": "13", "Y": "14", "Z": "15"}]'
line2 <- '### [{"X": "2", "Y": "3", "Z": "4"}, {"X": "5", "Y": "6", "Z": "7"}] ### [{"X": "8", "Y": "9", "Z": "10"}, {"X": "11", "Y": "12", "Z": "13"}, {"X": "14", "Y": "15", "Z": "16"}]'
raw <- c(line1, line2)

#' Turn one "###"-delimited record into a tidy table.
#'
#' @param line A single string of the form `[identifier] ### <JSON array> ###
#'   <JSON array> ...`. The text before the first `###` (if any) is ignored.
#' @param id Value written to the `identifier` column of every returned row.
#' @return A tibble with columns `identifier`, `part`, `X`, `Y`, `Z`, one row
#'   per JSON object. `part` is "A" for the first array, "B" for the second,
#'   and so on.
cleanup_line <- function(line, id) {
  line %>%
    # Drop everything up to and including the first separator, so both
    # "### [...]" and "1 ### [...]" leave only the JSON arrays.
    sub("^[^#]*###\\s*", "", .) %>%
    # One string per part; `fixed()` treats the separator as literal text.
    str_split(fixed("###")) %>%
    flatten_chr() %>%
    # Each part becomes one JSON document; `document.id` is its position.
    gather_array() %>%
    spread_all() %>%
    # Upper-case labels to match the requested "A"/"B" part names.
    mutate(identifier = id, part = LETTERS[document.id]) %>%
    select(identifier, part, X, Y, Z) %>%
    as_tibble()
}

# Apply the parser to every record, using the record position as identifier,
# and row-bind the results.
map2_df(raw, seq_along(raw), cleanup_line)
你需要 tidyjson 的开发版本才能使用 spread_all；否则，你可以使用 CRAN 版本中写法更冗长的 spread_values。