运行 一个目录中所有文件的 R 脚本,并将输出存储在一个公共数据框中

Run a R script for all files in a directory, and store the outputs in one common data frame

我有一个脚本适用于一个文件。它从 json 文件中获取信息,提取列表和它的子列表 (A),然后是另一个列表 B 和列表 A 的第三个元素。它创建一个包含列表 B 的数据框并将其与一个主文件。最后,它提供了两个数字:列表B中的元素个数和与主文件比较时该列表的匹配元素个数。

但是,我在一个文件夹中有 180 个不同的 json 文件,我需要 运行 所有文件的脚本,并为每个文件构建一个包含结果的数据框。所以最后的结果应该是这样的(注意最后一行的数字是正确的,前两个是虚构的):

我目前的代码如下:

library(rjson)
library(dplyr)
library(tidyverse)

        #load data from file
file <- "./raw_data/whf.json"
json_data <- fromJSON(file = file)
org_name <- json_data$id

        # extract lists and the sublist
usernames <- json_data$twitter   
following <- usernames$following 

        # create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)

        # loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
    names[i] <- following[[i]][3]
}

        # create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"

        # create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"

        # load master file and select the needed columns
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
    select(Name, AKA, Twitter)

        # merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
length(org_list$Twitter)
length(usernames$following)

我的第一次尝试是在开头包含这段代码:

files <- list.files()
for(f in files){

json_data <- fromJSON(file = f)

# the rest of the script for one file here

}

但我不知道如何编写数据框的代码,甚至不知道如何整合这两种想法——工作脚本和文件名循环。我的想法来自 here.

Alvaro Morales 回答后的新代码如下

library(rjson)
library(dplyr)
library(tidyverse)

archivos <- list.files("./raw_data/")
calculate_accounts <- function(archivos){

        #load data from file
path <- paste("./raw_data/", archivos, sep = "")
json_data <- fromJSON(file = path)
org_name <- json_data$id

        # extract lists and the sublist
usernames <- json_data$twitter   
following <- usernames$following 

        # create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)

        # loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
    names[i] <- following[[i]][3]
}

        # create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"

        # create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"

        # load master file and select the needed columns
psa_handles <- read_csv(file = "./psa_handles.csv") %>%
    select(Name, AKA, Twitter)

        # merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)

accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
}

table_psa <- map_dfr(archivos, calculate_accounts)

但是,现在Joining, by = "Twitter"时出现错误,显示为subindex out of limits

Links 到 3 个测试文件放在 raw_data 文件夹中:

https://drive.google.com/file/d/1ilUHwLjgtZCzh0LneIJEhTryrGumDF1V/view?usp=sharing

https://drive.google.com/file/d/1KM3hRZ8DzgPMEsMFmwBdmMNHrPCttuaB/view?usp=sharing

https://drive.google.com/file/d/17cWXJ9ltGXZ6izkgJv0uyNwStrE95_OA/view?usp=sharing

Link 到主文件比较:

https://drive.google.com/file/d/11fOpYFFfHijhZl_CuWHKvkrI7edkpUNQ/view?usp=sharing

<<<<<更新>>>>>>

我正在尝试找到解决方案,我完成了代码工作并提供了一个有效的输出(一个 180x3 数据框),但是应该用 objects [=20] 的值填充的列=] 和 accounts_total_org 显示 NA。检查存储在 objects 中的值时,这些值是正确的(对于最后一次迭代)。所以输出现在是正确的格式,但用 NA 而不是数字。

我真的很接近,但我无法让代码显示正确的数字。我最后一次尝试是:

library(rjson)
library(dplyr)
library(tidyverse)


archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv", show_col_types = FALSE) %>%
  select(Name, AKA, Twitter)

nr_archivos <- length(archivos)
psa_result <- matrix(nrow = nr_archivos, ncol = 3)

# loop for working with all files, one by one
for(f in 1:nr_archivos){

# load file
    json_data <- fromJSON(file = archivos[f])
    org_name <- json_data$id

# extract lists and the sublist
    usernames <- json_data$twitter
    following <- usernames$following

# empty vector
    longitud = length(following)
    names <- vector(length = longitud)

# loop to populate with the third element of each i item of the sublist
    for(i in 1:longitud){
        names[i] <- following[[i]][3]
    }

# convert the list into a data frame
    names_list <- data.frame(sapply(names, c))
    colnames(names_list) <- "usernames"

# applying some format prior to comparison
    org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
    colnames(org_handles) <- "Twitter"
                                        
# merge tables and calculate the results for each iteration
    org_list <- inner_join(psa_handles, org_handles)
    accounts_db_org <- length(org_list$Twitter)
    accounts_total_org <- length(usernames$following)

# populate the matrix row by row
psa_result[f] <- c(org_name, accounts_db_org, accounts_total_org)
}

# create a data frame from the matrix and save the result
psa_result <- data.frame(psa_result)
write_csv(psa_result, file = "./outputs/cuentas_seguidas_en_psa.csv")

subscript out of bounds 错误是由包含 0 条记录的 json 文件引起的。已修复删除文件。

您可以使用 purrr::mappurrr::map_dfr

这是您要找的吗?

archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)

# load master file and select the needed columns. This needs to be out of "calculate_accounts" because you only read it once. 
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
  select(Name, AKA, Twitter)

# calculate accounts
calculate_accounts <- function(archivo){
  
   json_data <- rjson::fromJSON(file = archivo)
  
  org_handles <- json_data %>%
    pluck("twitter", "following") %>%
    map_chr("username") %>% 
    as_tibble() %>% 
    rename(usernames = value) %>% 
    mutate(Twitter = str_c("@", usernames)) %>% 
    select(Twitter)
  
  org_list <- inner_join(psa_handles, org_handles)
  
  org_list %>% 
    mutate(accounts_db_org = length(Twitter),
           accounts_total_org = nrow(org_handles)) %>% 
    select(-Twitter)
}

table_psa <- map_dfr(archivos, calculate_accounts)

#output:
# A tibble: 53 x 4
   Name                                                                       AKA   accounts_db_org accounts_total_org
   <chr>                                                                      <chr>           <int>              <int>
 1 Association of American Medical Colleges                                   AAMC               20               2924
 2 American College of Cardiology                                             ACC                20               2924
 3 American Heart Association                                                 AHA                20               2924
 4 British Association of Dermatologists                                      BAD                20               2924
 5 Canadian Psoriasis Network                                                 CPN                20               2924
 6 Canadian Skin Patient Alliance                                             CSPA               20               2924
 7 European Academy of Dermatology and Venereology                            EADV               20               2924
 8 European Society for Dermatological Research                               ESDR               20               2924
 9 US Department of Health and Human Service                                  HHS                20               2924
10 International Alliance of Dermatology Patients Organisations (Global Skin) IADPO              20               2924
# ... with 43 more rows

不幸的是,Álvaro 提供的答案没有按预期工作,因为输出重复相同的数字和不同的组织名称,因此很难阅读。实际上,数字 20 重复了 20 次,数字 11 重复了 11 次,依此类推。信息在那里,但如果不进行进一步的数据处理,则无法访问。

同时我正在做自己的研究,我得到了以下代码。终于我成功了,但是数据格式是"matrix" "array",真的很混乱。幸运的是,我写了最后几行来转置数据,取消列出数组并转换为矩阵,它可以在数据框中转换并照常操作。

也许我的解释不是很有用,而且由于我是新手,我敢肯定代码还远远不够优雅和优化。不管怎样,请查看下面的代码:

library(purrr)
library(rjson)
library(dplyr)
library(tidyverse)
setwd("~/documentos/varios/proyectos/programacion/R/psa_twitter")

                                        # Load data from files.
archivos <- list.files("./raw_data/json_files",
                       pattern = ".json",
                       full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
    select(Name, AKA, Twitter)

nr_archivos <- length(archivos)

calcula_cuentas <- function(a){
                                        # Extract lists
    json_data <- fromJSON(file = a)
    org_aka <- json_data$id
    org_meta <- json_data$metadata
    org_name <- org_meta$company

    twitter <- json_data$twitter
    following <- twitter$following
                                    # create an empty vector to populate
    longitud = length(following)
    names <- vector(length = longitud)
    
# loop to populate the empty vector with third element of the sub-list
    for(i in 1:longitud){
        names[i] <- following[[i]][3]
    }
                            # create a data frame and change column name
    names_list <- data.frame(sapply(names, c))
    colnames(names_list) <- "usernames"

# Create a data frame with the correct formatting ready to comparison
    org_handles <- data.frame(paste("@",
                                    names_list$usernames,
                                    sep="")
                              )
    colnames(org_handles) <- "Twitter"
    
                                        # merge tables
    org_list <- inner_join(psa_handles, org_handles)
    cuentas_db_org <- length(org_list$Twitter)
    cuentas_total_org <- length(twitter$following)
    results <- data.frame(Name = org_name,
                            AKA = org_aka,
                            Cuentas_db = cuentas_db_org,
                            Total = cuentas_total_org)
    results
}

        # apply function to list of files and unlist the result
psa <- sapply(archivos, calcula_cuentas)
psa1 <- t(as.data.frame(psa))
psa2 <- matrix(unlist(psa1), ncol = 4) %>%
    as.data.frame()
colnames(psa2) <- c("Name", "AKA", "tw_int_outbound", "tw_ext_outbound")

         # Save the results.
saveRDS(psa2, file = "rda/psa.RDS")