是否可以在 R 中处理文件读取和解析
is it possible to process file reading and parsing in R
目录中有一堆文件,每一行都有 json 格式的条目。文件的大小从 5k 到 200MB 不等。我有这段代码遍历每个文件,解析我在 json 中寻找的数据,最后形成一个数据框。此脚本需要很长时间才能完成,实际上它永远不会完成。
有什么方法可以加快它的速度,以便我可以更快地读取文件吗?
代码:
library(jsonlite)
library(data.table)
# Script from the question: read every line-delimited JSON file under
# "Json_files" and accumulate four payload fields into one data frame `df`.

# Relative paths below (e.g. "Json_files") are resolved against this directory.
setwd("C:/Files/")
#data <- lapply(readLines("test.txt"), fromJSON)
# Empty accumulator with four factor columns. Its first column is spelled
# `Timestamp`, while the per-file frames built below use `TimeStamp`.
# NOTE(review): confirm how rbind() treats this zero-row frame and the
# differing column name on the first iteration.
df<-data.frame(Timestamp=factor(),Source=factor(),Host=factor(),Status=factor())
# NOTE(review): `pattern` is matched as a regular expression rather than a
# shell glob -- confirm "*.txt" selects the intended files.
filenames <- list.files("Json_files", pattern="*.txt", full.names=TRUE)
for(i in filenames){
# Progress output: show which file is being processed.
print(i)
# One fromJSON() call per line of the file; `data` is a list of parsed records.
data <- lapply(readLines(i), fromJSON)
# Build a separate data frame for every record from its `payloadData` fields,
# then stack all of them with a single rbind() call.
myDf <- do.call("rbind", lapply(data, function(d) {
data.frame(TimeStamp = d$payloadData$timestamp,
Source = d$payloadData$source,
Host = d$payloadData$host,
Status = d$payloadData$status)}))
# Append this file's rows to the accumulator. `df` is rebuilt by rbind() on
# every iteration, so the amount of data copied grows with each file.
df<-rbind(df,myDf)
}
这是一个示例条目,但文件中有数千个这样的条目:
{"senderDateTimeStamp":"2016/04/08 10:53:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB01","servermember":"test"},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
{"senderDateTimeStamp":"2016/04/08 10:54:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
{"senderDateTimeStamp":"2016/04/08 10:55:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
一种加速方法是将 for 循环替换为 lapply,然后删除最后的 rbind。这里的加速来自于:R 不必在处理这一堆文件的过程中反复复制一个越来越大的对象 df。结果将存储在一个方便的列表中,您可以按原样使用,或一次性转换为 data.frame:
# Read one file of line-delimited JSON and collect the payload fields of
# every record into a single data frame.
#
# i: path of the file to read.
# Value (returned invisibly): the row-wise combination of one data frame per
# record, with columns TimeStamp, Source, Host and Status taken from the
# record's `payloadData`.
getData <- function(i) {
  print(i)  # progress output: show which file is being processed

  # Turn a single parsed record into a data frame of its payload fields.
  payloadRow <- function(rec) {
    payload <- rec$payloadData
    data.frame(
      TimeStamp = payload$timestamp,
      Source = payload$source,
      Host = payload$host,
      Status = payload$status
    )
  }

  # Parse each line on its own, then stack the per-record frames in one call.
  records <- lapply(readLines(i), fromJSON)
  invisible(do.call("rbind", lapply(records, payloadRow)))
}

# Run getData over every file; the result is a list holding one data frame
# per file, so no accumulator has to be re-copied between files.
myDataList <- lapply(filenames, getData)
使用 "c:/tmp.txt" 中的示例数据:
> df <- jsonlite::fromJSON(paste0("[",paste0(readLines("c:/tmp.txt"),collapse=","),"]"))$payloadData[c("timestamp","source","host","status")]
> df
timestamp source host status
1 2016-04-08T10:53:18.169 STREAM WEB01 get
2 2016-04-08T10:53:18.169 STREAM WEB02 get
3 2016-04-08T10:53:18.169 STREAM WEB02 get
因此调整您的代码以获取数据帧列表:
# Build one data frame per file by parsing each file with a single
# fromJSON() call instead of one call per line.
#
# Each file is expected to hold one JSON object per line. The lines are
# joined with commas and wrapped in "[" ... "]" so that they form one JSON
# array; parsing that array yields a data frame whose nested `payloadData`
# column is then reduced to the four fields of interest.
dflist <- lapply(filenames, function(i) {
  json_lines <- readLines(i)
  # Drop blank / whitespace-only lines: joining them would put ",," inside
  # the array, which is not valid JSON and would abort the parse.
  json_lines <- json_lines[nzchar(trimws(json_lines))]
  jsonlite::fromJSON(
    paste0("[", paste0(json_lines, collapse = ","), "]")
  )$payloadData[c("timestamp", "source", "host", "status")]
})
思路是把您的各行(来自 readLines)拼接成一个大的 json 数组,然后将其作为 json 解析来创建数据框。
正如 lmo 已经展示的那样,对你的文件列表使用 lapply 会得到一个数据框列表。如果你最终确实只想要一个数据框,可以加载 data.table 包,然后对 dflist 使用 rbindlist,把它合并成单个数据框。
或者,如果您的内存不足,this thread 可能会对您有所帮助。
目录中有一堆文件,每一行都有 json 格式的条目。文件的大小从 5k 到 200MB 不等。我有这段代码遍历每个文件,解析我在 json 中寻找的数据,最后形成一个数据框。此脚本需要很长时间才能完成,实际上它永远不会完成。
有什么方法可以加快它的速度,以便我可以更快地读取文件吗?
代码:
library(jsonlite)
library(data.table)
# Script from the question: read every line-delimited JSON file under
# "Json_files" and accumulate four payload fields into one data frame `df`.

# Relative paths below (e.g. "Json_files") are resolved against this directory.
setwd("C:/Files/")
#data <- lapply(readLines("test.txt"), fromJSON)
# Empty accumulator with four factor columns. Its first column is spelled
# `Timestamp`, while the per-file frames built below use `TimeStamp`.
# NOTE(review): confirm how rbind() treats this zero-row frame and the
# differing column name on the first iteration.
df<-data.frame(Timestamp=factor(),Source=factor(),Host=factor(),Status=factor())
# NOTE(review): `pattern` is matched as a regular expression rather than a
# shell glob -- confirm "*.txt" selects the intended files.
filenames <- list.files("Json_files", pattern="*.txt", full.names=TRUE)
for(i in filenames){
# Progress output: show which file is being processed.
print(i)
# One fromJSON() call per line of the file; `data` is a list of parsed records.
data <- lapply(readLines(i), fromJSON)
# Build a separate data frame for every record from its `payloadData` fields,
# then stack all of them with a single rbind() call.
myDf <- do.call("rbind", lapply(data, function(d) {
data.frame(TimeStamp = d$payloadData$timestamp,
Source = d$payloadData$source,
Host = d$payloadData$host,
Status = d$payloadData$status)}))
# Append this file's rows to the accumulator. `df` is rebuilt by rbind() on
# every iteration, so the amount of data copied grows with each file.
df<-rbind(df,myDf)
}
这是一个示例条目,但文件中有数千个这样的条目:
{"senderDateTimeStamp":"2016/04/08 10:53:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB01","servermember":"test"},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
{"senderDateTimeStamp":"2016/04/08 10:54:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
{"senderDateTimeStamp":"2016/04/08 10:55:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
一种加速方法是将 for 循环替换为 lapply,然后删除最后的 rbind。这里的加速来自于:R 不必在处理这一堆文件的过程中反复复制一个越来越大的对象 df。结果将存储在一个方便的列表中,您可以按原样使用,或一次性转换为 data.frame:
# Read one file of line-delimited JSON and collect the payload fields of
# every record into a single data frame.
#
# i: path of the file to read.
# Value (returned invisibly): the row-wise combination of one data frame per
# record, with columns TimeStamp, Source, Host and Status taken from the
# record's `payloadData`.
getData <- function(i) {
  print(i)  # progress output: show which file is being processed

  # Turn a single parsed record into a data frame of its payload fields.
  payloadRow <- function(rec) {
    payload <- rec$payloadData
    data.frame(
      TimeStamp = payload$timestamp,
      Source = payload$source,
      Host = payload$host,
      Status = payload$status
    )
  }

  # Parse each line on its own, then stack the per-record frames in one call.
  records <- lapply(readLines(i), fromJSON)
  invisible(do.call("rbind", lapply(records, payloadRow)))
}

# Run getData over every file; the result is a list holding one data frame
# per file, so no accumulator has to be re-copied between files.
myDataList <- lapply(filenames, getData)
使用 "c:/tmp.txt" 中的示例数据:
> df <- jsonlite::fromJSON(paste0("[",paste0(readLines("c:/tmp.txt"),collapse=","),"]"))$payloadData[c("timestamp","source","host","status")]
> df
timestamp source host status
1 2016-04-08T10:53:18.169 STREAM WEB01 get
2 2016-04-08T10:53:18.169 STREAM WEB02 get
3 2016-04-08T10:53:18.169 STREAM WEB02 get
因此调整您的代码以获取数据帧列表:
# Build one data frame per file by parsing each file with a single
# fromJSON() call instead of one call per line.
#
# Each file is expected to hold one JSON object per line. The lines are
# joined with commas and wrapped in "[" ... "]" so that they form one JSON
# array; parsing that array yields a data frame whose nested `payloadData`
# column is then reduced to the four fields of interest.
dflist <- lapply(filenames, function(i) {
  json_lines <- readLines(i)
  # Drop blank / whitespace-only lines: joining them would put ",," inside
  # the array, which is not valid JSON and would abort the parse.
  json_lines <- json_lines[nzchar(trimws(json_lines))]
  jsonlite::fromJSON(
    paste0("[", paste0(json_lines, collapse = ","), "]")
  )$payloadData[c("timestamp", "source", "host", "status")]
})
思路是把您的各行(来自 readLines)拼接成一个大的 json 数组,然后将其作为 json 解析来创建数据框。
正如 lmo 已经展示的那样,对你的文件列表使用 lapply 会得到一个数据框列表。如果你最终确实只想要一个数据框,可以加载 data.table 包,然后对 dflist 使用 rbindlist,把它合并成单个数据框。
或者,如果您的内存不足,this thread 可能会对您有所帮助。