R - How can I import a huge .csv with chunks?
I have been trying to import a huge .csv in chunks, with filters, but my code only reads part of the file (20 million rows out of 45 million).
I have also tried data.table(), without success.
library(dplyr)

arq_grande <- file("cnpj_dados_cadastrais_pj.csv", "r")
tam_chunk <- 5000

# Read the first rows to capture the column names
df1 <- read.csv(arq_grande, nrows = 10, header = TRUE, sep = "#", dec = ".")
for (i in 1:ncol(df1)) { df1[, i] <- df1[, i] %>% iconv(from = "UTF-8", to = "latin1") }
# Use %in%, not ==, to match either value
df_filtrado <- df1 %>%
  filter(codigo_natureza_juridica %in% c("2143", "2330")) %>%
  select(cnpj, everything())
write.table(df_filtrado, "/cnpj_dados_cadastrais_pj_filtrado_coop.csv",
            row.names = FALSE, sep = "#", dec = ".")

totalRows <- 0
repeat {
  # The connection stays open, so each read.csv call continues
  # where the previous one stopped
  df <- read.csv(arq_grande, header = FALSE, sep = "#",
                 col.names = names(df1), nrows = tam_chunk)
  for (i in 1:ncol(df)) { df[, i] <- df[, i] %>% iconv(from = "UTF-8", to = "latin1") }
  totalRows <- totalRows + nrow(df)
  cat("Lendo", nrow(df), "linhas, total lido", totalRows, "\n")
  if (nrow(df) == 0)
    break
  df_filtrado <- df %>%
    filter(codigo_natureza_juridica %in% c("2143", "2330")) %>%
    select(cnpj, everything())
  write.table(df_filtrado, "/cnpj_dados_cadastrais_pj_filtrado_coop.csv",
              append = TRUE, col.names = FALSE, row.names = FALSE,
              sep = "#", dec = ".")
}
close(arq_grande)
I have seen other examples here, but none of them worked. Sorry, I am new to this kind of data.
I just want to read all the rows of my .csv file.
You can read a csv file in chunks with readr::read_csv, using the skip and n_max arguments: skip is the number of lines to skip at the start, and n_max is the number of lines to read after that.
library("readr")
# Example uses `#` as the separator
file <- "
lineno#X#Y#Z
1#a#b#c
2#d#e#f
3#g#h#i
4#j#k#l
5#m#n#o
6#p#q#r
7#s#t#u
8#v#w#
9#x#y#z
"
# Increase the chunk size appropriately
chunk_size <- 3
# Assumption: There is a header on the first line
# but we don't know what it is.
col_names <- TRUE
line_num <- 1
while (TRUE) {
  chunk <- read_delim(
    file, "#",
    skip = line_num,
    n_max = chunk_size,
    # On the first iteration, col_names is TRUE,
    # so the first line "lineno#X#Y#Z" is used as the header.
    # On any subsequent iteration, col_names is a character vector
    # of the actual column names.
    col_names = col_names
  )
  # If the chunk has no rows, we have reached the end of the file
  if (!nrow(chunk)) {
    break
  }
  # Do something with the chunk of data
  print(chunk)
  # Update `col_names` so that it is equal to the actual column names
  col_names <- colnames(chunk)
  # Move to the next chunk. Add 1 for the header.
  line_num <- line_num + chunk_size + (line_num == 1)
}
#> # A tibble: 3 x 4
#> lineno X Y Z
#> <dbl> <chr> <chr> <chr>
#> 1 1 a b c
#> 2 2 d e f
#> 3 3 g h i
#> # A tibble: 3 x 4
#> lineno X Y Z
#> <dbl> <chr> <chr> <chr>
#> 1 4 j k l
#> 2 5 m n o
#> 3 6 p q r
#> # A tibble: 3 x 4
#> lineno X Y Z
#> <dbl> <chr> <chr> <chr>
#> 1 7 s t u
#> 2 8 v w <NA>
#> 3 9 x y z
Created on 2019-10-31 by the reprex package (v0.3.0)
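Applied to the file from the question, the same loop can filter each chunk and append it to the output. This is only a sketch under assumptions: the "#" separator, the file name, and the column names (cnpj, codigo_natureza_juridica) are taken from the question, and reading every column as character avoids the types being guessed differently from one chunk to the next.

library(readr)
library(dplyr)

chunk_size <- 100000
col_names <- TRUE   # read the real header on the first pass
line_num <- 0       # a real file has no leading blank line, so start at 0
first_chunk <- TRUE

while (TRUE) {
  chunk <- read_delim(
    "cnpj_dados_cadastrais_pj.csv", "#",
    skip = line_num,
    n_max = chunk_size,
    col_names = col_names,
    # Force character columns so types cannot change between chunks
    col_types = cols(.default = col_character())
  )
  if (!nrow(chunk)) {
    break
  }
  filtrado <- chunk %>%
    filter(codigo_natureza_juridica %in% c("2143", "2330")) %>%
    select(cnpj, everything())
  # write_delim writes the header only when append = FALSE
  write_delim(filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv",
              delim = "#", append = !first_chunk)
  first_chunk <- FALSE
  col_names <- colnames(chunk)
  line_num <- line_num + chunk_size + (line_num == 0)
}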
You can use the fread function from data.table in R to load large .csv files quickly.
Here is an example:
library(data.table)

n = 1e6
DT = data.table(a = sample(1:1000, n, replace = TRUE),
                b = sample(1:1000, n, replace = TRUE),
                c = rnorm(n),
                d = sample(c("foo", "bar", "baz", "qux", "quux"), n, replace = TRUE),
                e = rnorm(n),
                f = sample(1:1000, n, replace = TRUE))
DT[2,b:=NA_integer_]
DT[4,c:=NA_real_]
DT[3,d:=NA_character_]
DT[5,d:=""]
DT[2,e:=+Inf]
DT[3,e:=-Inf]
write.table(DT,"test.csv",sep=",",row.names=FALSE,quote=FALSE)
cat("File size (MB):", round(file.info("test.csv")$size/1024^2),"\n")
# 50 MB (1e6 rows x 6 columns)
system.time(DF1 <-read.csv("test.csv",stringsAsFactors=FALSE))
# 60 sec (first time in fresh R session)
system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE))
# 30 sec (immediate repeat is faster, varies)
system.time(DF2 <- read.table("test.csv",header=TRUE,sep=",",quote="",
stringsAsFactors=FALSE,comment.char="",nrows=n,
colClasses=c("integer","integer","numeric",
"character","numeric","integer")))
Now use fread from data.table:
require(data.table)
system.time(DT <- fread("test.csv"))
# 3 sec (faster and friendlier)
The gains are similar when you scale the data up to gigabytes.
The documentation for the fread function is here:
https://www.rdocumentation.org/packages/data.table/versions/1.12.2/topics/fread
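For the file in the question, fread may remove the need for manual chunking entirely: 45 million rows will often fit in memory when read this way, and fread also accepts skip and nrows arguments if chunked reading is still required. Below is a sketch under the question's assumptions (the "#" separator and the column names cnpj and codigo_natureza_juridica come from the question):

library(data.table)

# Read the whole file in one pass (fread is multi-threaded)
DT <- fread("cnpj_dados_cadastrais_pj.csv", sep = "#", dec = ".",
            colClasses = list(character = "cnpj"))

# Filter and put cnpj first, as in the question
filtrado <- DT[codigo_natureza_juridica %in% c("2143", "2330")]
setcolorder(filtrado, c("cnpj", setdiff(names(filtrado), "cnpj")))

# fwrite is the fast counterpart of write.table
fwrite(filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv", sep = "#")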