将数据框中每四行的系列折叠成一个向量,覆盖缺失值
Collapse every series of four rows in a data frame into a single vector, overwriting missing values
我想分析网站上的一些货币交易数据,但这些数据只能通过复制和粘贴访问。我将它复制到我的计算机的剪贴板并通过以下方式将其导入 R:
#df <- read.table("clipboard", header = FALSE, sep = "\t", stringsAsFactors = FALSE, na.strings = "", fill = TRUE)
但是数据框在读入 R 时将单个观察值分成四行:
# Example input as it arrives from the clipboard: every transaction occupies
# four consecutive rows, and V2/V3 are only filled on the "Completed" row.
df <- structure(
  list(
    V1 = c(
      "Buy", "Completed", "Fee1.00 USD", "Total199.00 USD",
      "Buy", "Completed", "Fee0.50 USD", "Total100.00 USD",
      "Buy", "Completed", "Fee0.64 USD", "Total127.00 USD"
    ),
    V2 = c(
      NA, "2021-02-11 20:49:19", NA, NA,
      NA, "2021-02-11 20:48:03", NA, NA,
      NA, "2021-02-11 20:47:22", NA, NA
    ),
    V3 = c(
      NA, "0.11057", NA, NA,
      NA, "82.146", NA, NA,
      NA, "30.15", NA, NA
    )
  ),
  row.names = c(NA, 12L),
  class = "data.frame"
)
df
# V1 V2 V3
#1 Buy <NA> <NA>
#2 Completed 2021-02-11 20:49:19 0.11057
#3 Fee1.00 USD <NA> <NA>
#4 Total199.00 USD <NA> <NA>
#5 Buy <NA> <NA>
#6 Completed 2021-02-11 20:48:03 82.146
#7 Fee0.50 USD <NA> <NA>
#8 Total100.00 USD <NA> <NA>
#9 Buy <NA> <NA>
#10 Completed 2021-02-11 20:47:22 30.15
#11 Fee0.64 USD <NA> <NA>
#12 Total127.00 USD <NA> <NA>
因此,我想将每一系列的四行折叠成一行,这样会覆盖作为数据导入过程的怪癖而生成的缺失值:
# Target layout: one row per transaction, six columns. Factor levels are given
# explicitly so the object does not depend on locale-specific sorting.
want <- data.frame(
  V1 = factor(rep("Buy", 3L)),
  V2 = factor(rep("Completed", 3L)),
  V3 = factor(
    c("2/11/2021 20:49", "2/11/2021 20:48", "2/11/2021 20:47"),
    levels = c("2/11/2021 20:47", "2/11/2021 20:48", "2/11/2021 20:49")
  ),
  V4 = c(0.11057, 82.146, 30.15),
  V5 = factor(
    c("Fee1.00 USD", "Fee0.50 USD", "Fee0.64 USD"),
    levels = c("Fee0.50 USD", "Fee0.64 USD", "Fee1.00 USD")
  ),
  V6 = factor(
    c("Total199.00 USD", "Total100.00 USD", "Total127.00 USD"),
    levels = c("Total100.00 USD", "Total127.00 USD", "Total199.00 USD")
  )
)
want
# V1 V2 V3 V4 V5 V6
#1 Buy Completed 2/11/2021 20:49 0.11057 Fee1.00 USD Total199.00 USD
#2 Buy Completed 2/11/2021 20:48 82.14600 Fee0.50 USD Total100.00 USD
#3 Buy Completed 2/11/2021 20:47 30.15000 Fee0.64 USD Total127.00 USD
显然,事情仍然有点混乱,因为我需要将一些字符串分成单独的列(例如 df$V5 = "Fee1.00 USD" 将变为 df$Fee = 1.00),但是那是另一个问题。
我试过添加一个 id 变量,然后将数据从长格式重塑为宽格式(如此处所讨论的),但这样反而更混乱:它把我需要的值(例如“Fee1.00 USD”中的 1.00)变成了新的列名:
# Attempt 1: give every run of four rows a shared id, then reshape to wide
# using the text in V1 as the "time" variable.
df[["id"]] <- gl(n = nrow(df) / 4, k = 4)
reshape(df, direction = "wide", idvar = "id", timevar = "V1")
我已经尝试将数据框拆分为数据框列表(如此处所讨论的),但仍然不确定如何折叠每个数据框并把它们重新拼接在一起:
# Attempt 2: one data frame per observation id.
split(df, df[["id"]])
将数据转换为正确格式的最佳方法是什么?
这个怎么样:
library(dplyr)
library(tidyr)

# Every observation must span exactly four input rows.
stopifnot(nrow(df) %% 4 == 0)

# Tag each run of four consecutive rows with one observation number.
df <- df %>%
  mutate(obs = rep(seq_len(n() / 4), each = 4))

# Go long (dropping the NA cells), number the surviving values within each
# observation in reading order, then spread them back out as V1, V2, ...
df <- df %>%
  pivot_longer(-obs, names_to = "var", values_to = "vals",
               values_drop_na = TRUE) %>%
  group_by(obs) %>%
  mutate(col = row_number()) %>%
  ungroup() %>%  # do not leave the result grouped
  select(obs, col, vals) %>%
  pivot_wider(names_from = "col", names_prefix = "V", values_from = "vals")
df
# # A tibble: 3 x 7
#     obs V1    V2        V3                  V4      V5          V6
#   <int> <chr> <chr>     <chr>               <chr>   <chr>       <chr>
# 1     1 Buy   Completed 2021-02-11 20:49:19 0.11057 Fee1.00 USD Total199.00 USD
# 2     2 Buy   Completed 2021-02-11 20:48:03 82.146  Fee0.50 USD Total100.00 USD
# 3     3 Buy   Completed 2021-02-11 20:47:22 30.15   Fee0.64 USD Total127.00 USD
Dave 的回答完美且简洁。如果有人出于某种原因无法使用外部包,下面是我以他的脚本为参考、用 base R 写的一个版本:
复制它
# Base R version of the long -> number -> wide approach.
value_cols <- names(df)[1:3]

# Observation id (one per run of four rows) plus the original row position,
# which is needed later to restore reading order.
df$id <- gl(nrow(df) / 4, 4)
df$row <- seq_len(nrow(df))

# Stack the value columns. `id` is not unique per row, so the default row
# names (id.time) would collide; supply one name per output row, sized from
# the data instead of a fixed 1:1000.
long <- reshape(df, idvar = "id",
                v.names = "val",
                timevar = "var",
                times = value_cols,
                varying = value_cols,
                new.row.names = seq_len(nrow(df) * length(value_cols)),
                direction = "long")
long <- na.omit(long)

# Reading order: original row first, then source column within the row.
long <- long[order(long$row, match(long$var, value_cols)), ]

# Position of each surviving value within its observation.
long$col <- ave(seq_len(nrow(long)), long$id, FUN = seq_along)

df <- reshape(long[c("id", "col", "val")],
              timevar = "col",
              idvar = "id",
              direction = "wide")

# Columns are already in reading order, so name them V1, V2, ... directly.
colnames(df) <- c("id", paste0("V", seq_len(ncol(df) - 1L)))
df
我想分析网站上的一些货币交易数据,但这些数据只能通过复制和粘贴访问。我将它复制到我的计算机的剪贴板并通过以下方式将其导入 R:
#df <- read.table("clipboard", header = FALSE, sep = "\t", stringsAsFactors = FALSE, na.strings = "", fill = TRUE)
但是数据框在读入 R 时将单个观察值分成四行:
# Example input as it arrives from the clipboard: every transaction occupies
# four consecutive rows, and V2/V3 are only filled on the "Completed" row.
df <- structure(
  list(
    V1 = c(
      "Buy", "Completed", "Fee1.00 USD", "Total199.00 USD",
      "Buy", "Completed", "Fee0.50 USD", "Total100.00 USD",
      "Buy", "Completed", "Fee0.64 USD", "Total127.00 USD"
    ),
    V2 = c(
      NA, "2021-02-11 20:49:19", NA, NA,
      NA, "2021-02-11 20:48:03", NA, NA,
      NA, "2021-02-11 20:47:22", NA, NA
    ),
    V3 = c(
      NA, "0.11057", NA, NA,
      NA, "82.146", NA, NA,
      NA, "30.15", NA, NA
    )
  ),
  row.names = c(NA, 12L),
  class = "data.frame"
)
df
# V1 V2 V3
#1 Buy <NA> <NA>
#2 Completed 2021-02-11 20:49:19 0.11057
#3 Fee1.00 USD <NA> <NA>
#4 Total199.00 USD <NA> <NA>
#5 Buy <NA> <NA>
#6 Completed 2021-02-11 20:48:03 82.146
#7 Fee0.50 USD <NA> <NA>
#8 Total100.00 USD <NA> <NA>
#9 Buy <NA> <NA>
#10 Completed 2021-02-11 20:47:22 30.15
#11 Fee0.64 USD <NA> <NA>
#12 Total127.00 USD <NA> <NA>
因此,我想将每一系列的四行折叠成一行,这样会覆盖作为数据导入过程的怪癖而生成的缺失值:
# Target layout: one row per transaction, six columns. Factor levels are given
# explicitly so the object does not depend on locale-specific sorting.
want <- data.frame(
  V1 = factor(rep("Buy", 3L)),
  V2 = factor(rep("Completed", 3L)),
  V3 = factor(
    c("2/11/2021 20:49", "2/11/2021 20:48", "2/11/2021 20:47"),
    levels = c("2/11/2021 20:47", "2/11/2021 20:48", "2/11/2021 20:49")
  ),
  V4 = c(0.11057, 82.146, 30.15),
  V5 = factor(
    c("Fee1.00 USD", "Fee0.50 USD", "Fee0.64 USD"),
    levels = c("Fee0.50 USD", "Fee0.64 USD", "Fee1.00 USD")
  ),
  V6 = factor(
    c("Total199.00 USD", "Total100.00 USD", "Total127.00 USD"),
    levels = c("Total100.00 USD", "Total127.00 USD", "Total199.00 USD")
  )
)
want
# V1 V2 V3 V4 V5 V6
#1 Buy Completed 2/11/2021 20:49 0.11057 Fee1.00 USD Total199.00 USD
#2 Buy Completed 2/11/2021 20:48 82.14600 Fee0.50 USD Total100.00 USD
#3 Buy Completed 2/11/2021 20:47 30.15000 Fee0.64 USD Total127.00 USD
显然,事情仍然有点混乱,因为我需要将一些字符串分成单独的列(例如 df$V5 = "Fee1.00 USD" 将变为 df$Fee = 1.00),但是那是另一个问题。
我试过添加一个 id 变量,然后将数据从长格式重塑为宽格式(如此处所讨论的),但这样反而更混乱:它把我需要的值(例如“Fee1.00 USD”中的 1.00)变成了新的列名:
# Attempt 1: give every run of four rows a shared id, then reshape to wide
# using the text in V1 as the "time" variable.
df[["id"]] <- gl(n = nrow(df) / 4, k = 4)
reshape(df, direction = "wide", idvar = "id", timevar = "V1")
我已经尝试将数据框拆分为数据框列表(如此处所讨论的),但仍然不确定如何折叠每个数据框并把它们重新拼接在一起:
# Attempt 2: one data frame per observation id.
split(df, df[["id"]])
将数据转换为正确格式的最佳方法是什么?
这个怎么样:
library(dplyr)
library(tidyr)

# Every observation must span exactly four input rows.
stopifnot(nrow(df) %% 4 == 0)

# Tag each run of four consecutive rows with one observation number.
df <- df %>%
  mutate(obs = rep(seq_len(n() / 4), each = 4))

# Go long (dropping the NA cells), number the surviving values within each
# observation in reading order, then spread them back out as V1, V2, ...
df <- df %>%
  pivot_longer(-obs, names_to = "var", values_to = "vals",
               values_drop_na = TRUE) %>%
  group_by(obs) %>%
  mutate(col = row_number()) %>%
  ungroup() %>%  # do not leave the result grouped
  select(obs, col, vals) %>%
  pivot_wider(names_from = "col", names_prefix = "V", values_from = "vals")
df
# # A tibble: 3 x 7
#     obs V1    V2        V3                  V4      V5          V6
#   <int> <chr> <chr>     <chr>               <chr>   <chr>       <chr>
# 1     1 Buy   Completed 2021-02-11 20:49:19 0.11057 Fee1.00 USD Total199.00 USD
# 2     2 Buy   Completed 2021-02-11 20:48:03 82.146  Fee0.50 USD Total100.00 USD
# 3     3 Buy   Completed 2021-02-11 20:47:22 30.15   Fee0.64 USD Total127.00 USD
Dave 的回答完美且简洁。如果有人出于某种原因无法使用外部包,下面是我以他的脚本为参考、用 base R 写的一个版本:
# Base R version of the long -> number -> wide approach.
value_cols <- names(df)[1:3]

# Observation id (one per run of four rows) plus the original row position,
# which is needed later to restore reading order.
df$id <- gl(nrow(df) / 4, 4)
df$row <- seq_len(nrow(df))

# Stack the value columns. `id` is not unique per row, so the default row
# names (id.time) would collide; supply one name per output row, sized from
# the data instead of a fixed 1:1000.
long <- reshape(df, idvar = "id",
                v.names = "val",
                timevar = "var",
                times = value_cols,
                varying = value_cols,
                new.row.names = seq_len(nrow(df) * length(value_cols)),
                direction = "long")
long <- na.omit(long)

# Reading order: original row first, then source column within the row.
long <- long[order(long$row, match(long$var, value_cols)), ]

# Position of each surviving value within its observation.
long$col <- ave(seq_len(nrow(long)), long$id, FUN = seq_along)

df <- reshape(long[c("id", "col", "val")],
              timevar = "col",
              idvar = "id",
              direction = "wide")

# Columns are already in reading order, so name them V1, V2, ... directly.
colnames(df) <- c("id", paste0("V", seq_len(ncol(df) - 1L)))
df