如何在 R 中使用 dcast 计算唯一计数
how to calculate unique count using dcast in R
我正在使用 dcast 转置以下 table
date event user_id
25-07-2020 Create 3455
25-07-2020 Visit 3567
25-07-2020 Visit 3567
25-07-2020 Add 3567
25-07-2020 Add 3678
25-07-2020 Add 3678
25-07-2020 Create 3567
24-07-2020 Edit 3871
我正在使用 dcast 转置以将我的事件作为列并计数 user_id
# Asker's original attempt: events become columns, but no fun.aggregate is
# supplied, so nothing here restricts the count to distinct user_ids.
dae_summ <- dcast(ahoy_events, date ~ event, value.var="user_id")
但我得到的并不是唯一的用户 ID 计数:同一个 user_id 被重复计算了多次。怎样才能让同一日期、同一事件下的每个 user_id 只被计数一次?
你可以试试:
library(reshape2)

# Sample data: one row per logged event (date, event type, acting user).
df <- data.frame(
  date = c(rep("25-07-2020", 7L), "24-07-2020"),
  event = c("Create", "Visit", "Visit", "Add", "Add", "Add", "Create", "Edit"),
  user_id = c(3455L, 3567L, 3567L, 3567L, 3678L, 3678L, 3567L, 3871L),
  stringsAsFactors = FALSE
)

# Spread events into columns; each cell holds the number of distinct user_ids
# seen for that date/event pair.
dae_summ <- dcast(
  df,
  date ~ event,
  value.var = "user_id",
  fun.aggregate = function(ids) length(unique(ids))
)
date Add Create Edit Visit
1 24-07-2020 0 0 1 0
2 25-07-2020 2 2 0 1
你的代码产生这个:
date Add Create Edit Visit
1 24-07-2020 0 0 1 0
2 25-07-2020 3 2 0 2
所以还是有区别的
使用 reshape2
包,您可以利用以下内容:
library(reshape2)
数据:
# Sample events as whitespace-separated text; the first line holds the column
# names (date, event, user_id).
zz <- "date event user_id
25-07-2020 Create 3455
25-07-2020 Visit 3567
25-07-2020 Visit 3567
25-07-2020 Add 3567
25-07-2020 Add 3678
25-07-2020 Add 3678
25-07-2020 Create 3567
24-07-2020 Edit 3871"
# Parse the text into a data frame, using the first line as the header.
data <- read.table(text=zz, header = TRUE)
代码:
# Rows are keyed by date (not user_id), so each cell counts the distinct users
# for that date/event pair. dcast() is called directly rather than through
# %>%, because only reshape2 is loaded in this snippet.
dcast(
  data,
  date ~ event,
  value.var = "user_id",
  fun.aggregate = function(x) length(unique(x))
)
输出:
date Add Create Edit Visit
<fctr> <int> <int> <int> <int>
24-07-2020 0 0 1 0
25-07-2020 2 2 0 1
由 reprex package (v0.3.0)
于 2020-07-25 创建
我们可以使用 data.table
中的 uniqueN
library(data.table)
# Count distinct user_ids per date/event pair. value.var is named explicitly
# so dcast() does not have to guess which column holds the values.
# Note: setDT() converts ahoy_events to a data.table by reference.
dcast(setDT(ahoy_events), date ~ event, value.var = "user_id", fun.aggregate = uniqueN)
# date Add Create Edit Visit
#1: 24-07-2020 0 0 1 0
#2: 25-07-2020 2 2 0 1
或使用 tidyr
中的 pivot_wider
并将 values_fn
指定为 n_distinct
library(tidyr)
library(dplyr)
# Pivot events into columns, counting distinct user_ids per date/event pair
# and filling combinations that never occur with 0.
pivot_wider(
  ahoy_events,
  names_from = event,
  values_from = user_id,
  values_fn = list(user_id = n_distinct),
  values_fill = list(user_id = 0)
)
# A tibble: 2 x 5
# date Create Visit Add Edit
# <chr> <int> <int> <int> <int>
#1 25-07-2020 2 1 2 0
#2 24-07-2020 0 0 0 1
数据
# Sample data: one row per logged event (date, event type, acting user).
ahoy_events <- data.frame(
  date = c(rep("25-07-2020", 7L), "24-07-2020"),
  event = c("Create", "Visit", "Visit", "Add", "Add", "Add", "Create", "Edit"),
  user_id = c(3455L, 3567L, 3567L, 3567L, 3678L, 3678L, 3567L, 3871L),
  stringsAsFactors = FALSE
)
使用 reshape
的基础 R 选项
# Step 1: replace user_id with the number of distinct users in its event/date
# group, drop the now-duplicated rows, and reshape to one column per event.
u <- reshape(
  unique(
    transform(
      ahoy_events,
      user_id = ave(user_id, event, date, FUN = function(ids) length(unique(ids)))
    )
  ),
  idvar = "date",
  timevar = "event",
  direction = "wide"
)

# Step 2: date/event combinations that never occur come back as NA; report 0.
out <- u
out[is.na(u)] <- 0
这样
> out
date user_id.Create user_id.Visit user_id.Add user_id.Edit
1 25-07-2020 2 1 2 0
8 24-07-2020 0 0 0 1
数据
# Sample data used by the reshape() example: one row per logged event.
ahoy_events <- structure(list(date = c(
  "25-07-2020", "25-07-2020", "25-07-2020",
  "25-07-2020", "25-07-2020", "25-07-2020", "25-07-2020", "24-07-2020"
), event = c(
  "Create", "Visit", "Visit", "Add", "Add", "Add",
  "Create", "Edit"
), user_id = c(
  3455L, 3567L, 3567L, 3567L, 3678L,
  3678L, 3567L, 3871L
)), class = "data.frame", row.names = c(
  NA,
  -8L
))
我正在使用 dcast 转置以下 table
date event user_id
25-07-2020 Create 3455
25-07-2020 Visit 3567
25-07-2020 Visit 3567
25-07-2020 Add 3567
25-07-2020 Add 3678
25-07-2020 Add 3678
25-07-2020 Create 3567
24-07-2020 Edit 3871
我正在使用 dcast 转置以将我的事件作为列并计数 user_id
# Asker's original attempt: events become columns, but no fun.aggregate is
# supplied, so nothing here restricts the count to distinct user_ids.
dae_summ <- dcast(ahoy_events, date ~ event, value.var="user_id")
但我得到的并不是唯一的用户 ID 计数:同一个 user_id 被重复计算了多次。怎样才能让同一日期、同一事件下的每个 user_id 只被计数一次?
你可以试试:
library(reshape2)

# Sample data: one row per logged event (date, event type, acting user).
df <- data.frame(
  date = c(rep("25-07-2020", 7L), "24-07-2020"),
  event = c("Create", "Visit", "Visit", "Add", "Add", "Add", "Create", "Edit"),
  user_id = c(3455L, 3567L, 3567L, 3567L, 3678L, 3678L, 3567L, 3871L),
  stringsAsFactors = FALSE
)

# Spread events into columns; each cell holds the number of distinct user_ids
# seen for that date/event pair.
dae_summ <- dcast(
  df,
  date ~ event,
  value.var = "user_id",
  fun.aggregate = function(ids) length(unique(ids))
)
date Add Create Edit Visit
1 24-07-2020 0 0 1 0
2 25-07-2020 2 2 0 1
你的代码产生这个:
date Add Create Edit Visit
1 24-07-2020 0 0 1 0
2 25-07-2020 3 2 0 2
所以还是有区别的
使用 reshape2
包,您可以利用以下内容:
library(reshape2)
数据:
# Sample events as whitespace-separated text; the first line holds the column
# names (date, event, user_id).
zz <- "date event user_id
25-07-2020 Create 3455
25-07-2020 Visit 3567
25-07-2020 Visit 3567
25-07-2020 Add 3567
25-07-2020 Add 3678
25-07-2020 Add 3678
25-07-2020 Create 3567
24-07-2020 Edit 3871"
# Parse the text into a data frame, using the first line as the header.
data <- read.table(text=zz, header = TRUE)
代码:
# Rows are keyed by date (not user_id), so each cell counts the distinct users
# for that date/event pair. dcast() is called directly rather than through
# %>%, because only reshape2 is loaded in this snippet.
dcast(
  data,
  date ~ event,
  value.var = "user_id",
  fun.aggregate = function(x) length(unique(x))
)
输出:
date Add Create Edit Visit
<fctr> <int> <int> <int> <int>
24-07-2020 0 0 1 0
25-07-2020 2 2 0 1
由 reprex package (v0.3.0)
于 2020-07-25 创建
我们可以使用 data.table
中的 uniqueN
library(data.table)
# Count distinct user_ids per date/event pair. value.var is named explicitly
# so dcast() does not have to guess which column holds the values.
# Note: setDT() converts ahoy_events to a data.table by reference.
dcast(setDT(ahoy_events), date ~ event, value.var = "user_id", fun.aggregate = uniqueN)
# date Add Create Edit Visit
#1: 24-07-2020 0 0 1 0
#2: 25-07-2020 2 2 0 1
或使用 tidyr
中的 pivot_wider
并将 values_fn
指定为 n_distinct
library(tidyr)
library(dplyr)
# Pivot events into columns, counting distinct user_ids per date/event pair
# and filling combinations that never occur with 0.
pivot_wider(
  ahoy_events,
  names_from = event,
  values_from = user_id,
  values_fn = list(user_id = n_distinct),
  values_fill = list(user_id = 0)
)
# A tibble: 2 x 5
# date Create Visit Add Edit
# <chr> <int> <int> <int> <int>
#1 25-07-2020 2 1 2 0
#2 24-07-2020 0 0 0 1
数据
# Sample data: one row per logged event (date, event type, acting user).
ahoy_events <- data.frame(
  date = c(rep("25-07-2020", 7L), "24-07-2020"),
  event = c("Create", "Visit", "Visit", "Add", "Add", "Add", "Create", "Edit"),
  user_id = c(3455L, 3567L, 3567L, 3567L, 3678L, 3678L, 3567L, 3871L),
  stringsAsFactors = FALSE
)
使用 reshape 的基础 R 选项
# Step 1: replace user_id with the number of distinct users in its event/date
# group, drop the now-duplicated rows, and reshape to one column per event.
u <- reshape(
  unique(
    transform(
      ahoy_events,
      user_id = ave(user_id, event, date, FUN = function(ids) length(unique(ids)))
    )
  ),
  idvar = "date",
  timevar = "event",
  direction = "wide"
)

# Step 2: date/event combinations that never occur come back as NA; report 0.
out <- u
out[is.na(u)] <- 0
这样
> out
date user_id.Create user_id.Visit user_id.Add user_id.Edit
1 25-07-2020 2 1 2 0
8 24-07-2020 0 0 0 1
数据
# Sample data used by the reshape() example: one row per logged event.
ahoy_events <- structure(list(date = c(
  "25-07-2020", "25-07-2020", "25-07-2020",
  "25-07-2020", "25-07-2020", "25-07-2020", "25-07-2020", "24-07-2020"
), event = c(
  "Create", "Visit", "Visit", "Add", "Add", "Add",
  "Create", "Edit"
), user_id = c(
  3455L, 3567L, 3567L, 3567L, 3678L,
  3678L, 3567L, 3871L
)), class = "data.frame", row.names = c(
  NA,
  -8L
))