R 两个因素的字符串计数
R Count of strings by two factors
我需要一些帮助。我有以下 table:
# Example data: 2 country codes x 3 targets, with 2 runs per combination.
country_code <- rep(c(1, 2), each = 6)
target <- rep(c("V1", "V2", "V3"), each = 2, times = 2)
M1 <- c(
  "X7", "X7", "X14", "X14", "X8", "X8",
  "X29", "X22", "X2", "X22", "X22", "X22"
)
M2 <- c(
  "X1", "X1", "X17", "X11", "X21", "X21",
  "X1", "X29", "X8", "X18", "X24", "X24"
)
# M3 holds the literal string "NA" everywhere except row 4.
M3 <- replace(rep("NA", 12), 4, "X1")
# Run number within each country_code/target pair.
CountofRun <- rep(c(1, 2), times = 6)
df <- data.frame(country_code, target, M1, M2, M3, CountofRun)
我想为每个 country_code 和 target 的组合得到一张频率 table。例如,如果 X7 在 country_code=1 且 target=V1 的所有三次运行中都出现,那么 X7 的计数应为 3。如您所见,我只想统计在 country_code 和 target 的 6 种组合中,X1 到 X30 各自在这 3 次运行中出现的次数。我无法把这些值转换为数字。
最终的table,希望是这样的
也许
library(dplyr)
library(tidyr)
# Count how often each value occurs in the M* columns for every
# country_code/target pair and print the result as a flat contingency table.
#
# The long-format data is piped into xtabs() explicitly. When `.` is used
# only inside a nested call, magrittr also passes the data as the first
# argument of the outer function, so the xtabs() result would be ignored.
df %>%
  select(-CountofRun) %>%
  pivot_longer(
    -c(country_code, target),
    names_to = "key",
    values_to = "value"
  ) %>%
  xtabs(~ country_code + target + value, data = .) %>%
  ftable()
给出:
# value NA X1 X11 X14 X17 X18 X2 X21 X22 X24 X29 X7 X8
#country_code target
#1 V1 2 2 0 0 0 0 0 0 0 0 0 2 0
# V2 1 1 1 2 1 0 0 0 0 0 0 0 0
# V3 2 0 0 0 0 0 0 2 0 0 0 0 2
#2 V1 2 1 0 0 0 0 0 0 1 0 2 0 0
# V2 2 0 0 0 0 1 1 0 1 0 0 0 1
# V3 2 0 0 0 0 0 0 0 2 2 0 0 0
这能帮你完成一部分;你现在已经有了计数,剩下的只是格式化:
> library(data.table)
>
> country_code=c(1,1,1,1,1,1,2,2,2,2,2,2)
> target=c('V1','V1','V2','V2','V3','V3','V1','V1','V2','V2','V3','V3')
> M1=c('X7','X7','X14','X14','X8','X8','X29','X22','X2','X22','X22','X22')
> M2=c('X1','X1','X17','X11','X21','X21','X1','X29','X8','X18','X24','X24')
> M3=c('NA','NA','NA','X1','NA','NA','NA','NA','NA','NA','NA','NA')
> CountofRun=c(1,2,1,2,1,2,1,2,1,2,1,2)
> df<-data.table(country_code,target,M1,M2,M3,CountofRun)
>
> # reshape M1..M3 to long format, keeping country_code, target and CountofRun as id columns
> df_m <- melt(df, id.vars = c('country_code', 'target', 'CountofRun'))
>
> # total CountofRun per (country_code, target, value) group; this adds the run numbers (1 + 2 = 3), it is not a row count
> df_count <- df_m[,
+ .(count = sum(CountofRun)),
+ keyby = .(country_code, target, value)
+ ][value != "NA"] # drop groups whose value is the literal string "NA"
>
> df_count
country_code target value count
1: 1 V1 X1 3
2: 1 V1 X7 3
3: 1 V2 X1 2
4: 1 V2 X11 2
5: 1 V2 X14 3
6: 1 V2 X17 1
7: 1 V3 X21 3
8: 1 V3 X8 3
9: 2 V1 X1 1
10: 2 V1 X22 2
11: 2 V1 X29 3
12: 2 V2 X18 2
13: 2 V2 X2 1
14: 2 V2 X22 2
15: 2 V2 X8 1
16: 2 V3 X22 3
17: 2 V3 X24 3
>
一个data.table解决方案(结构与dplyr + tidyr相似,只是语法不同)
# Convert df to a data.table by reference.
setDT(df)
# Drop CountofRun on a copy, reshape the remaining M* columns to long form,
# count rows per country_code/target/value, then spread the counts to one
# column per value (0 where a combination does not occur).
melt(
  df[, !"CountofRun"],
  id.vars = c("country_code", "target")
)[
  , .N, by = .(country_code, target, value)
][
  , dcast(.SD, country_code + target ~ value, value.var = "N", fill = 0)
]
我需要一些帮助。我有以下 table:
# Example data: 2 country codes x 3 targets, with 2 runs per combination.
country_code <- rep(c(1, 2), each = 6)
target <- rep(c("V1", "V2", "V3"), each = 2, times = 2)
M1 <- c(
  "X7", "X7", "X14", "X14", "X8", "X8",
  "X29", "X22", "X2", "X22", "X22", "X22"
)
M2 <- c(
  "X1", "X1", "X17", "X11", "X21", "X21",
  "X1", "X29", "X8", "X18", "X24", "X24"
)
# M3 holds the literal string "NA" everywhere except row 4.
M3 <- replace(rep("NA", 12), 4, "X1")
# Run number within each country_code/target pair.
CountofRun <- rep(c(1, 2), times = 6)
df <- data.frame(country_code, target, M1, M2, M3, CountofRun)
我想为每个 country_code 和 target 的组合得到一张频率 table。例如,如果 X7 在 country_code=1 且 target=V1 的所有三次运行中都出现,那么 X7 的计数应为 3。如您所见,我只想统计在 country_code 和 target 的 6 种组合中,X1 到 X30 各自在这 3 次运行中出现的次数。我无法把这些值转换为数字。
最终的table,希望是这样的
也许
library(dplyr)
library(tidyr)
# Count how often each value occurs in the M* columns for every
# country_code/target pair and print the result as a flat contingency table.
#
# The long-format data is piped into xtabs() explicitly. When `.` is used
# only inside a nested call, magrittr also passes the data as the first
# argument of the outer function, so the xtabs() result would be ignored.
df %>%
  select(-CountofRun) %>%
  pivot_longer(
    -c(country_code, target),
    names_to = "key",
    values_to = "value"
  ) %>%
  xtabs(~ country_code + target + value, data = .) %>%
  ftable()
给出:
# value NA X1 X11 X14 X17 X18 X2 X21 X22 X24 X29 X7 X8
#country_code target
#1 V1 2 2 0 0 0 0 0 0 0 0 0 2 0
# V2 1 1 1 2 1 0 0 0 0 0 0 0 0
# V3 2 0 0 0 0 0 0 2 0 0 0 0 2
#2 V1 2 1 0 0 0 0 0 0 1 0 2 0 0
# V2 2 0 0 0 0 1 1 0 1 0 0 0 1
# V3 2 0 0 0 0 0 0 0 2 2 0 0 0
这能帮你完成一部分;你现在已经有了计数,剩下的只是格式化:
> library(data.table)
>
> country_code=c(1,1,1,1,1,1,2,2,2,2,2,2)
> target=c('V1','V1','V2','V2','V3','V3','V1','V1','V2','V2','V3','V3')
> M1=c('X7','X7','X14','X14','X8','X8','X29','X22','X2','X22','X22','X22')
> M2=c('X1','X1','X17','X11','X21','X21','X1','X29','X8','X18','X24','X24')
> M3=c('NA','NA','NA','X1','NA','NA','NA','NA','NA','NA','NA','NA')
> CountofRun=c(1,2,1,2,1,2,1,2,1,2,1,2)
> df<-data.table(country_code,target,M1,M2,M3,CountofRun)
>
> # reshape M1..M3 to long format, keeping country_code, target and CountofRun as id columns
> df_m <- melt(df, id.vars = c('country_code', 'target', 'CountofRun'))
>
> # total CountofRun per (country_code, target, value) group; this adds the run numbers (1 + 2 = 3), it is not a row count
> df_count <- df_m[,
+ .(count = sum(CountofRun)),
+ keyby = .(country_code, target, value)
+ ][value != "NA"] # drop groups whose value is the literal string "NA"
>
> df_count
country_code target value count
1: 1 V1 X1 3
2: 1 V1 X7 3
3: 1 V2 X1 2
4: 1 V2 X11 2
5: 1 V2 X14 3
6: 1 V2 X17 1
7: 1 V3 X21 3
8: 1 V3 X8 3
9: 2 V1 X1 1
10: 2 V1 X22 2
11: 2 V1 X29 3
12: 2 V2 X18 2
13: 2 V2 X2 1
14: 2 V2 X22 2
15: 2 V2 X8 1
16: 2 V3 X22 3
17: 2 V3 X24 3
>
一个data.table解决方案(结构与dplyr + tidyr相似,只是语法不同)
# Convert df to a data.table by reference.
setDT(df)
# Drop CountofRun on a copy, reshape the remaining M* columns to long form,
# count rows per country_code/target/value, then spread the counts to one
# column per value (0 where a combination does not occur).
melt(
  df[, !"CountofRun"],
  id.vars = c("country_code", "target")
)[
  , .N, by = .(country_code, target, value)
][
  , dcast(.SD, country_code + target ~ value, value.var = "N", fill = 0)
]