如何为组 Y 中的唯一 X 值创建索引变量?
How do I create an index variable for unique values of X within a group Y?
我有以下 table:
id_question id_event num_events
2015012713 49508 1
2015012711 49708 1
2015011523 41808 3
2015011523 44008 3
2015011523 44108 3
2015011522 41508 3
2015011522 43608 3
2015011522 43708 3
2015011521 39708 1
2015011519 44208 1
第三列按问题给出事件数。我想创建一个变量,该变量仅在每个问题有多个事件的情况下按问题对事件进行索引。它看起来像这样:
id_question id_event num_events index_event
2015012713 49508 1
2015012711 49708 1
2015011523 41808 3 1
2015011523 44008 3 2
2015011523 44108 3 3
2015011522 41508 3 1
2015011522 43608 3 2
2015011522 43708 3 3
2015011521 39708 1
2015011519 44208 1
我该怎么做?
我们可以使用 tidyverse:按 'id_question' 分组后创建 'index_event'。如果组内行数大于 1(`n() > 1`),则取组内的行序号(`row_number()`);`case_when` 在没有任何条件匹配时默认返回 `NA`。
library(dplyr)
# Number the rows inside each id_question group. The only case_when() branch
# fires when the group has more than one row; otherwise no branch matches and
# case_when() yields NA.
mutate(
  group_by(df1, id_question),
  index_event = case_when(n() > 1 ~ row_number())
)
# A tibble: 10 x 4
# Groups: id_question [6]
# id_question id_event num_events index_event
# <int> <int> <int> <int>
# 1 2015012713 49508 1 NA
# 2 2015012711 49708 1 NA
# 3 2015011523 41808 3 1
# 4 2015011523 44008 3 2
# 5 2015011523 44108 3 3
# 6 2015011522 41508 3 1
# 7 2015011522 43608 3 2
# 8 2015011522 43708 3 3
# 9 2015011521 39708 1 NA
#10 2015011519 44208 1 NA
或者使用 data.table:对 'id_question' 使用 `rowid` 得到组内序号,再乘以 `NA^(num_events == 1)`,把 'num_events' 为 1 的元素变为 `NA`(利用 `NA^0` 等于 1、`NA^1` 等于 `NA`)。
library(data.table)
# Convert df1 to a data.table by reference before adding the column.
setDT(df1)
# NA^TRUE is NA and NA^FALSE is 1, so multiplying by this mask blanks the
# index on rows where num_events is 1 and leaves every other row unchanged.
df1[, index_event := NA^(num_events == 1) * rowid(id_question)]
或使用 base R:另一种选择是对 'id_question' 的频数使用 `sequence` 生成组内序号,并像前一种情况那样把相应元素改为 NA。
# Number the rows inside each run of identical id_question values.
# rle() reports run lengths in the order the rows actually appear, whereas
# table() reports counts in sorted key order and therefore only lines up with
# the rows when the data happen to be arranged to match that order.
# Assumes rows belonging to the same id_question are adjacent.
# NA^TRUE is NA and NA^FALSE is 1, so rows with num_events == 1 become NA.
df1$index_event <- with(
  df1,
  sequence(rle(id_question)$lengths) * NA^(num_events == 1)
)
df1$index_event
#[1] NA NA 1 2 3 1 2 3 NA NA
数据
# Example data: one row per event; num_events holds the number of events
# recorded for that row's id_question.
df1 <- data.frame(
  id_question = c(
    2015012713L, 2015012711L, 2015011523L, 2015011523L, 2015011523L,
    2015011522L, 2015011522L, 2015011522L, 2015011521L, 2015011519L
  ),
  id_event = c(
    49508L, 49708L, 41808L, 44008L, 44108L,
    41508L, 43608L, 43708L, 39708L, 44208L
  ),
  num_events = c(1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L)
)
如果 `num_events == 1`,可以返回 `NA`;否则为每个 `id_question` 创建一个组内行索引。
这可以在 base R 中完成:
# For each id_question, number the rows and blank the index wherever the flag
# num_events == 1 is TRUE. ave() applies the function to the flag group by
# group and returns the results in the original row order.
df$index_event <- with(df, ave(
  num_events == 1, id_question,
  FUN = function(is_single) {
    idx <- seq_along(is_single)
    idx[is_single] <- NA
    idx
  }
))
df
# id_question id_event num_events index_event
#1 2015012713 49508 1 NA
#2 2015012711 49708 1 NA
#3 2015011523 41808 3 1
#4 2015011523 44008 3 2
#5 2015011523 44108 3 3
#6 2015011522 41508 3 1
#7 2015011522 43608 3 2
#8 2015011522 43708 3 3
#9 2015011521 39708 1 NA
#10 2015011519 44208 1 NA
使用 dplyr:
library(dplyr)
# Index the events within each question; rows whose num_events is 1 get NA.
# ungroup() drops the grouping afterwards so that later verbs applied to the
# result are not silently evaluated per id_question.
df %>%
  group_by(id_question) %>%
  mutate(index_event = if_else(num_events == 1, NA_integer_, row_number())) %>%
  ungroup()
或使用 data.table:
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Within each id_question, number the rows and set the index to NA on rows
# where num_events is 1.
df[
  ,
  index_event := ifelse(num_events == 1, NA_integer_, seq_along(num_events)),
  by = id_question
]
数据
# Example data: one row per event; num_events holds the number of events
# recorded for that row's id_question.
df <- data.frame(
  id_question = c(
    2015012713L, 2015012711L, 2015011523L, 2015011523L, 2015011523L,
    2015011522L, 2015011522L, 2015011522L, 2015011521L, 2015011519L
  ),
  id_event = c(
    49508L, 49708L, 41808L, 44008L, 44108L,
    41508L, 43608L, 43708L, 39708L, 44208L
  ),
  num_events = c(1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L)
)
我有以下 table:
id_question id_event num_events
2015012713 49508 1
2015012711 49708 1
2015011523 41808 3
2015011523 44008 3
2015011523 44108 3
2015011522 41508 3
2015011522 43608 3
2015011522 43708 3
2015011521 39708 1
2015011519 44208 1
第三列按问题给出事件数。我想创建一个变量,该变量仅在每个问题有多个事件的情况下按问题对事件进行索引。它看起来像这样:
id_question id_event num_events index_event
2015012713 49508 1
2015012711 49708 1
2015011523 41808 3 1
2015011523 44008 3 2
2015011523 44108 3 3
2015011522 41508 3 1
2015011522 43608 3 2
2015011522 43708 3 3
2015011521 39708 1
2015011519 44208 1
我该怎么做?
我们可以使用 tidyverse:按 'id_question' 分组后创建 'index_event'。如果组内行数大于 1(`n() > 1`),则取组内的行序号(`row_number()`);`case_when` 在没有任何条件匹配时默认返回 `NA`。
library(dplyr)
# Number the rows inside each id_question group. The only case_when() branch
# fires when the group has more than one row; otherwise no branch matches and
# case_when() yields NA.
mutate(
  group_by(df1, id_question),
  index_event = case_when(n() > 1 ~ row_number())
)
# A tibble: 10 x 4
# Groups: id_question [6]
# id_question id_event num_events index_event
# <int> <int> <int> <int>
# 1 2015012713 49508 1 NA
# 2 2015012711 49708 1 NA
# 3 2015011523 41808 3 1
# 4 2015011523 44008 3 2
# 5 2015011523 44108 3 3
# 6 2015011522 41508 3 1
# 7 2015011522 43608 3 2
# 8 2015011522 43708 3 3
# 9 2015011521 39708 1 NA
#10 2015011519 44208 1 NA
或者使用 data.table:对 'id_question' 使用 `rowid` 得到组内序号,再乘以 `NA^(num_events == 1)`,把 'num_events' 为 1 的元素变为 `NA`(利用 `NA^0` 等于 1、`NA^1` 等于 `NA`)。
library(data.table)
# Convert df1 to a data.table by reference before adding the column.
setDT(df1)
# NA^TRUE is NA and NA^FALSE is 1, so multiplying by this mask blanks the
# index on rows where num_events is 1 and leaves every other row unchanged.
df1[, index_event := NA^(num_events == 1) * rowid(id_question)]
或使用 base R:另一种选择是对 'id_question' 的频数使用 `sequence` 生成组内序号,并像前一种情况那样把相应元素改为 NA。
# Number the rows inside each run of identical id_question values.
# rle() reports run lengths in the order the rows actually appear, whereas
# table() reports counts in sorted key order and therefore only lines up with
# the rows when the data happen to be arranged to match that order.
# Assumes rows belonging to the same id_question are adjacent.
# NA^TRUE is NA and NA^FALSE is 1, so rows with num_events == 1 become NA.
df1$index_event <- with(
  df1,
  sequence(rle(id_question)$lengths) * NA^(num_events == 1)
)
df1$index_event
#[1] NA NA 1 2 3 1 2 3 NA NA
数据
# Example data: one row per event; num_events holds the number of events
# recorded for that row's id_question.
df1 <- data.frame(
  id_question = c(
    2015012713L, 2015012711L, 2015011523L, 2015011523L, 2015011523L,
    2015011522L, 2015011522L, 2015011522L, 2015011521L, 2015011519L
  ),
  id_event = c(
    49508L, 49708L, 41808L, 44008L, 44108L,
    41508L, 43608L, 43708L, 39708L, 44208L
  ),
  num_events = c(1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L)
)
如果 `num_events == 1`,可以返回 `NA`;否则为每个 `id_question` 创建一个组内行索引。
这可以在 base R 中完成:
# For each id_question, number the rows and blank the index wherever the flag
# num_events == 1 is TRUE. ave() applies the function to the flag group by
# group and returns the results in the original row order.
df$index_event <- with(df, ave(
  num_events == 1, id_question,
  FUN = function(is_single) {
    idx <- seq_along(is_single)
    idx[is_single] <- NA
    idx
  }
))
df
# id_question id_event num_events index_event
#1 2015012713 49508 1 NA
#2 2015012711 49708 1 NA
#3 2015011523 41808 3 1
#4 2015011523 44008 3 2
#5 2015011523 44108 3 3
#6 2015011522 41508 3 1
#7 2015011522 43608 3 2
#8 2015011522 43708 3 3
#9 2015011521 39708 1 NA
#10 2015011519 44208 1 NA
使用 dplyr:
library(dplyr)
# Index the events within each question; rows whose num_events is 1 get NA.
# ungroup() drops the grouping afterwards so that later verbs applied to the
# result are not silently evaluated per id_question.
df %>%
  group_by(id_question) %>%
  mutate(index_event = if_else(num_events == 1, NA_integer_, row_number())) %>%
  ungroup()
或使用 data.table:
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Within each id_question, number the rows and set the index to NA on rows
# where num_events is 1.
df[
  ,
  index_event := ifelse(num_events == 1, NA_integer_, seq_along(num_events)),
  by = id_question
]
数据
# Example data: one row per event; num_events holds the number of events
# recorded for that row's id_question.
df <- data.frame(
  id_question = c(
    2015012713L, 2015012711L, 2015011523L, 2015011523L, 2015011523L,
    2015011522L, 2015011522L, 2015011522L, 2015011521L, 2015011519L
  ),
  id_event = c(
    49508L, 49708L, 41808L, 44008L, 44108L,
    41508L, 43608L, 43708L, 39708L, 44208L
  ),
  num_events = c(1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L)
)