识别相关记录并为其分配唯一 ID/数据清理
Identifying related records and assigning them unique IDs / data clean-up
我有一个超级混乱的数据集。有两列标识每一行:
"id" - 这是每条记录的唯一标识符
"code" - 指的是项目代码。一个项目"code"可以有多个记录。
数据集样本:
df <- data.frame(stringsAsFactors=FALSE,
id = c("C01182", "C00966", "C00130", "d34567", "c34567", "C01142",
"C00241", "C00232", "C01094", "C00979", "C00144"),
code = c("13762", "13762", "13762, 13886,13850", "55653", "65247",
"13698", "13698", "13698", "13880", "13773, 13858, 13880", "13773, 13880")
)
我想要的是能够通过"code"
来识别相关记录
我想要的输出示例是:
df1 <- data.frame(stringsAsFactors=FALSE,
id = c("C01182", "C00966", "C00130", "d34567", "c34567", "C01142",
"C00241", "C00232", "C01094", "C00979", "C00144"),
code = c("13762", "13762", "13762, 13886,13850", "55653", "65247",
"13698", "13698", "13698", "13880", "13773, 13858, 13880", "13773, 13880"),
new_col = c("unique_id_1", "unique_id_1", "unique_id_1", "unique_id_2",
"unique_id_3", "unique_id_4", "unique_id_4",
"unique_id_4", "unique_id_5", "unique_id_5", "unique_id_5")
)
new_col
由 code
:
驱动
- 当有一堆代码时,这个记录连同所有
具有任何这些代码的其他记录被认为是
独一无二的套装。
- 也有相同的记录
code
,这些也需要标记为一个独特的集合
唯一标识符可以是任何东西,不必像示例那样。
任何想法都可以实现
新答案:
# load 'data.table' package & convert 'df' to a data.table
library(data.table)
setDT(df)
d1 <- df[, .(code = unlist(tstrsplit(code, ',', type.convert = TRUE))), by = id]
d2 <- d1[, dcast(.SD, id ~ code, fun = length)][, -1]
df[d1[.(code = as.integer(names(d2)), val = colSums(as.matrix(d2)))
, on = .(code), val := i.val][, .(code = code[which.max(val)]), by = id]
, on = .(id)
, new_col := rleid(i.code)][]
给出:
> df
id code new_col
1: C01182 13762 1
2: C00966 13762 1
3: C00130 13762, 13886,13850 1
4: d34567 55653 2
5: c34567 65247 3
6: C01142 13698 4
7: C00241 13698 4
8: C00232 13698 4
9: C01094 13880 5
10: C00979 13773, 13858, 13880 5
11: C00144 13773, 13880 5
这是做什么的:
setDT(df)
将 df
转换为 data.table
.
d1 <- df[, .(code = unlist(tstrsplit(code, ',', type.convert = TRUE))), by = id]
将 df
转换为长格式,其中每个 code
都有一个新行。
d2 <- d1[, dcast(.SD, id ~ code, fun = length)][, -1]
创建一个宽 data.table
,其中 code
作为列,每个 id
表示 code
出现的次数与之相关。使用 [-1]
,id
列被删除,因为下一步不需要它。
- 最后一部分可以分成几个部分:
.(code = as.integer(names(d2)), val = colSums(as.matrix(d2)))
创建一个长临时 data.table
,其中每个 code
出现的次数。
- 将上一步的临时
data.table
与 d1
和 code
合并为 join-key (on = .(code)
) 并将计数添加到通过引用对应 codes
(val := i.val
)。之后,对于每个 id
,仅选择 [, .(code = code[which.max(val)]), by = id]
部分计数最高的 code
(= val
)。
- 最后,这个 最频繁的代码 由
id
(on = .(id)
) 连接回 df
并创建 new_col
通过创建 run-lenght-id of code
(new_col := rleid(code)
).
@minem 指定的更大数据集的速度比较:
n1 <- 10000
n2 <- 10000
set.seed(20)
ll <- lapply(1:n1, function(x) sample(1:n2, sample(1:5, 1)))
dfl <- data.table(id = 1:n1, code = sapply(ll, paste, collapse = ', '))
时间:
> system.time(getGroupsJaap(dfl))
user system elapsed
1.878 0.595 2.479
> system.time(getGroupsMinem(dfl))
user system elapsed
4.332 0.598 4.931
通过将 colSums
替换为 matrixStats
-package 中的 colSums2
,可以进一步(尽管很小)改进我的方法。
旧答案:
dfc <- lapply(strsplit(df$code, ','), type.convert)
m <- as.data.table(outer(unlist(dfc),
unlist(dfc),
'==')
)[, lapply(.SD, sum), rep(seq_along(dfc), lengths(dfc))
][, rep := NULL][, t(.SD)]
dt <- data.table(id = rep(df$id, lengths(dfc)),
m)[, grp := .GRP, by = V1:V11
][, rs := rowSums(.SD), .SDcols = 2:12
][, .(grp = grp[rs == max(rs)]), by = id
][, unid := paste0('unique_id_', .GRP), by = grp][]
df[dt, on = .(id), new_col := unid][]
getGroups <- function(df) {
require(data.table)
setDT(df)
l <- strsplit(df$code, ",")
l <- lapply(l, as.integer)
x <- rep(df$id, times = sapply(l, length))
d <- data.table(id = x, code = unlist(l))
D <- dcast(d, id ~ code, fun.aggregate = length, value.var = 'code')
x <- as.matrix(D[, -1])
g <- rep(0L, nrow(x))
cols <- 1:ncol(x)
i <- cols[1]
test <- colSums(x)
gi <- 1
while (length(cols) > 0) {
r = F
while (r == F) {
y <- rowSums(x[, i, drop = F]) > 0
ssums <- colSums(x[y, , drop = F])
i <- ssums > 0
r <- all(test[i] == ssums[i])
}
g[y] <- gi
gi <- gi + 1
cols <- cols[!(cols %in% which(i))]
i <- cols[1]
}
m <- D[, .(id, group = g)]
results <- merge(df, m, by = 'id', sort = F)
results[]
}
结果:
getGroups(df)
# id code group
# 1: C01182 13762 2
# 2: C00966 13762 2
# 3: C00130 13762, 13886,13850 2
# 4: d34567 55653 4
# 5: c34567 65247 5
# 6: C01142 13698 1
# 7: C00241 13698 1
# 8: C00232 13698 1
# 9: C01094 13880 3
# 10: C00979 13773, 13858, 13880 3
# 11: C00144 13773, 13880 3
更新:
为了提高速度,我们可以将 colSums
/rowSums
替换为 colSums2
/rowSums2
:
getGroups <- function(df) {
require(data.table)
require(matrixStats)
setDT(df) # convert df to data.table
l <- strsplit(df$code, ",") # split codes to list
l <- lapply(l, as.integer) # convert them to integers for efficiency
x <- rep(df$id, times = sapply(l, length)) # create id vector for each code
d <- data.table(id = x, code = unlist(l)) # combine into data.table
# converts the data from long to wide format ( each column represents if id has particular code):
D <- dcast(d, id ~ code, fun.aggregate = length, value.var = 'code')
x <- as.matrix.data.frame(D[, -1]) # convert to matrix and removes id column
g <- rep(0L, nrow(x)) # initialise result vector
cols <- 1:ncol(x) # creates column indices vector
test <- colSums(x) # calculates how much id have each code / for testing if we have selected all
gi <- 1 # first group value
while (length(cols) > 0) {
i <- cols[test[cols] == max(test[cols])][1] # selects code column from remaining which have largest count of id`s
r <- F # initialise indicator if we have selected all id in group
while (r == F) {
if (is.integer(i) != T) i <- which(i == T) # if logical converts to integer indicies
y <- rowSums2(x, cols = i) > 0 # get indices of ids which is in current selection
ssums <- colSums2(x, rows = y) # for those ids get all code columns and cont how many ids have each
i <- ssums > 0 # converts to logical
r <- all(test[i] == ssums[i]) # if selected column sums are equal to initial col sums, then we have selected all one group ids
}
g[y] <- gi # give group id
gi <- gi + 1 # increase group id
cols <- cols[!(cols %in% which(i))] # remove cols that was in this group
}
m <- D[, .(id, group = g)]
results <- merge(df, m, by = 'id', sort = F) # merge group id to initial data
results[]
}
更大数据的时间:
n1 <- 10000
n2 <- 10000
set.seed(20)
ll <- lapply(1:n1, function(x) sample(1:n2, sample(1:5, 1)))
df <- data.table(id = 1:n1, codes = sapply(ll, paste, collapse = ', '))
system.time(wFroups1 <- getGroupsOld(df)) # 17.96
system.time(wFroups2 <- getGroups(df)) #5.35
我有一个超级混乱的数据集。有两列标识每一行:
"id" - 这是每条记录的唯一标识符
"code" - 指的是项目代码。一个项目"code"可以有多个记录。
数据集样本:
df <- data.frame(stringsAsFactors=FALSE,
id = c("C01182", "C00966", "C00130", "d34567", "c34567", "C01142",
"C00241", "C00232", "C01094", "C00979", "C00144"),
code = c("13762", "13762", "13762, 13886,13850", "55653", "65247",
"13698", "13698", "13698", "13880", "13773, 13858, 13880", "13773, 13880")
)
我想要的是能够通过"code"
来识别相关记录我想要的输出示例是:
df1 <- data.frame(stringsAsFactors=FALSE,
id = c("C01182", "C00966", "C00130", "d34567", "c34567", "C01142",
"C00241", "C00232", "C01094", "C00979", "C00144"),
code = c("13762", "13762", "13762, 13886,13850", "55653", "65247",
"13698", "13698", "13698", "13880", "13773, 13858, 13880", "13773, 13880"),
new_col = c("unique_id_1", "unique_id_1", "unique_id_1", "unique_id_2",
"unique_id_3", "unique_id_4", "unique_id_4",
"unique_id_4", "unique_id_5", "unique_id_5", "unique_id_5")
)
new_col
由 code
:
驱动
- 当有一堆代码时,这个记录连同所有 具有任何这些代码的其他记录被认为是 独一无二的套装。
- 也有相同的记录
code
,这些也需要标记为一个独特的集合
唯一标识符可以是任何东西,不必像示例那样。
任何想法都可以实现
新答案:
# load 'data.table' package & convert 'df' to a data.table
library(data.table)
setDT(df)
d1 <- df[, .(code = unlist(tstrsplit(code, ',', type.convert = TRUE))), by = id]
d2 <- d1[, dcast(.SD, id ~ code, fun = length)][, -1]
df[d1[.(code = as.integer(names(d2)), val = colSums(as.matrix(d2)))
, on = .(code), val := i.val][, .(code = code[which.max(val)]), by = id]
, on = .(id)
, new_col := rleid(i.code)][]
给出:
> df id code new_col 1: C01182 13762 1 2: C00966 13762 1 3: C00130 13762, 13886,13850 1 4: d34567 55653 2 5: c34567 65247 3 6: C01142 13698 4 7: C00241 13698 4 8: C00232 13698 4 9: C01094 13880 5 10: C00979 13773, 13858, 13880 5 11: C00144 13773, 13880 5
这是做什么的:
setDT(df)
将df
转换为data.table
.d1 <- df[, .(code = unlist(tstrsplit(code, ',', type.convert = TRUE))), by = id]
将df
转换为长格式,其中每个code
都有一个新行。d2 <- d1[, dcast(.SD, id ~ code, fun = length)][, -1]
创建一个宽data.table
,其中code
作为列,每个id
表示code
出现的次数与之相关。使用[-1]
,id
列被删除,因为下一步不需要它。- 最后一部分可以分成几个部分:
.(code = as.integer(names(d2)), val = colSums(as.matrix(d2)))
创建一个长临时data.table
,其中每个code
出现的次数。- 将上一步的临时
data.table
与d1
和code
合并为 join-key (on = .(code)
) 并将计数添加到通过引用对应codes
(val := i.val
)。之后,对于每个id
,仅选择[, .(code = code[which.max(val)]), by = id]
部分计数最高的code
(=val
)。 - 最后,这个 最频繁的代码 由
id
(on = .(id)
) 连接回df
并创建new_col
通过创建 run-lenght-id ofcode
(new_col := rleid(code)
).
@minem 指定的更大数据集的速度比较:
n1 <- 10000
n2 <- 10000
set.seed(20)
ll <- lapply(1:n1, function(x) sample(1:n2, sample(1:5, 1)))
dfl <- data.table(id = 1:n1, code = sapply(ll, paste, collapse = ', '))
时间:
> system.time(getGroupsJaap(dfl)) user system elapsed 1.878 0.595 2.479 > system.time(getGroupsMinem(dfl)) user system elapsed 4.332 0.598 4.931
通过将 colSums
替换为 matrixStats
-package 中的 colSums2
,可以进一步(尽管很小)改进我的方法。
旧答案:
dfc <- lapply(strsplit(df$code, ','), type.convert)
m <- as.data.table(outer(unlist(dfc),
unlist(dfc),
'==')
)[, lapply(.SD, sum), rep(seq_along(dfc), lengths(dfc))
][, rep := NULL][, t(.SD)]
dt <- data.table(id = rep(df$id, lengths(dfc)),
m)[, grp := .GRP, by = V1:V11
][, rs := rowSums(.SD), .SDcols = 2:12
][, .(grp = grp[rs == max(rs)]), by = id
][, unid := paste0('unique_id_', .GRP), by = grp][]
df[dt, on = .(id), new_col := unid][]
getGroups <- function(df) {
require(data.table)
setDT(df)
l <- strsplit(df$code, ",")
l <- lapply(l, as.integer)
x <- rep(df$id, times = sapply(l, length))
d <- data.table(id = x, code = unlist(l))
D <- dcast(d, id ~ code, fun.aggregate = length, value.var = 'code')
x <- as.matrix(D[, -1])
g <- rep(0L, nrow(x))
cols <- 1:ncol(x)
i <- cols[1]
test <- colSums(x)
gi <- 1
while (length(cols) > 0) {
r = F
while (r == F) {
y <- rowSums(x[, i, drop = F]) > 0
ssums <- colSums(x[y, , drop = F])
i <- ssums > 0
r <- all(test[i] == ssums[i])
}
g[y] <- gi
gi <- gi + 1
cols <- cols[!(cols %in% which(i))]
i <- cols[1]
}
m <- D[, .(id, group = g)]
results <- merge(df, m, by = 'id', sort = F)
results[]
}
结果:
getGroups(df)
# id code group
# 1: C01182 13762 2
# 2: C00966 13762 2
# 3: C00130 13762, 13886,13850 2
# 4: d34567 55653 4
# 5: c34567 65247 5
# 6: C01142 13698 1
# 7: C00241 13698 1
# 8: C00232 13698 1
# 9: C01094 13880 3
# 10: C00979 13773, 13858, 13880 3
# 11: C00144 13773, 13880 3
更新:
为了提高速度,我们可以将 colSums
/rowSums
替换为 colSums2
/rowSums2
:
getGroups <- function(df) {
require(data.table)
require(matrixStats)
setDT(df) # convert df to data.table
l <- strsplit(df$code, ",") # split codes to list
l <- lapply(l, as.integer) # convert them to integers for efficiency
x <- rep(df$id, times = sapply(l, length)) # create id vector for each code
d <- data.table(id = x, code = unlist(l)) # combine into data.table
# converts the data from long to wide format ( each column represents if id has particular code):
D <- dcast(d, id ~ code, fun.aggregate = length, value.var = 'code')
x <- as.matrix.data.frame(D[, -1]) # convert to matrix and removes id column
g <- rep(0L, nrow(x)) # initialise result vector
cols <- 1:ncol(x) # creates column indices vector
test <- colSums(x) # calculates how much id have each code / for testing if we have selected all
gi <- 1 # first group value
while (length(cols) > 0) {
i <- cols[test[cols] == max(test[cols])][1] # selects code column from remaining which have largest count of id`s
r <- F # initialise indicator if we have selected all id in group
while (r == F) {
if (is.integer(i) != T) i <- which(i == T) # if logical converts to integer indicies
y <- rowSums2(x, cols = i) > 0 # get indices of ids which is in current selection
ssums <- colSums2(x, rows = y) # for those ids get all code columns and cont how many ids have each
i <- ssums > 0 # converts to logical
r <- all(test[i] == ssums[i]) # if selected column sums are equal to initial col sums, then we have selected all one group ids
}
g[y] <- gi # give group id
gi <- gi + 1 # increase group id
cols <- cols[!(cols %in% which(i))] # remove cols that was in this group
}
m <- D[, .(id, group = g)]
results <- merge(df, m, by = 'id', sort = F) # merge group id to initial data
results[]
}
更大数据的时间:
n1 <- 10000
n2 <- 10000
set.seed(20)
ll <- lapply(1:n1, function(x) sample(1:n2, sample(1:5, 1)))
df <- data.table(id = 1:n1, codes = sapply(ll, paste, collapse = ', '))
system.time(wFroups1 <- getGroupsOld(df)) # 17.96
system.time(wFroups2 <- getGroups(df)) #5.35