在 R 中使用 data.table 提取文本字符串
Extracting text strings using data.table in R
我有一个类似下面这样的 data.table:
数据
library(data.table)
# Example data: six rows with an id (N), a label (VN), three free-text
# columns (T1, T2, T3) that may contain parenthesised tokens, and a
# numeric value (Val).
DT <- data.frame(
  N = 1:6,
  VN = c("v1", "v3", "v6", "v7a", "v18", "v23"),
  T1 = c(
    "bigby (wolf)", "white", "red (rose)",
    "piggy (straw)", "(curse) beast", "prince"
  ),
  T2 = c(
    "jack (bean)", "snow (dwarves)", "beard (blue)",
    "bhageera (jungle) mowgli (book)", "beauty", "glass (slipper)"
  ),
  T3 = c("hk (34)", "VL (r45)", "tg (h5)", "tt (HG) (45)", "gh", "vlp"),
  Val = c(36, 25, 0.84, 12, 78, 258),
  stringsAsFactors = FALSE
)
# Convert the data.frame to a data.table (setDT works by reference, so no
# reassignment is needed), then print it.
setDT(DT)
DT
N VN T1 T2 T3 Val
1: 1 v1 bigby (wolf) jack (bean) hk (34) 36.00
2: 2 v3 white snow (dwarves) VL (r45) 25.00
3: 3 v6 red (rose) beard (blue) tg (h5) 0.84
4: 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00
5: 5 v18 (curse) beast beauty gh 78.00
6: 6 v23 prince glass (slipper) vlp 258.00
我想把列 T1 和 T2 中括号内的所有字符串提取到新列 C 中。
我可以按如下方式对单行进行处理。
按行计算
# Switch DT back to a plain data.frame so DT[i, cols] uses base indexing.
setDF(DT)
dtf <- c("T1", "T2")
# Row 4: extract every "(...)" group from T1 and T2, drop duplicates and join
# them with a space. The parentheses must be written as "\\(" and "\\)" in an
# R string literal; a single backslash is an invalid escape and fails to parse.
paste(
  unique(unlist(
    regmatches(DT[4, dtf], gregexpr("(?=\\().*?(?<=\\))", DT[4, dtf], perl = TRUE))
  )),
  collapse = " "
)
[1] "(straw) (jungle) (book)"
# Row 3: same extraction; parentheses escaped as "\\(" / "\\)" so the string
# literal parses, and TRUE spelled out instead of the reassignable T.
paste(
  unique(unlist(
    regmatches(DT[3, dtf], gregexpr("(?=\\().*?(?<=\\))", DT[3, dtf], perl = TRUE))
  )),
  collapse = " "
)
[1] "(rose) (blue)"
但我无法用 data.table 得到类似的结果。
试试 data.table
setDT(DT)
# Attempt without any grouping: this does not produce the desired per-row
# result. The regex parentheses are escaped as "\\(" / "\\)" so that the
# string literal is valid R.
DT[, C := paste(
  unique(unlist(
    regmatches(get(dtf), gregexpr("(?=\\().*?(?<=\\))", get(dtf), perl = TRUE))
  )),
  collapse = " "
)]
如何使用 data.table 得到想要的结果?
想要的结果
# Expected result: the input columns plus C, which holds the space-separated
# parenthesised tokens taken from T1 and T2 of the same row.
out <- data.frame(
  N = 1:6,
  VN = c("v1", "v3", "v6", "v7a", "v18", "v23"),
  T1 = c(
    "bigby (wolf)", "white", "red (rose)",
    "piggy (straw)", "(curse) beast", "prince"
  ),
  T2 = c(
    "jack (bean)", "snow (dwarves)", "beard (blue)",
    "bhageera (jungle) mowgli (book)", "beauty", "glass (slipper)"
  ),
  T3 = c("hk (34)", "VL (r45)", "tg (h5)", "tt (HG) (45)", "gh", "vlp"),
  Val = c(36, 25, 0.84, 12, 78, 258),
  C = c(
    "(wolf) (bean)", "(dwarves)", "(rose) (blue)",
    "(straw) (jungle) (book)", "(curse)", "(slipper)"
  ),
  stringsAsFactors = FALSE
)
out
N VN T1 T2 T3 Val C
1 1 v1 bigby (wolf) jack (bean) hk (34) 36.00 (wolf) (bean)
2 2 v3 white snow (dwarves) VL (r45) 25.00 (dwarves)
3 3 v6 red (rose) beard (blue) tg (h5) 0.84 (rose) (blue)
4 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00 (straw) (jungle) (book)
5 5 v18 (curse) beast beauty gh 78.00 (curse)
6 6 v23 prince glass (slipper) vlp 258.00 (slipper)
您可以使用 by 和 .SDcols 来实现这一点。
setDT(DT)
dtf <- c("T1", "T2")
# For each row (N is unique per row in this data), collect every "(...)" group
# found in the .SDcols columns, drop duplicates and join them with a space.
# - Parentheses are escaped as "\\(" / "\\)"; a single backslash is an invalid
#   escape in an R string literal.
# - unlist(.SD) flattens the per-group subset into a plain character vector
#   before it is handed to the regex functions.
DT[,
  C := {
    txt <- unlist(.SD, use.names = FALSE)
    hits <- regmatches(txt, gregexpr("(?=\\().*?(?<=\\))", txt, perl = TRUE))
    paste(unique(unlist(hits)), collapse = " ")
  },
  by = N,
  .SDcols = dtf
]
DT
## N VN T1 T2 T3 Val C
## 1: 1 v1 bigby (wolf) jack (bean) hk (34) 36.00 (wolf) (bean)
## 2: 2 v3 white snow (dwarves) VL (r45) 25.00 (dwarves)
## 3: 3 v6 red (rose) beard (blue) tg (h5) 0.84 (rose) (blue)
## 4: 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00 (straw) (jungle) (book)
## 5: 5 v18 (curse) beast beauty gh 78.00 (curse)
## 6: 6 v23 prince glass (slipper) vlp 258.00 (slipper)
我有一个类似下面这样的 data.table:
数据
library(data.table)
# Example data: six rows with an id (N), a label (VN), three free-text
# columns (T1, T2, T3) that may contain parenthesised tokens, and a
# numeric value (Val).
DT <- data.frame(
  N = 1:6,
  VN = c("v1", "v3", "v6", "v7a", "v18", "v23"),
  T1 = c(
    "bigby (wolf)", "white", "red (rose)",
    "piggy (straw)", "(curse) beast", "prince"
  ),
  T2 = c(
    "jack (bean)", "snow (dwarves)", "beard (blue)",
    "bhageera (jungle) mowgli (book)", "beauty", "glass (slipper)"
  ),
  T3 = c("hk (34)", "VL (r45)", "tg (h5)", "tt (HG) (45)", "gh", "vlp"),
  Val = c(36, 25, 0.84, 12, 78, 258),
  stringsAsFactors = FALSE
)
# Convert the data.frame to a data.table (setDT works by reference, so no
# reassignment is needed), then print it.
setDT(DT)
DT
N VN T1 T2 T3 Val
1: 1 v1 bigby (wolf) jack (bean) hk (34) 36.00
2: 2 v3 white snow (dwarves) VL (r45) 25.00
3: 3 v6 red (rose) beard (blue) tg (h5) 0.84
4: 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00
5: 5 v18 (curse) beast beauty gh 78.00
6: 6 v23 prince glass (slipper) vlp 258.00
我想把列 T1 和 T2 中括号内的所有字符串提取到新列 C 中。
我可以按如下方式对单行进行处理。
按行计算
# Switch DT back to a plain data.frame so DT[i, cols] uses base indexing.
setDF(DT)
dtf <- c("T1", "T2")
# Row 4: extract every "(...)" group from T1 and T2, drop duplicates and join
# them with a space. The parentheses must be written as "\\(" and "\\)" in an
# R string literal; a single backslash is an invalid escape and fails to parse.
paste(
  unique(unlist(
    regmatches(DT[4, dtf], gregexpr("(?=\\().*?(?<=\\))", DT[4, dtf], perl = TRUE))
  )),
  collapse = " "
)
[1] "(straw) (jungle) (book)"
# Row 3: same extraction; parentheses escaped as "\\(" / "\\)" so the string
# literal parses, and TRUE spelled out instead of the reassignable T.
paste(
  unique(unlist(
    regmatches(DT[3, dtf], gregexpr("(?=\\().*?(?<=\\))", DT[3, dtf], perl = TRUE))
  )),
  collapse = " "
)
[1] "(rose) (blue)"
但我无法用 data.table 得到类似的结果。
试试 data.table
setDT(DT)
# Attempt without any grouping: this does not produce the desired per-row
# result. The regex parentheses are escaped as "\\(" / "\\)" so that the
# string literal is valid R.
DT[, C := paste(
  unique(unlist(
    regmatches(get(dtf), gregexpr("(?=\\().*?(?<=\\))", get(dtf), perl = TRUE))
  )),
  collapse = " "
)]
如何使用 data.table 得到想要的结果?
想要的结果
# Expected result: the input columns plus C, which holds the space-separated
# parenthesised tokens taken from T1 and T2 of the same row.
out <- data.frame(
  N = 1:6,
  VN = c("v1", "v3", "v6", "v7a", "v18", "v23"),
  T1 = c(
    "bigby (wolf)", "white", "red (rose)",
    "piggy (straw)", "(curse) beast", "prince"
  ),
  T2 = c(
    "jack (bean)", "snow (dwarves)", "beard (blue)",
    "bhageera (jungle) mowgli (book)", "beauty", "glass (slipper)"
  ),
  T3 = c("hk (34)", "VL (r45)", "tg (h5)", "tt (HG) (45)", "gh", "vlp"),
  Val = c(36, 25, 0.84, 12, 78, 258),
  C = c(
    "(wolf) (bean)", "(dwarves)", "(rose) (blue)",
    "(straw) (jungle) (book)", "(curse)", "(slipper)"
  ),
  stringsAsFactors = FALSE
)
out
N VN T1 T2 T3 Val C
1 1 v1 bigby (wolf) jack (bean) hk (34) 36.00 (wolf) (bean)
2 2 v3 white snow (dwarves) VL (r45) 25.00 (dwarves)
3 3 v6 red (rose) beard (blue) tg (h5) 0.84 (rose) (blue)
4 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00 (straw) (jungle) (book)
5 5 v18 (curse) beast beauty gh 78.00 (curse)
6 6 v23 prince glass (slipper) vlp 258.00 (slipper)
您可以使用 by 和 .SDcols 来实现这一点。
setDT(DT)
dtf <- c("T1", "T2")
# For each row (N is unique per row in this data), collect every "(...)" group
# found in the .SDcols columns, drop duplicates and join them with a space.
# - Parentheses are escaped as "\\(" / "\\)"; a single backslash is an invalid
#   escape in an R string literal.
# - unlist(.SD) flattens the per-group subset into a plain character vector
#   before it is handed to the regex functions.
DT[,
  C := {
    txt <- unlist(.SD, use.names = FALSE)
    hits <- regmatches(txt, gregexpr("(?=\\().*?(?<=\\))", txt, perl = TRUE))
    paste(unique(unlist(hits)), collapse = " ")
  },
  by = N,
  .SDcols = dtf
]
DT
## N VN T1 T2 T3 Val C
## 1: 1 v1 bigby (wolf) jack (bean) hk (34) 36.00 (wolf) (bean)
## 2: 2 v3 white snow (dwarves) VL (r45) 25.00 (dwarves)
## 3: 3 v6 red (rose) beard (blue) tg (h5) 0.84 (rose) (blue)
## 4: 4 v7a piggy (straw) bhageera (jungle) mowgli (book) tt (HG) (45) 12.00 (straw) (jungle) (book)
## 5: 5 v18 (curse) beast beauty gh 78.00 (curse)
## 6: 6 v23 prince glass (slipper) vlp 258.00 (slipper)