查找列中出现的常用词并将其删除
Finding the common words that appear in a column and removing them
我有一个数据框,其中有一列由文本字符串组成。
# Sample data: an id column and a column of space-separated words.
df <- data.frame(
  c(1, 2, 3, 4),
  c("one three", "one four five", "one two five", "one three five six")
)
我需要找到第二列中每一行都共有的词(在本例中,词 'one' 出现在第二列的每一行中)。
然后我需要删除那个常用词并获得新的 df:
# Expected result: the same data with the shared word "one" removed.
newdf <- data.frame(
  c(1, 2, 3, 4),
  c("three", "four five", "two five", "three five six")
)
我尝试使用 intersect、grep() 等函数,但无法实现。有没有 tidyverse 或类似的工具可以做到?
# Sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text = c("one three", "one four five", "one two five", "one three five six")
)

#' Drop whole words from space-separated strings.
#'
#' Works on tokens rather than on a regular expression, so a word is only
#' removed when it matches a whole token ("one" does not touch "someone"),
#' regex metacharacters in words are harmless, and removing a word from the
#' middle of a string does not leave a double space behind.
#'
#' @param x A character vector of space-separated words.
#' @param words A character vector of words to remove.
#' @return A character vector the same length as `x`; `NA` stays `NA`.
remove_words <- function(x, words) {
  tokens <- strsplit(x, " ", fixed = TRUE)
  kept <- vapply(
    tokens,
    function(w) paste(w[!w %in% words], collapse = " "),
    character(1)
  )
  # paste() turns NA into the string "NA"; restore real missing values.
  kept[is.na(x)] <- NA_character_
  trimws(kept)
}

# Find the words that occur in every row
common.words <- Reduce(intersect, strsplit(df$text, " ", fixed = TRUE))

# Remove the common words from each row
df$text2 <- remove_words(df$text, common.words)
#   id               text          text2
# 1  1          one three          three
# 2  2      one four five      four five
# 3  3       one two five       two five
# 4  4 one three five six three five six
多列更新
# Sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text = c("one three", "one four three five", "one two five",
           "one three five six"),
  text2 = c("five one three", "one four three five", "one two five",
            "one three five six")
)
library(data.table)
# Convert the data.frame to a data.table by reference
setDT(df)
# Columns to analyse
textcols <- c("text", "text2")
# Check: find the common words of each column
# df[, lapply(.SD, function(x) Reduce(intersect, strsplit(x, " "))), .SDcols = textcols]

# For each text column, drop the words shared by every row of that column.
# Filtering tokens (instead of building a regex) removes whole words only and
# never leaves a double space where a mid-string word was removed.
df[, (textcols) := lapply(.SD, function(x) {
  tokens <- strsplit(x, " ", fixed = TRUE)
  common_words <- Reduce(intersect, tokens)
  kept <- vapply(
    tokens,
    function(w) paste(w[!w %in% common_words], collapse = " "),
    character(1)
  )
  # paste() turns NA into the string "NA"; restore real missing values.
  kept[is.na(x)] <- NA_character_
  trimws(kept)
}),
.SDcols = textcols]
你也可以试试tidyverse
# Sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text = c("one three", "one four five", "one two five", "one three five six"),
  text2 = c("two three", "two four five", "two five", "two three five six"),
  text3 = c("one two three", "one four five two", "one two five",
            "one three five two six")
)
library(tidyverse)

# In every character column, remove the words shared by all rows of that
# column, then normalise the whitespace that is left.
df %>%
  mutate(across(where(is.character), function(col) {
    words <- stringi::stri_extract_all_words(col)
    common <- Reduce(intersect, words)
    common <- common[!is.na(common)]
    # No shared words: an empty regex would give NA or an error, so skip it.
    if (length(common) == 0) {
      return(str_squish(col))
    }
    # \Q...\E quotes each word literally; \b limits matches to whole words.
    alternatives <- paste0("\\Q", common, "\\E", collapse = "|")
    pattern <- paste0("\\b(?:", alternatives, ")\\b")
    # str_squish() trims the ends and collapses runs of inner whitespace.
    str_squish(str_remove_all(col, pattern))
  }))
id text text2 text3
1 1 three three three
2 2 four five four five four five
3 3 two five five five
4 4 three five six three five six three five six
将函数放入命名列表以保留列
# Same cleaning, but the function is passed in a named list so the original
# columns are kept and the results are added as <column>_new.
df %>%
  mutate(across(where(is.character), list(new = function(col) {
    words <- stringi::stri_extract_all_words(col)
    common <- Reduce(intersect, words)
    common <- common[!is.na(common)]
    # No shared words: an empty regex would give NA or an error, so skip it.
    if (length(common) == 0) {
      return(str_squish(col))
    }
    # \Q...\E quotes each word literally; \b limits matches to whole words.
    alternatives <- paste0("\\Q", common, "\\E", collapse = "|")
    pattern <- paste0("\\b(?:", alternatives, ")\\b")
    # str_squish() trims the ends and collapses runs of inner whitespace.
    str_squish(str_remove_all(col, pattern))
  })))
然后输出是:
id text text2 text3 text_new text2_new text3_new
1 1 one three two three one two three three three three
2 2 one four five two four five one four five two four five four five four five
3 3 one two five two five one two five two five five five
4 4 one three five six two three five six one three five two six three five six three five six three five six
我有一个数据框,其中有一列由文本字符串组成。
# Sample data: an id column and a column of space-separated words.
df <- data.frame(
  c(1, 2, 3, 4),
  c("one three", "one four five", "one two five", "one three five six")
)
我需要找到第二列中每一行都共有的词(在本例中,词 'one' 出现在第二列的每一行中)。
然后我需要删除那个常用词并获得新的 df:
# Expected result: the same data with the shared word "one" removed.
newdf <- data.frame(
  c(1, 2, 3, 4),
  c("three", "four five", "two five", "three five six")
)
我尝试使用 intersect、grep() 等函数,但无法实现。有没有 tidyverse 或类似的工具可以做到?
# Sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text = c("one three", "one four five", "one two five", "one three five six")
)

#' Drop whole words from space-separated strings.
#'
#' Works on tokens rather than on a regular expression, so a word is only
#' removed when it matches a whole token ("one" does not touch "someone"),
#' regex metacharacters in words are harmless, and removing a word from the
#' middle of a string does not leave a double space behind.
#'
#' @param x A character vector of space-separated words.
#' @param words A character vector of words to remove.
#' @return A character vector the same length as `x`; `NA` stays `NA`.
remove_words <- function(x, words) {
  tokens <- strsplit(x, " ", fixed = TRUE)
  kept <- vapply(
    tokens,
    function(w) paste(w[!w %in% words], collapse = " "),
    character(1)
  )
  # paste() turns NA into the string "NA"; restore real missing values.
  kept[is.na(x)] <- NA_character_
  trimws(kept)
}

# Find the words that occur in every row
common.words <- Reduce(intersect, strsplit(df$text, " ", fixed = TRUE))

# Remove the common words from each row
df$text2 <- remove_words(df$text, common.words)
#   id               text          text2
# 1  1          one three          three
# 2  2      one four five      four five
# 3  3       one two five       two five
# 4  4 one three five six three five six
多列更新
# Sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text = c("one three", "one four three five", "one two five",
           "one three five six"),
  text2 = c("five one three", "one four three five", "one two five",
            "one three five six")
)
library(data.table)
# Convert the data.frame to a data.table by reference
setDT(df)
# Columns to analyse
textcols <- c("text", "text2")
# Check: find the common words of each column
# df[, lapply(.SD, function(x) Reduce(intersect, strsplit(x, " "))), .SDcols = textcols]

# For each text column, drop the words shared by every row of that column.
# Filtering tokens (instead of building a regex) removes whole words only and
# never leaves a double space where a mid-string word was removed.
df[, (textcols) := lapply(.SD, function(x) {
  tokens <- strsplit(x, " ", fixed = TRUE)
  common_words <- Reduce(intersect, tokens)
  kept <- vapply(
    tokens,
    function(w) paste(w[!w %in% common_words], collapse = " "),
    character(1)
  )
  # paste() turns NA into the string "NA"; restore real missing values.
  kept[is.na(x)] <- NA_character_
  trimws(kept)
}),
.SDcols = textcols]
你也可以试试tidyverse
# Sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text = c("one three", "one four five", "one two five", "one three five six"),
  text2 = c("two three", "two four five", "two five", "two three five six"),
  text3 = c("one two three", "one four five two", "one two five",
            "one three five two six")
)
library(tidyverse)

# In every character column, remove the words shared by all rows of that
# column, then normalise the whitespace that is left.
df %>%
  mutate(across(where(is.character), function(col) {
    words <- stringi::stri_extract_all_words(col)
    common <- Reduce(intersect, words)
    common <- common[!is.na(common)]
    # No shared words: an empty regex would give NA or an error, so skip it.
    if (length(common) == 0) {
      return(str_squish(col))
    }
    # \Q...\E quotes each word literally; \b limits matches to whole words.
    alternatives <- paste0("\\Q", common, "\\E", collapse = "|")
    pattern <- paste0("\\b(?:", alternatives, ")\\b")
    # str_squish() trims the ends and collapses runs of inner whitespace.
    str_squish(str_remove_all(col, pattern))
  }))
id text text2 text3
1 1 three three three
2 2 four five four five four five
3 3 two five five five
4 4 three five six three five six three five six
将函数放入命名列表以保留列
# Same cleaning, but the function is passed in a named list so the original
# columns are kept and the results are added as <column>_new.
df %>%
  mutate(across(where(is.character), list(new = function(col) {
    words <- stringi::stri_extract_all_words(col)
    common <- Reduce(intersect, words)
    common <- common[!is.na(common)]
    # No shared words: an empty regex would give NA or an error, so skip it.
    if (length(common) == 0) {
      return(str_squish(col))
    }
    # \Q...\E quotes each word literally; \b limits matches to whole words.
    alternatives <- paste0("\\Q", common, "\\E", collapse = "|")
    pattern <- paste0("\\b(?:", alternatives, ")\\b")
    # str_squish() trims the ends and collapses runs of inner whitespace.
    str_squish(str_remove_all(col, pattern))
  })))
然后输出是:
id text text2 text3 text_new text2_new text3_new
1 1 one three two three one two three three three three
2 2 one four five two four five one four five two four five four five four five
3 3 one two five two five one two five two five five five
4 4 one three five six two three five six one three five two six three five six three five six three five six