加速 R 代码以通过向量化从字符串中删除停用词
Speedup R code to remove stop words from string by vectorizing
下面列出的代码可以成功运行,它能从文本中删除停用词以及相应的词性 [POS] 标签。但在数据量很大时,运行一次大约需要 4 个小时。
我在想,如果我通过矢量化摆脱 for 循环,它会加快速度。但我不知道它是否可能或它是否有用。我需要通过使用更好的方法来加速代码的帮助。
我可以使用 tm 包删除停用词(参见 “R tm removeWords stopwords is not removing stopwords”),但我还需要删除相应的 POS 标签,而这在 tm 包中是做不到的。
注意:我已经能够使用 foreach 将最外层的 for 循环并行化,使其在 12 个内核上运行。
代码:
# Reproducible data
# id             identifies the source row
# phrase         original string, tokens joined by "_"
# pos            space-separated part-of-speech tags, one per token (fake data)
# modifiedphrase copy of phrase, later overwritten with stop words removed
# modpos         copy of pos, later overwritten with the matching tags removed
id <- c(1, 2, 3)
phrase <- c(
  "choice_for_selection",
  "accordingly_choices_for_selection",
  "only_top_selection"
)
pos <- c("NN JJ NN", "NN JJ NN NN", "NNS NN NNS")
# Build the frame directly: going through cbind() first creates a character
# matrix, which turns id into text (and, before R 4.0, every column into a
# factor that then has to be converted back with as.character()).
df <- data.frame(
  id = id,
  phrase = phrase,
  pos = pos,
  modifiedphrase = phrase,
  modpos = pos,
  stringsAsFactors = FALSE
)
# stop words list
library(tm)
SWList <- stopwords(kind = "SMART")
library(stringr)

# Remove stop words from each phrase together with the POS tag at the same
# position. Rows are independent of each other, so the outer loop can still be
# parallelized (e.g. with foreach).
# seq_len() keeps the loop from running on a zero-row data frame.
for (i in seq_len(nrow(df))) {
  tokensplit <- str_split(df[i, "phrase"], "_")[[1]]
  possplit <- str_split(df[i, "pos"], " ")[[1]]
  # Positions of all stop words in one vectorized lookup, instead of testing
  # token by token and re-pasting the tags after every hit.
  forremoval <- which(tokensplit %in% SWList)
  # Rows without stop words keep their existing modifiedphrase/modpos.
  if (length(forremoval) > 0) {
    # paste(collapse = ) yields "" when every token was removed, so no
    # separate empty-result branch is needed.
    df[i, "modifiedphrase"] <- paste(tokensplit[-forremoval], collapse = "_")
    df[i, "modpos"] <- paste(possplit[-forremoval], collapse = " ")
  }
}
# Final output
print(df)
id phrase pos modifiedphrase modpos
1 1 choice_for_selection NN JJ NN choice_selection NN NN
2 2 accordingly_choices_for_selection NN JJ NN NN choices_selection JJ NN
3 3 only_top_selection NNS NN NNS top_selection NN NNS
>
这是一个使用 apply 的版本:
# stop words list
library(tm)
SWList <- stopwords(kind = "SMART")

# Split every phrase into tokens and every POS string into tags, once per row.
tokens <- strsplit(as.character(df$phrase), "_", fixed = TRUE)
tags <- strsplit(as.character(df$pos), " ", fixed = TRUE)
# One logical mask per row, TRUE for tokens that are not stop words. The same
# mask drives both columns so phrase and tags stay aligned. (setdiff() would
# also drop repeated tokens, leaving fewer tokens than tags.)
keep <- lapply(tokens, function(tok) !(tok %in% SWList))
rows <- seq_along(tokens)
# vapply() returns a plain, unnamed character vector even for zero rows.
df$modifiedphrase <- vapply(
  rows,
  function(i) paste(tokens[[i]][keep[[i]]], collapse = "_"),
  character(1)
)
df$modpos <- vapply(
  rows,
  function(i) paste(tags[[i]][keep[[i]]], collapse = " "),
  character(1)
)
我知道这是假数据,但您可能还想考虑删除停用词中的撇号:
# Strip apostrophes from the stop words (e.g. "don't" becomes "dont").
SWList <- gsub("'", "", SWList, fixed = TRUE)
更新
效率检查:
(1) 设置数据的函数:这样我们可以在每次效率检查前重新生成数据。
# Build a fresh copy of the example data so that each benchmark starts from
# unmodified input.
#
# Returns a data frame with columns id, phrase, pos, modifiedphrase and modpos;
# modifiedphrase and modpos start out as copies of phrase and pos.
setup_data <- function() {
  id <- c(1, 2, 3)
  phrase <- c(
    "choice_for_selection",
    "accordingly_choices_for_selection",
    "only_top_selection"
  )
  pos <- c("NN JJ NN", "NN JJ NN NN", "NNS NN NNS") # fake part of speech
  # Build the frame directly: going through cbind() first creates a character
  # matrix, which turns id into text (and, before R 4.0, every column into a
  # factor that then has to be converted back with as.character()).
  data.frame(
    id = id,
    phrase = phrase,
    pos = pos,
    modifiedphrase = phrase,
    modpos = pos,
    stringsAsFactors = FALSE
  )
}
(2) 原始的For-loop方法:
# Baseline for the benchmark: the original nested for-loop implementation.
# The per-token loop is kept on purpose so the timing still reflects the
# original algorithm. Needs stringr to be loaded for str_split().
#
# data       data frame with character columns phrase, pos, modifiedphrase and
#            modpos; defaults to the `df` visible where this function is
#            defined.
# stop_words character vector of stop words; defaults to `SWList`.
#
# Returns the data frame with modifiedphrase/modpos updated. The function works
# on its own copy, so the caller's data frame is not changed. Earlier the
# result was dropped and the function returned NULL, which made the output
# impossible to check.
forloop_method <- function(data = df, stop_words = SWList) {
  # seq_len() keeps the loop from running on a zero-row data frame.
  for (i in seq_len(nrow(data))) {
    tokensplit <- str_split(data[i, "phrase"], "_")[[1]]
    possplit <- str_split(data[i, "pos"], " ")[[1]]
    change <- 0
    forremoval <- NULL
    for (j in seq_along(tokensplit)) {
      if (tokensplit[j] %in% stop_words) {
        change <- 1
        forremoval <- append(forremoval, j)
        tmppos <- paste(possplit[-forremoval], collapse = " ")
      }
    }
    # Rows without stop words keep their existing modifiedphrase/modpos.
    if (change == 1) {
      # paste(collapse = ) already gives "" when every token was removed.
      data[i, "modifiedphrase"] <- paste(
        tokensplit[-forremoval],
        collapse = "_"
      )
      data[i, "modpos"] <- tmppos
    }
  }
  data
}
(3) apply 方法:
# Remove stop words and the POS tags at the same positions without an explicit
# per-token loop.
#
# data       data frame with columns phrase ("_"-separated tokens) and pos
#            (space-separated tags); defaults to the `df` visible where this
#            function is defined.
# stop_words character vector of stop words; defaults to `SWList`.
#
# Returns the data frame with modifiedphrase and modpos filled in. The function
# works on its own copy, so the caller's data frame is not changed.
apply_method <- function(data = df, stop_words = SWList) {
  tokens <- strsplit(as.character(data$phrase), "_", fixed = TRUE)
  tags <- strsplit(as.character(data$pos), " ", fixed = TRUE)
  # One logical mask per row, TRUE for tokens that are not stop words. The same
  # mask drives both columns so phrase and tags stay aligned. (setdiff() would
  # also drop repeated tokens, leaving fewer tokens than tags.)
  keep <- lapply(tokens, function(tok) !(tok %in% stop_words))
  rows <- seq_along(tokens)
  # vapply() returns a plain, unnamed character vector even for zero rows.
  data$modifiedphrase <- vapply(
    rows,
    function(i) paste(tokens[[i]][keep[[i]]], collapse = "_"),
    character(1)
  )
  data$modpos <- vapply(
    rows,
    function(i) paste(tags[[i]][keep[[i]]], collapse = " "),
    character(1)
  )
  data
}
(4) 使用 'microbenchmark' 包的微秒效率:
library(microbenchmark)
# Rebuild the data, then time the for-loop baseline (reported in microseconds).
df <- setup_data()
microbenchmark(forloop_method(), unit = "us")
Unit: microseconds
expr min lq mean median uq max neval
forloop_method 884.229 965.2805 1050.775 992.224 1032.69 2680.374 100
# Rebuild the data so the apply version starts from unmodified input.
df <- setup_data()
# Call the function: passing the bare name `apply_method` only times the
# evaluation of the function object, not the work it does. The timings
# recorded below this call were produced by that bare-name form.
microbenchmark(apply_method(), unit = "us")
Unit: microseconds
expr min lq mean median uq max neval
apply_method 0.018 0.025 0.49948 0.026 0.027 45.379 100
在我的系统上,按上面的平均值计算,加速约为 1050.775/0.49948 = 2103.738 倍。注意:apply_method 的计时是在没有加括号、未实际调用函数的情况下测得的(只测量了对函数对象求值的开销),因此这个倍数并不能反映两种方法的真实性能差距。
下面列出的代码可以成功运行,它能从文本中删除停用词以及相应的词性 [POS] 标签。但在数据量很大时,运行一次大约需要 4 个小时。 我在想,如果我通过矢量化摆脱 for 循环,它会加快速度。但我不知道这是否可行,也不知道是否有用。希望有人能帮我用更好的方法来加速这段代码。
我可以使用 tm 包删除停用词(参见 “R tm removeWords stopwords is not removing stopwords”),但我还需要删除相应的 POS 标签,而这在 tm 包中是做不到的。
注意:我已经能够使用 foreach 将最外层的 for 循环并行化,使其在 12 个内核上运行。
代码:
# Reproducible data
# id             identifies the source row
# phrase         original string, tokens joined by "_"
# pos            space-separated part-of-speech tags, one per token (fake data)
# modifiedphrase copy of phrase, later overwritten with stop words removed
# modpos         copy of pos, later overwritten with the matching tags removed
id <- c(1, 2, 3)
phrase <- c(
  "choice_for_selection",
  "accordingly_choices_for_selection",
  "only_top_selection"
)
pos <- c("NN JJ NN", "NN JJ NN NN", "NNS NN NNS")
# Build the frame directly: going through cbind() first creates a character
# matrix, which turns id into text (and, before R 4.0, every column into a
# factor that then has to be converted back with as.character()).
df <- data.frame(
  id = id,
  phrase = phrase,
  pos = pos,
  modifiedphrase = phrase,
  modpos = pos,
  stringsAsFactors = FALSE
)
# stop words list
library(tm)
SWList <- stopwords(kind = "SMART")
library(stringr)

# Remove stop words from each phrase together with the POS tag at the same
# position. Rows are independent of each other, so the outer loop can still be
# parallelized (e.g. with foreach).
# seq_len() keeps the loop from running on a zero-row data frame.
for (i in seq_len(nrow(df))) {
  tokensplit <- str_split(df[i, "phrase"], "_")[[1]]
  possplit <- str_split(df[i, "pos"], " ")[[1]]
  # Positions of all stop words in one vectorized lookup, instead of testing
  # token by token and re-pasting the tags after every hit.
  forremoval <- which(tokensplit %in% SWList)
  # Rows without stop words keep their existing modifiedphrase/modpos.
  if (length(forremoval) > 0) {
    # paste(collapse = ) yields "" when every token was removed, so no
    # separate empty-result branch is needed.
    df[i, "modifiedphrase"] <- paste(tokensplit[-forremoval], collapse = "_")
    df[i, "modpos"] <- paste(possplit[-forremoval], collapse = " ")
  }
}
# Final output
print(df)
id phrase pos modifiedphrase modpos
1 1 choice_for_selection NN JJ NN choice_selection NN NN
2 2 accordingly_choices_for_selection NN JJ NN NN choices_selection JJ NN
3 3 only_top_selection NNS NN NNS top_selection NN NNS
>
这是一个使用 apply 的版本:
# stop words list
library(tm)
SWList <- stopwords(kind = "SMART")

# Split every phrase into tokens and every POS string into tags, once per row.
tokens <- strsplit(as.character(df$phrase), "_", fixed = TRUE)
tags <- strsplit(as.character(df$pos), " ", fixed = TRUE)
# One logical mask per row, TRUE for tokens that are not stop words. The same
# mask drives both columns so phrase and tags stay aligned. (setdiff() would
# also drop repeated tokens, leaving fewer tokens than tags.)
keep <- lapply(tokens, function(tok) !(tok %in% SWList))
rows <- seq_along(tokens)
# vapply() returns a plain, unnamed character vector even for zero rows.
df$modifiedphrase <- vapply(
  rows,
  function(i) paste(tokens[[i]][keep[[i]]], collapse = "_"),
  character(1)
)
df$modpos <- vapply(
  rows,
  function(i) paste(tags[[i]][keep[[i]]], collapse = " "),
  character(1)
)
我知道这是假数据,但您可能还想考虑删除停用词中的撇号:
# Strip apostrophes from the stop words (e.g. "don't" becomes "dont").
SWList <- gsub("'", "", SWList, fixed = TRUE)
更新
效率检查:
(1) 设置数据的函数:这样我们可以在每次效率检查前重新生成数据。
# Build a fresh copy of the example data so that each benchmark starts from
# unmodified input.
#
# Returns a data frame with columns id, phrase, pos, modifiedphrase and modpos;
# modifiedphrase and modpos start out as copies of phrase and pos.
setup_data <- function() {
  id <- c(1, 2, 3)
  phrase <- c(
    "choice_for_selection",
    "accordingly_choices_for_selection",
    "only_top_selection"
  )
  pos <- c("NN JJ NN", "NN JJ NN NN", "NNS NN NNS") # fake part of speech
  # Build the frame directly: going through cbind() first creates a character
  # matrix, which turns id into text (and, before R 4.0, every column into a
  # factor that then has to be converted back with as.character()).
  data.frame(
    id = id,
    phrase = phrase,
    pos = pos,
    modifiedphrase = phrase,
    modpos = pos,
    stringsAsFactors = FALSE
  )
}
(2) 原始的For-loop方法:
# Baseline for the benchmark: the original nested for-loop implementation.
# The per-token loop is kept on purpose so the timing still reflects the
# original algorithm. Needs stringr to be loaded for str_split().
#
# data       data frame with character columns phrase, pos, modifiedphrase and
#            modpos; defaults to the `df` visible where this function is
#            defined.
# stop_words character vector of stop words; defaults to `SWList`.
#
# Returns the data frame with modifiedphrase/modpos updated. The function works
# on its own copy, so the caller's data frame is not changed. Earlier the
# result was dropped and the function returned NULL, which made the output
# impossible to check.
forloop_method <- function(data = df, stop_words = SWList) {
  # seq_len() keeps the loop from running on a zero-row data frame.
  for (i in seq_len(nrow(data))) {
    tokensplit <- str_split(data[i, "phrase"], "_")[[1]]
    possplit <- str_split(data[i, "pos"], " ")[[1]]
    change <- 0
    forremoval <- NULL
    for (j in seq_along(tokensplit)) {
      if (tokensplit[j] %in% stop_words) {
        change <- 1
        forremoval <- append(forremoval, j)
        tmppos <- paste(possplit[-forremoval], collapse = " ")
      }
    }
    # Rows without stop words keep their existing modifiedphrase/modpos.
    if (change == 1) {
      # paste(collapse = ) already gives "" when every token was removed.
      data[i, "modifiedphrase"] <- paste(
        tokensplit[-forremoval],
        collapse = "_"
      )
      data[i, "modpos"] <- tmppos
    }
  }
  data
}
(3) apply 方法:
# Remove stop words and the POS tags at the same positions without an explicit
# per-token loop.
#
# data       data frame with columns phrase ("_"-separated tokens) and pos
#            (space-separated tags); defaults to the `df` visible where this
#            function is defined.
# stop_words character vector of stop words; defaults to `SWList`.
#
# Returns the data frame with modifiedphrase and modpos filled in. The function
# works on its own copy, so the caller's data frame is not changed.
apply_method <- function(data = df, stop_words = SWList) {
  tokens <- strsplit(as.character(data$phrase), "_", fixed = TRUE)
  tags <- strsplit(as.character(data$pos), " ", fixed = TRUE)
  # One logical mask per row, TRUE for tokens that are not stop words. The same
  # mask drives both columns so phrase and tags stay aligned. (setdiff() would
  # also drop repeated tokens, leaving fewer tokens than tags.)
  keep <- lapply(tokens, function(tok) !(tok %in% stop_words))
  rows <- seq_along(tokens)
  # vapply() returns a plain, unnamed character vector even for zero rows.
  data$modifiedphrase <- vapply(
    rows,
    function(i) paste(tokens[[i]][keep[[i]]], collapse = "_"),
    character(1)
  )
  data$modpos <- vapply(
    rows,
    function(i) paste(tags[[i]][keep[[i]]], collapse = " "),
    character(1)
  )
  data
}
(4) 使用 'microbenchmark' 包的微秒效率:
library(microbenchmark)
# Rebuild the data, then time the for-loop baseline (reported in microseconds).
df <- setup_data()
microbenchmark(forloop_method(), unit = "us")
Unit: microseconds
expr min lq mean median uq max neval
forloop_method 884.229 965.2805 1050.775 992.224 1032.69 2680.374 100
# Rebuild the data so the apply version starts from unmodified input.
df <- setup_data()
# Call the function: passing the bare name `apply_method` only times the
# evaluation of the function object, not the work it does. The timings
# recorded below this call were produced by that bare-name form.
microbenchmark(apply_method(), unit = "us")
Unit: microseconds
expr min lq mean median uq max neval
apply_method 0.018 0.025 0.49948 0.026 0.027 45.379 100
在我的系统上,按上面的平均值计算,加速约为 1050.775/0.49948 = 2103.738 倍。注意:apply_method 的计时是在没有加括号、未实际调用函数的情况下测得的(只测量了对函数对象求值的开销),因此这个倍数并不能反映两种方法的真实性能差距。