通过组合 R 中的单词创建语料库
Create Corpus by combining words in r
我正在尝试创建语料库,但我希望把每个文档中相邻的 2 个单词组合成一个词条,而不是得到由单个单词构成的语料库。
我正在使用以下脚本。有没有办法创建语料库 "docs",使其包含每个文档中由 2 个连续单词组合而成的词条?请指教。
library(plyr)
library(tm)
library(e1071)
# Load the training data and keep only the label and free-text columns.
# NOTE(review): setwd() ties the script to one machine; it is kept because
# code outside this view may rely on the working directory.
setwd("C:/Assignment/Assignment-Group-Prediction/IPM")
# Empty cells are read as NA. TRUE is spelled out because T can be reassigned.
training <- read.csv("Data.csv", header = TRUE, na.strings = "")
Res_Desc_Train <- subset(training, select = c("Group", "Description"))

## Step 1 : Create Document Matrix
# VCorpus is requested explicitly: in tm >= 0.7, Corpus() on a VectorSource
# builds a SimpleCorpus, for which TermDocumentMatrix() does not honour custom
# tokenizers (such as a bigram tokenizer). as.character() guards against the
# column having been read as a factor (the read.csv default before R 4.0).
docs <- VCorpus(VectorSource(as.character(Res_Desc_Train$Description)))
docs <- tm_map(docs, content_transformer(tolower))
# tm transformation that replaces every match of the regular expression
# `pattern` with a single space; used to break up potentially problematic
# symbols.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern, " ", x)
  }
)
# Delete every character that is not an ASCII letter, a digit, or a space.
removeSpecialChars <- function(x) {
  gsub(pattern = "[^a-zA-Z0-9 ]", replacement = "", x = x)
}
# Replace separator characters with a space so the words they join remain
# separate tokens. A single bracket expression covers / : ; @ ( ) , _ and -
# ("-" is placed last so it is literal). Inside brackets "(" and ")" need no
# escaping, which avoids the invalid "\(" string escape that stopped the
# script from parsing.
docs <- tm_map(docs, toSpace, "[/:;@(),_-]")
docs <- tm_map(docs, content_transformer(tolower))
# Remove stop words while apostrophes are still present, so that contractions
# in the stop word list still match the text.
docs <- tm_map(docs, removeWords, stopwords("en"))
# removeSpecialChars keeps only ASCII letters, digits and spaces, so a
# separate removePunctuation pass is not needed afterwards.
docs <- tm_map(docs, content_transformer(removeSpecialChars))
docs <- tm_map(docs, removeNumbers)
# Collapse whitespace last: the removals above leave runs of spaces behind.
docs <- tm_map(docs, stripWhitespace)
`tm` 包的 FAQ 直接回答了你的问题:
Can I use bigrams instead of single tokens in a term-document matrix?
Yes. Package NLP provides functionality to compute n-grams which can be used to construct a corresponding tokenizer. E.g.:
library("tm")
data("crude")

# Tokenizer for TermDocumentMatrix(): splits a document into words, forms
# every run of two consecutive words, and returns each run as one string
# joined by a single space.
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), n = 2)
  unlist(lapply(word_pairs, paste, collapse = " "), use.names = FALSE)
}

# Build the term-document matrix with bigrams as terms.
tdm <- TermDocumentMatrix(
  crude,
  control = list(tokenize = BigramTokenizer)
)
# Inspect the first ten documents after removeSparseTerms() with threshold 0.7.
inspect(removeSparseTerms(tdm[, seq_len(10)], 0.7))
我正在尝试创建语料库,但我希望把每个文档中相邻的 2 个单词组合成一个词条,而不是得到由单个单词构成的语料库。
我正在使用以下脚本。有没有办法创建语料库 "docs",使其包含每个文档中由 2 个连续单词组合而成的词条?请指教。
library(plyr)
library(tm)
library(e1071)
# Load the training data and keep only the label and free-text columns.
# NOTE(review): setwd() ties the script to one machine; it is kept because
# code outside this view may rely on the working directory.
setwd("C:/Assignment/Assignment-Group-Prediction/IPM")
# Empty cells are read as NA. TRUE is spelled out because T can be reassigned.
training <- read.csv("Data.csv", header = TRUE, na.strings = "")
Res_Desc_Train <- subset(training, select = c("Group", "Description"))

## Step 1 : Create Document Matrix
# VCorpus is requested explicitly: in tm >= 0.7, Corpus() on a VectorSource
# builds a SimpleCorpus, for which TermDocumentMatrix() does not honour custom
# tokenizers (such as a bigram tokenizer). as.character() guards against the
# column having been read as a factor (the read.csv default before R 4.0).
docs <- VCorpus(VectorSource(as.character(Res_Desc_Train$Description)))
docs <- tm_map(docs, content_transformer(tolower))
# tm transformation that replaces every match of the regular expression
# `pattern` with a single space; used to break up potentially problematic
# symbols.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern, " ", x)
  }
)
# Delete every character that is not an ASCII letter, a digit, or a space.
removeSpecialChars <- function(x) {
  gsub(pattern = "[^a-zA-Z0-9 ]", replacement = "", x = x)
}
# Replace separator characters with a space so the words they join remain
# separate tokens. A single bracket expression covers / : ; @ ( ) , _ and -
# ("-" is placed last so it is literal). Inside brackets "(" and ")" need no
# escaping, which avoids the invalid "\(" string escape that stopped the
# script from parsing.
docs <- tm_map(docs, toSpace, "[/:;@(),_-]")
docs <- tm_map(docs, content_transformer(tolower))
# Remove stop words while apostrophes are still present, so that contractions
# in the stop word list still match the text.
docs <- tm_map(docs, removeWords, stopwords("en"))
# removeSpecialChars keeps only ASCII letters, digits and spaces, so a
# separate removePunctuation pass is not needed afterwards.
docs <- tm_map(docs, content_transformer(removeSpecialChars))
docs <- tm_map(docs, removeNumbers)
# Collapse whitespace last: the removals above leave runs of spaces behind.
docs <- tm_map(docs, stripWhitespace)
`tm` 包的 FAQ 直接回答了你的问题:
Can I use bigrams instead of single tokens in a term-document matrix?
Yes. Package NLP provides functionality to compute n-grams which can be used to construct a corresponding tokenizer. E.g.:
library("tm")
data("crude")

# Tokenizer for TermDocumentMatrix(): splits a document into words, forms
# every run of two consecutive words, and returns each run as one string
# joined by a single space.
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), n = 2)
  unlist(lapply(word_pairs, paste, collapse = " "), use.names = FALSE)
}

# Build the term-document matrix with bigrams as terms.
tdm <- TermDocumentMatrix(
  crude,
  control = list(tokenize = BigramTokenizer)
)
# Inspect the first ten documents after removeSparseTerms() with threshold 0.7.
inspect(removeSparseTerms(tdm[, seq_len(10)], 0.7))