使用 phrasemachine (R) 从提取的短语列表中创建文档特征矩阵
Creating a document-feature matrix from list of extracted phrases after using phrasemachine (R)
应用 phrasemachine()
后,我有一个包含短语的嵌套列表。现在我想创建一个文档特征矩阵,其中第一列是文档(用户),其余列是所有特征,每个用户在单元格中的使用频率。
library(rJava)          # Java bridge required by phrasemachine's POS tagger
library(phrasemachine)  # multiword-phrase extraction
library(quanteda)       # corpus / tokens / dfm infrastructure
# Create dummy data: two documents ("users") with short example texts
id <- c(1:2)
text <- c("Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.", "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?")
test <- data.frame(id, text)
# Ensure text is character, not factor (needed on R < 4.0, where
# data.frame() defaulted to stringsAsFactors = TRUE)
test$text <- as.character(test$text)
# Build a quanteda corpus using the user ids as document names
corpus_test <- corpus(test[["text"]], docnames = test[["id"]])
tokens_test <- tokens(corpus_test)
# Extract 2- and 3-gram phrases together with their POS-tag sequences.
# Result is a nested list: one element per document, each holding a
# $phrases and a $tags character vector (shown in the output below)
phrases_test <- phrasemachine(tokens_test, minimum_ngram_length = 2, maximum_ngram_length = 3, return_phrase_vectors = TRUE, return_tag_sequences = TRUE)
phrases_test
# > phrases_test
# [[1]]
# [[1]]$phrases
# [1] "Election_day" "Election_day" "President_Obama"
#
# [[1]]$tags
# [1] "NN" "NN" "NN"
#
#
# [[2]]
# [[2]]$phrases
# [1] "Happy_Birthday" "Election_Day"
#
# [[2]]$tags
# [1] "AN" "NN"
这是我正在寻找的输出(文档特征矩阵):
# user Election_day President_Obama Happy_Birthday
# 1 2 1 0
# 2 1 0 1
我尝试使用 lapply,但由于每个用户的短语向量维度不同,这行不通。
这是我尝试过的:
library(plyr)
# Fails: laply() requires all results to have identical dimensions,
# but each user has a different number of extracted phrases
phrases_user <- laply(phrases_test, function(x) laply(x, identity)) #Error: Results must have the same dimensions.
library(dplyr)
# `[` (not `[[`) keeps each document's phrase vector wrapped inside a
# one-element named list, so the result is a list of lists
phrases_user <- lapply(phrases_test, `[`, "phrases")
在解决了如何为每个 id 取出其短语之后,我想我还需要执行以下操作:
# Broken attempt (as posted): phrases_user is an unnamed top-level list,
# so phrases_user[["phrases"]] and phrases_user[["id"]] are both NULL here
corpus_test_2 <- corpus(phrases_user[["phrases"]], docnames = phrases_user[["id"]])
dfm_test <- dfm(corpus_test_2)
有人能帮忙吗? :)
将 udpipe 与 phrasemachine 结合使用的示例
library(udpipe)
text <- c("Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.", "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?")
# Annotate: tokenisation, POS tagging, lemmatisation, dependency parsing
x <- udpipe(text, "english")
# Recode universal POS tags to phrasemachine's coarse tag alphabet
# (A = adjective, N = noun, P = preposition, D = determiner, ...)
x$tags <- as_phrasemachine(x$upos, type = "upos")
# Find keyword candidates whose tag sequence matches phrasemachine's
# simple-noun-phrase regular expression
keyw <- keywords_phrases(x$tags,
term = x$token, pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
head(keyw)
# Collapse each matched token run into a single underscore-joined term
# (e.g. "Election day" -> "Election_day")
x$term <- txt_recode_ngram(x$token,
compound = keyw$keyword,
ngram = keyw$ngram)
# Count both the recoded multiword terms and the raw tokens per document,
# then turn those frequencies into a document-term matrix
dtm <- document_term_frequencies(x, document = "doc_id", term = c("term", "token"))
dtm <- document_term_matrix(dtm)
另外请注意,您可能也会对使用依赖句法分析(dependency parsing)的输出感兴趣:可以根据 udpipe 输出中的 dep_rel 字段提取多词表达式——如果该字段的值为 fixed/flat/compound,则对应一个多词表达式。fixed/flat/compound 的定义见 http://universaldependencies.org/u/dep/index.html。
应用 phrasemachine()
后,我有一个包含短语的嵌套列表。现在我想创建一个文档特征矩阵,其中第一列是文档(用户),其余列是所有特征,每个用户在单元格中的使用频率。
library(rJava)          # Java bridge required by phrasemachine's POS tagger
library(phrasemachine)  # multiword-phrase extraction
library(quanteda)       # corpus / tokens / dfm infrastructure
# Create dummy data: two documents ("users") with short example texts
id <- c(1:2)
text <- c("Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.", "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?")
test <- data.frame(id, text)
# Ensure text is character, not factor (needed on R < 4.0, where
# data.frame() defaulted to stringsAsFactors = TRUE)
test$text <- as.character(test$text)
# Build a quanteda corpus using the user ids as document names
corpus_test <- corpus(test[["text"]], docnames = test[["id"]])
tokens_test <- tokens(corpus_test)
# Extract 2- and 3-gram phrases together with their POS-tag sequences.
# Result is a nested list: one element per document, each holding a
# $phrases and a $tags character vector (shown in the output below)
phrases_test <- phrasemachine(tokens_test, minimum_ngram_length = 2, maximum_ngram_length = 3, return_phrase_vectors = TRUE, return_tag_sequences = TRUE)
phrases_test
# > phrases_test
# [[1]]
# [[1]]$phrases
# [1] "Election_day" "Election_day" "President_Obama"
#
# [[1]]$tags
# [1] "NN" "NN" "NN"
#
#
# [[2]]
# [[2]]$phrases
# [1] "Happy_Birthday" "Election_Day"
#
# [[2]]$tags
# [1] "AN" "NN"
这是我正在寻找的输出(文档特征矩阵):
# user Election_day President_Obama Happy_Birthday
# 1 2 1 0
# 2 1 0 1
我尝试使用 lapply,但由于每个用户的短语向量维度不同,这行不通。
这是我尝试过的:
library(plyr)
# Fails: laply() requires all results to have identical dimensions,
# but each user has a different number of extracted phrases
phrases_user <- laply(phrases_test, function(x) laply(x, identity)) #Error: Results must have the same dimensions.
library(dplyr)
# `[` (not `[[`) keeps each document's phrase vector wrapped inside a
# one-element named list, so the result is a list of lists
phrases_user <- lapply(phrases_test, `[`, "phrases")
在解决了如何为每个 id 取出其短语之后,我想我还需要执行以下操作:
# Broken attempt (as posted): phrases_user is an unnamed top-level list,
# so phrases_user[["phrases"]] and phrases_user[["id"]] are both NULL here
corpus_test_2 <- corpus(phrases_user[["phrases"]], docnames = phrases_user[["id"]])
dfm_test <- dfm(corpus_test_2)
有人能帮忙吗? :)
将 udpipe 与 phrasemachine 结合使用的示例
library(udpipe)
text <- c("Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.", "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?")
# Annotate: tokenisation, POS tagging, lemmatisation, dependency parsing
x <- udpipe(text, "english")
# Recode universal POS tags to phrasemachine's coarse tag alphabet
# (A = adjective, N = noun, P = preposition, D = determiner, ...)
x$tags <- as_phrasemachine(x$upos, type = "upos")
# Find keyword candidates whose tag sequence matches phrasemachine's
# simple-noun-phrase regular expression
keyw <- keywords_phrases(x$tags,
term = x$token, pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
head(keyw)
# Collapse each matched token run into a single underscore-joined term
# (e.g. "Election day" -> "Election_day")
x$term <- txt_recode_ngram(x$token,
compound = keyw$keyword,
ngram = keyw$ngram)
# Count both the recoded multiword terms and the raw tokens per document,
# then turn those frequencies into a document-term matrix
dtm <- document_term_frequencies(x, document = "doc_id", term = c("term", "token"))
dtm <- document_term_matrix(dtm)
另外请注意,您可能也会对使用依赖句法分析(dependency parsing)的输出感兴趣:可以根据 udpipe 输出中的 dep_rel 字段提取多词表达式——如果该字段的值为 fixed/flat/compound,则对应一个多词表达式。fixed/flat/compound 的定义见 http://universaldependencies.org/u/dep/index.html。