R:将相关词与一个或多个词可视化
R : Visualize correlated words against one or more words
我有一个包含 11 个文本文档的语料库。我使用以下命令找到了单词关联:
findAssocs(dtm, c("youngster","campaign"), corlimit=0.9)
findAssocs(dtms, "corruption", corlimit=0.9)
dtm 是文档术语矩阵。
dtm <- DocumentTermMatrix(docs)
其中 docs 是语料库。
dtms 是移除稀疏词项后的文档-词项矩阵(阈值为 0.1,即只保留稀疏度低于 10% 的词项)。
dtms <- removeSparseTerms(dtm, 0.1)
我想绘制针对 (i) 2 个特定单词和 (ii) 1 个特定单词的相关术语
我尝试按照这篇帖子的做法:Plot highly correlated words against a specific word of interest
toi <- "corruption" # term of interest
corlimit <- 0.9 # lower correlation bound limit.
# NOTE(review): the [, 1] and row.names() calls below treat the findAssocs()
# result as a two-dimensional object; the "incorrect number of dimensions"
# error quoted in this post is raised by that [, 1] indexing.
cor_0.9 <- data.frame(corr = findAssocs(dtm, toi, corlimit)[,1],terms=row.names(findAssocs(dtm, toi, corlimit)))
但不幸的是代码:
cor_0.9 <- data.frame(corr = findAssocs(dtm, toi, corlimit)[,1],terms=row.names(findAssocs(dtm, toi, corlimit)))
给我一个错误:
Error in findAssocs(dtm, toi, corlimit)[, 1]:incorrect number of dimensions
这是文档术语矩阵的结构:
dtm
<<DocumentTermMatrix (documents: 11, terms: 1847)>>
Non-/sparse entries: 8024/12293
Sparsity : 61%
Maximal term length: 23
Weighting : term frequency (tf)
在环境中它的形式是:
dtm List of 6
i: int [1:8024] 1 1 1 1 1 ...
j: int [1:8024] 17 29 34 43 47 ...
v: num [1:8024] 9 4 9 5 5 ...
nrow : int 11
ncol : int 1847
dimnames: list of 2
...$ Docs : chr [1:11] "character (0)" "character (0)" "character (0)"
...$ Terms: chr [1:1847] "campaigning"|__truncated__"a"|__"truncated"__
attr(*,"class") = chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"...
attr(*,"weighting") = chr [1:2] "term frequency" "tf"
如何绘制单个词和多个词的词相关性?请帮忙
这是 findAssocs(dtm, c("youngster","campaign"), corlimit=0.9) 的输出:
$youngster
character colleges controversi expect corrupt much
1.00 1.00 1.00 1.00 0.99 0.99
okay saritha existing leads satisfi social
0.99 0.99 0.98 0.98 0.98 0.98
$campaign
basic make lack internal general method satisfied time
0.95 0.95 0.94 0.93 0.92 0.92 0.92 0.92
两个词需要稍微不同的方法,这里是一个快速尝试:
# Plot the terms correlated with two terms of interest, using the `crude`
# example corpus that ships with tm.
#
# library() is used instead of require() so that a missing package stops the
# script immediately instead of failing later with an unrelated error.
library(tm)
library(dplyr)
library(tidyr)
library(ggplot2)

data("crude")
tdm <- TermDocumentMatrix(crude)

# Compute correlations and store them in data frames ----
toi1 <- "oil"    # first term of interest
toi2 <- "winter" # second term of interest
corlimit <- 0.7  # lower correlation bound limit

# findAssocs() returns a list holding one named numeric vector per requested
# term; [[1]] extracts the vector for the single term passed in.
assocs1 <- findAssocs(tdm, toi1, corlimit)[[1]]
assocs2 <- findAssocs(tdm, toi2, corlimit)[[1]]

# Build the data frames directly from the names and values. Parsing the names
# with read.table() would mis-handle terms containing quotes or "#", would
# type-convert terms such as "15.8" or "NA", and would fail outright when no
# term passes the correlation limit.
corr1 <- data.frame(
  V1 = as.character(names(assocs1)),
  corr1 = as.numeric(assocs1),
  stringsAsFactors = FALSE
)
corr2 <- data.frame(
  V1 = as.character(names(assocs2)),
  corr2 = as.numeric(assocs2),
  stringsAsFactors = FALSE
)

# Join them together on the associated term; terms correlated with only one
# of the two terms of interest get NA in the other correlation column.
two_terms_corrs <- full_join(corr1, corr2, by = "V1")

# Reshape to long format for plotting: one row per (associated term, term of
# interest) pair. pivot_longer() replaces the superseded gather().
two_terms_corrs_gathered <- pivot_longer(
  two_terms_corrs,
  cols = all_of(c("corr1", "corr2")),
  names_to = "term",
  values_to = "correlation"
)

# Insert the actual terms of interest so they show up on the legend.
two_terms_corrs_gathered$term <- ifelse(
  two_terms_corrs_gathered$term == "corr1", toi1, toi2
)

# Draw the plot ----
ggplot(two_terms_corrs_gathered, aes(x = V1, y = correlation, colour = term)) +
  geom_point(size = 3) +
  ylab(paste0("Correlation with the terms ", "\"", toi1, "\"", " and ", "\"", toi2, "\"")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
我有一个包含 11 个文本文档的语料库。我使用以下命令找到了单词关联:
findAssocs(dtm, c("youngster","campaign"), corlimit=0.9)
findAssocs(dtms, "corruption", corlimit=0.9)
dtm 是文档术语矩阵。
dtm <- DocumentTermMatrix(docs)
其中 docs 是语料库。
dtms 是移除稀疏词项后的文档-词项矩阵(阈值为 0.1,即只保留稀疏度低于 10% 的词项)。
dtms <- removeSparseTerms(dtm, 0.1)
我想绘制针对 (i) 2 个特定单词和 (ii) 1 个特定单词的相关术语。我尝试按照这篇帖子的做法:Plot highly correlated words against a specific word of interest
toi <- "corruption" # term of interest
corlimit <- 0.9 # lower correlation bound limit.
# NOTE(review): the [, 1] and row.names() calls below treat the findAssocs()
# result as a two-dimensional object; the "incorrect number of dimensions"
# error quoted in this post is raised by that [, 1] indexing.
cor_0.9 <- data.frame(corr = findAssocs(dtm, toi, corlimit)[,1],terms=row.names(findAssocs(dtm, toi, corlimit)))
但不幸的是代码:
cor_0.9 <- data.frame(corr = findAssocs(dtm, toi, corlimit)[,1],terms=row.names(findAssocs(dtm, toi, corlimit)))
给我一个错误:
Error in findAssocs(dtm, toi, corlimit)[, 1]:incorrect number of dimensions
这是文档术语矩阵的结构:
dtm
<<DocumentTermMatrix (documents: 11, terms: 1847)>>
Non-/sparse entries: 8024/12293
Sparsity : 61%
Maximal term length: 23
Weighting : term frequency (tf)
在环境中它的形式是:
dtm List of 6
i: int [1:8024] 1 1 1 1 1 ...
j: int [1:8024] 17 29 34 43 47 ...
v: num [1:8024] 9 4 9 5 5 ...
nrow : int 11
ncol : int 1847
dimnames: list of 2
...$ Docs : chr [1:11] "character (0)" "character (0)" "character (0)"
...$ Terms: chr [1:1847] "campaigning"|__truncated__"a"|__"truncated"__
attr(*,"class") = chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"...
attr(*,"weighting") = chr [1:2] "term frequency" "tf"
如何绘制单个词和多个词的词相关性?请帮忙
这是 findAssocs(dtm, c("youngster","campaign"), corlimit=0.9) 的输出:
$youngster
character colleges controversi expect corrupt much
1.00 1.00 1.00 1.00 0.99 0.99
okay saritha existing leads satisfi social
0.99 0.99 0.98 0.98 0.98 0.98
$campaign
basic make lack internal general method satisfied time
0.95 0.95 0.94 0.93 0.92 0.92 0.92 0.92
两个词需要稍微不同的方法,这里是一个快速尝试:
# Plot the terms correlated with two terms of interest, using the `crude`
# example corpus that ships with tm.
#
# library() is used instead of require() so that a missing package stops the
# script immediately instead of failing later with an unrelated error.
library(tm)
library(dplyr)
library(tidyr)
library(ggplot2)

data("crude")
tdm <- TermDocumentMatrix(crude)

# Compute correlations and store them in data frames ----
toi1 <- "oil"    # first term of interest
toi2 <- "winter" # second term of interest
corlimit <- 0.7  # lower correlation bound limit

# findAssocs() returns a list holding one named numeric vector per requested
# term; [[1]] extracts the vector for the single term passed in.
assocs1 <- findAssocs(tdm, toi1, corlimit)[[1]]
assocs2 <- findAssocs(tdm, toi2, corlimit)[[1]]

# Build the data frames directly from the names and values. Parsing the names
# with read.table() would mis-handle terms containing quotes or "#", would
# type-convert terms such as "15.8" or "NA", and would fail outright when no
# term passes the correlation limit.
corr1 <- data.frame(
  V1 = as.character(names(assocs1)),
  corr1 = as.numeric(assocs1),
  stringsAsFactors = FALSE
)
corr2 <- data.frame(
  V1 = as.character(names(assocs2)),
  corr2 = as.numeric(assocs2),
  stringsAsFactors = FALSE
)

# Join them together on the associated term; terms correlated with only one
# of the two terms of interest get NA in the other correlation column.
two_terms_corrs <- full_join(corr1, corr2, by = "V1")

# Reshape to long format for plotting: one row per (associated term, term of
# interest) pair. pivot_longer() replaces the superseded gather().
two_terms_corrs_gathered <- pivot_longer(
  two_terms_corrs,
  cols = all_of(c("corr1", "corr2")),
  names_to = "term",
  values_to = "correlation"
)

# Insert the actual terms of interest so they show up on the legend.
two_terms_corrs_gathered$term <- ifelse(
  two_terms_corrs_gathered$term == "corr1", toi1, toi2
)

# Draw the plot ----
ggplot(two_terms_corrs_gathered, aes(x = V1, y = correlation, colour = term)) +
  geom_point(size = 3) +
  ylab(paste0("Correlation with the terms ", "\"", toi1, "\"", " and ", "\"", toi2, "\"")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))