Detect word pairs after simple frequency filtering
After completing these steps:
library(quanteda)
df <- data.frame(text = c("only a small text","only a small text","only a small text","only a small text","only a small text","only a small text","remove this word lower frequency"))

tdfm <- df$text %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  dfm()

dfm_keep(tdfm, pattern = featnames(tdfm)[docfreq(tdfm) > 5])
How can I find word pairs or phrases (ngram = 2:3) that occur in more than 5 documents?
As with the previous question, you just extend what you are looking for:
tdfm <- df$text %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  # 2- and 3-grams
  tokens_ngrams(n = 2:3) %>%
  dfm()

dfm_keep(tdfm, pattern = featnames(tdfm)[docfreq(tdfm) > 5])
Document-feature matrix of: 7 documents, 5 features (14.3% sparse).
       features
docs    only_a a_small small_text only_a_small a_small_text
  text1      1       1          1            1            1
  text2      1       1          1            1            1
  text3      1       1          1            1            1
  text4      1       1          1            1            1
  text5      1       1          1            1            1
  text6      1       1          1            1            1
  text7      0       0          0            0            0
The ngrams need to be constructed before converting to a dfm, because word order is lost once the text is in a dfm.
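For example, a minimal sketch (assuming quanteda is loaded as above) showing that tokens still carry word order while a dfm only stores per-feature counts:

toks <- tokens("only a small text")
tokens_ngrams(toks, n = 2) # "only_a" "a_small" "small_text" -- order is still available
featnames(dfm(toks))       # "only" "a" "small" "text" -- the dfm keeps only counts, not positions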
With that in mind, the clean quanteda way is:
library(quanteda)
df <- data.frame(text = c("only a small text","only a small text","only a small text","only a small text","only a small text","only a small text","remove this word lower frequency"))

tdfm <- df %>%
  corpus() %>% # when you have a data.frame it usually makes sense to construct a corpus first to retain the other columns as meta-data
  tokens(remove_punct = TRUE,
         remove_numbers = TRUE) %>%
  tokens_ngrams(n = 2:3) %>%    # construct ngrams
  dfm() %>%                     # convert to dfm
  dfm_trim(min_docfreq = 5)     # select ngrams that appear in at least 5 documents

tdfm
#> Document-feature matrix of: 7 documents, 5 features (14.3% sparse).
#>        features
#> docs    only_a a_small small_text only_a_small a_small_text
#>   text1      1       1          1            1            1
#>   text2      1       1          1            1            1
#>   text3      1       1          1            1            1
#>   text4      1       1          1            1            1
#>   text5      1       1          1            1            1
#>   text6      1       1          1            1            1
#> [ reached max_ndoc ... 1 more document ]
Created on 2020-07-22 by the reprex package (v0.3.0)
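If you also want to see how many documents each surviving ngram occurs in, the document frequencies can be inspected directly. A small sketch, reusing docfreq() from above together with quanteda's topfeatures():

docfreq(tdfm)     # named vector: number of documents each ngram occurs in
topfeatures(tdfm) # the most frequent ngrams by total count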
Update based on the comments
If you only want to build ngrams from words that appear in at least 4 documents, I think it makes most sense to first construct a dfm without ngrams, trim it to the terms that appear in at least 4 documents, and use that dfm to subset the tokens before constructing the ngrams (since there is no tokens_trim function):
# first construct a dfm without ngrams and trim it by document frequency
dfm_onegram <- df %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>% # tokenise the same way as below
  dfm() %>%
  dfm_trim(min_docfreq = 4)

# then keep only those tokens before building the ngrams
dfm_ngram <- df %>%
  corpus() %>%
  tokens(remove_punct = TRUE,
         remove_numbers = TRUE) %>%
  tokens_keep(featnames(dfm_onegram)) %>% # keep only tokens that appear in at least 4 docs (the features of dfm_onegram)
  tokens_ngrams(n = 2:3) %>%
  dfm() %>%
  dfm_trim(min_docfreq = 5)
Keep in mind that infrequent words are now ignored inside the ngrams. If your text were "only a rare small text", the resulting ngram would still be "only_a_small".
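To see that behaviour concretely, here is a minimal sketch (the document "only a rare small text" is hypothetical and not part of the data above); because tokens_keep() drops "rare" without leaving a pad, the trigram spans the gap:

tokens("only a rare small text") %>%
  tokens_keep(c("only", "a", "small", "text")) %>% # "rare" is removed, no placeholder is left
  tokens_ngrams(n = 3)                             # yields "only_a_small" "a_small_text"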