标记 R 中特定单词的词性
Tagging part of speech for a particular word in R
我的 A 列是句子,B 列是一些单词。我想查出 B 列中的每个单词在 A 列对应句子里的词性。
目前我可以使用以下代码获取单个句子的词性:
我想为文本文件中的每个句子都得到对应的词性标注。请给出实现这一点的代码建议。
# Break every element of posText at newline characters and flatten the
# pieces into a single character vector.
s <- unlist(lapply(posText, str_split, pattern = "\n"))
# Tag the tokens of a piece of text with their part-of-speech labels.
#
# Args:
#   x:   Text to tag; it is coerced with as.String() before annotation.
#   ...: Accepted for call compatibility; not used by the body.
#
# Returns:
#   A list with two elements:
#     POStagged - one string of "token/TAG" pairs separated by spaces.
#     POStags   - the tags alone, in the order of the word annotations.
tagPOS <- function(x, ...) {
  text <- as.String(x)
  # One "sentence" annotation covering characters 1..nchar(text) is the
  # starting point for the word tokenizer.
  whole_span <- Annotation(1L, "sentence", 1L, nchar(text))
  tokens <- annotate(text, Maxent_Word_Token_Annotator(), whole_span)
  tagged <- annotate(text, Maxent_POS_Tag_Annotator(), tokens)
  # Keep only the word-level annotations; those carry the POS feature.
  words <- tagged[tagged$type == "word"]
  POStags <- unlist(lapply(words$features, function(f) f[["POS"]]))
  pairs <- sprintf("%s/%s", text[words], POStags)
  list(POStagged = paste(pairs, collapse = " "), POStags = POStags)
}
# tagPOS() is called once with all of `s`, not once per element, so the
# result is a single list(POStagged, POStags) for the whole input.
tagged_str <- tagPOS(s)
使用lapply您可以标记多个句子。由于您没有提供可重现的数据,我创建了自己的数据。
代码
# Attach the packages this script relies on.
library(stringr)
library(NLP)
library(openNLP)

# Reproducible data - quotes from Wuthering Heights by Emily Bronte.
# The two sentences sit on separate lines inside one string.
posText <- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."

# Split into sentences at each newline.
s <- unlist(lapply(posText, str_split, pattern = "\n"))
# Tag the tokens of a piece of text with their part-of-speech labels.
#
# Args:
#   x:   Text to tag; it is coerced with as.String() before annotation.
#   ...: Accepted for call compatibility; not used by the body.
#
# Returns:
#   A list with two elements:
#     POStagged - one string of "token/TAG" pairs separated by spaces.
#     POStags   - the tags alone, in the order of the word annotations.
tagPOS <- function(x, ...) {
  text <- as.String(x)
  # One "sentence" annotation covering characters 1..nchar(text) is the
  # starting point for the word tokenizer.
  whole_span <- Annotation(1L, "sentence", 1L, nchar(text))
  tokens <- annotate(text, Maxent_Word_Token_Annotator(), whole_span)
  tagged <- annotate(text, Maxent_POS_Tag_Annotator(), tokens)
  # Keep only the word-level annotations; those carry the POS feature.
  words <- tagged[tagged$type == "word"]
  POStags <- unlist(lapply(words$features, function(f) f[["POS"]]))
  pairs <- sprintf("%s/%s", text[words], POStags)
  list(POStagged = paste(pairs, collapse = " "), POStags = POStags)
}
# Tag each element of `s` separately, stack the per-sentence lists row-wise
# with rbind, and convert the stacked result to a data frame.
result <- as.data.frame(do.call(rbind, lapply(s, tagPOS)))
输出创建了一个包含两列的数据框,其中一列是包含单词的句子,标签由“/”分隔。第二列是按照句子中出现的方式排序的标签集。
输出:
> print(result)
POStagged
1 I/PRP gave/VBD him/PRP my/PRP$ heart/NN ,/, and/CC he/PRP took/VBD and/CC pinched/VBD it/PRP to/TO death/NN ;/: and/CC flung/VBD it/PRP back/RB to/TO me/PRP ./.
2 People/NNS feel/VBP with/IN their/PRP$ hearts/NNS ,/, Ellen/NNP ,/, and/CC since/IN he/PRP has/VBZ destroyed/VBN mine/NN ,/, I/PRP have/VBP not/RB power/NN to/TO feel/VB for/IN him/PRP ./.
POStags
1 PRP, VBD, PRP, PRP$, NN, ,, CC, PRP, VBD, CC, VBD, PRP, TO, NN, :, CC, VBD, PRP, RB, TO, PRP, .
2 NNS, VBP, IN, PRP$, NNS, ,, NNP, ,, CC, IN, PRP, VBZ, VBN, NN, ,, PRP, VBP, RB, NN, TO, VB, IN, PRP, .
>
我维护的 tagger 包在这里可能有用,可以让生活更轻松。它有一些类似于 Python 的行为,我将在下面进行演示:
数据
# Example text: two sentences, one per line, held in a single string.
posText<- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."
获取包
# Install pacman only when it is missing. requireNamespace() checks
# availability without attaching the package, which is enough here because
# the only pacman call below is namespace-qualified.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Hand the three trinker repositories to pacman::p_load_gh().
pacman::p_load_gh(c(
  "trinker/termco",
  "trinker/tagger",
  "trinker/textshape"
))
标记
# Pass posText through split_sentence(), take the first element of what it
# returns, and tag that with tag_pos(). The bare `tagged` line prints it.
tagged <- tag_pos(split_sentence(posText)[[1]])
tagged
## [1] "I/PRP gave/VBD him/PRP my/PRP$ heart/NN ,/, and/CC he/PRP took/VBD and/CC pinched/VBD it/PRP to/TO death/NN ;/: and/CC flung/VBD it/PRP back/RB to/TO me/PRP ./."
## [2] "People/NNS feel/VBP with/IN their/PRP$ hearts/NNS ,/, Ellen/NNP ,/, and/CC since/IN he/PRP has/VBZ destroyed/VBN mine/NN ,/, I/PRP have/VBP not/RB power/NN to/TO feel/VB for/IN him/PRP ./."
实际输出是什么(pos 命名向量列表)
# c() shows the underlying form of `tagged`: as the printed output shows,
# a list of character vectors whose names are the POS tags.
c(tagged)
## [[1]]
## PRP VBD PRP PRP$ NN , CC PRP
## "I" "gave" "him" "my" "heart" "," "and" "he"
## VBD CC VBD PRP TO NN : CC
## "took" "and" "pinched" "it" "to" "death" ";" "and"
## VBD PRP RB TO PRP .
## "flung" "it" "back" "to" "me" "."
##
## [[2]]
## NNS VBP IN PRP$ NNS ,
## "People" "feel" "with" "their" "hearts" ","
## NNP , CC IN PRP VBZ
## "Ellen" "," "and" "since" "he" "has"
## VBN NN , PRP VBP RB
## "destroyed" "mine" "," "I" "have" "not"
## NN TO VB IN PRP .
## "power" "to" "feel" "for" "him" "."
选择标签(正则表达式也可用)
# Keep only the tokens tagged with one of the four listed noun tags.
select_tags(tagged, c("NN", "NNP", "NNPS", "NNS"))
## [1] "heart/NN death/NN"
## [2] "People/NNS hearts/NNS Ellen/NNP mine/NN power/NN"
基本 POS 类型
# Show the tags as basic categories (pronoun, verb, noun, ...), as seen in
# the printed output.
as_basic(tagged)
## [1] "I/pronoun gave/verb him/pronoun my/pronoun heart/noun ,/. and/conjunction he/pronoun took/verb and/conjunction pinched/verb it/pronoun to/preposition death/noun ;/. and/conjunction flung/verb it/pronoun back/adverb to/preposition me/pronoun ./."
## [2] "People/noun feel/verb with/preposition their/pronoun hearts/noun ,/. Ellen/noun ,/. and/conjunction since/preposition he/pronoun has/verb destroyed/verb mine/noun ,/. I/pronoun have/verb not/adverb power/noun to/preposition feel/verb for/preposition him/pronoun ./."
获取 POS 计数
# Tabulate how often each tag occurs per sentence; pretty = FALSE is passed
# explicitly (its effect on formatting is defined by the tagger package).
count_tags(tagged, pretty = FALSE)
## n.tokens , . : CC IN NN NNP NNS PRP PRP$ RB TO VB VBD VBN VBP VBZ id
## 1: 22 1 1 1 3 0 2 0 0 6 1 1 2 0 4 0 0 0 1
## 2: 24 3 1 0 1 3 2 1 2 3 1 1 1 1 0 1 2 1 2
仅获取 POS 标签
# The tags are stored as the names of each element, so names() pulls out
# just the tags for every sentence.
lapply(tagged, names)
## [[1]]
## [1] "PRP" "VBD" "PRP" "PRP$" "NN" "," "CC" "PRP" "VBD" "CC"
## [11] "VBD" "PRP" "TO" "NN" ":" "CC" "VBD" "PRP" "RB" "TO"
## [21] "PRP" "."
##
## [[2]]
## [1] "NNS" "VBP" "IN" "PRP$" "NNS" "," "NNP" "," "CC" "IN"
## [11] "PRP" "VBZ" "VBN" "NN" "," "PRP" "VBP" "RB" "NN" "TO"
## [21] "VB" "IN" "PRP" "."
我的 A 列是句子,B 列是一些单词。我想查出 B 列中的每个单词在 A 列对应句子里的词性。
目前我可以使用以下代码获取单个句子的词性:
我想为文本文件中的每个句子都得到对应的词性标注。请给出实现这一点的代码建议。
# Break every element of posText at newline characters and flatten the
# pieces into a single character vector.
s <- unlist(lapply(posText, str_split, pattern = "\n"))
# Tag the tokens of a piece of text with their part-of-speech labels.
#
# Args:
#   x:   Text to tag; it is coerced with as.String() before annotation.
#   ...: Accepted for call compatibility; not used by the body.
#
# Returns:
#   A list with two elements:
#     POStagged - one string of "token/TAG" pairs separated by spaces.
#     POStags   - the tags alone, in the order of the word annotations.
tagPOS <- function(x, ...) {
  text <- as.String(x)
  # One "sentence" annotation covering characters 1..nchar(text) is the
  # starting point for the word tokenizer.
  whole_span <- Annotation(1L, "sentence", 1L, nchar(text))
  tokens <- annotate(text, Maxent_Word_Token_Annotator(), whole_span)
  tagged <- annotate(text, Maxent_POS_Tag_Annotator(), tokens)
  # Keep only the word-level annotations; those carry the POS feature.
  words <- tagged[tagged$type == "word"]
  POStags <- unlist(lapply(words$features, function(f) f[["POS"]]))
  pairs <- sprintf("%s/%s", text[words], POStags)
  list(POStagged = paste(pairs, collapse = " "), POStags = POStags)
}
# tagPOS() is called once with all of `s`, not once per element, so the
# result is a single list(POStagged, POStags) for the whole input.
tagged_str <- tagPOS(s)
使用lapply您可以标记多个句子。由于您没有提供可重现的数据,我创建了自己的数据。
代码
# Attach the packages this script relies on.
library(stringr)
library(NLP)
library(openNLP)

# Reproducible data - quotes from Wuthering Heights by Emily Bronte.
# The two sentences sit on separate lines inside one string.
posText <- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."

# Split into sentences at each newline.
s <- unlist(lapply(posText, str_split, pattern = "\n"))
# Tag the tokens of a piece of text with their part-of-speech labels.
#
# Args:
#   x:   Text to tag; it is coerced with as.String() before annotation.
#   ...: Accepted for call compatibility; not used by the body.
#
# Returns:
#   A list with two elements:
#     POStagged - one string of "token/TAG" pairs separated by spaces.
#     POStags   - the tags alone, in the order of the word annotations.
tagPOS <- function(x, ...) {
  text <- as.String(x)
  # One "sentence" annotation covering characters 1..nchar(text) is the
  # starting point for the word tokenizer.
  whole_span <- Annotation(1L, "sentence", 1L, nchar(text))
  tokens <- annotate(text, Maxent_Word_Token_Annotator(), whole_span)
  tagged <- annotate(text, Maxent_POS_Tag_Annotator(), tokens)
  # Keep only the word-level annotations; those carry the POS feature.
  words <- tagged[tagged$type == "word"]
  POStags <- unlist(lapply(words$features, function(f) f[["POS"]]))
  pairs <- sprintf("%s/%s", text[words], POStags)
  list(POStagged = paste(pairs, collapse = " "), POStags = POStags)
}
# Tag each element of `s` separately, stack the per-sentence lists row-wise
# with rbind, and convert the stacked result to a data frame.
result <- as.data.frame(do.call(rbind, lapply(s, tagPOS)))
输出创建了一个包含两列的数据框,其中一列是包含单词的句子,标签由“/”分隔。第二列是按照句子中出现的方式排序的标签集。
输出:
> print(result)
POStagged
1 I/PRP gave/VBD him/PRP my/PRP$ heart/NN ,/, and/CC he/PRP took/VBD and/CC pinched/VBD it/PRP to/TO death/NN ;/: and/CC flung/VBD it/PRP back/RB to/TO me/PRP ./.
2 People/NNS feel/VBP with/IN their/PRP$ hearts/NNS ,/, Ellen/NNP ,/, and/CC since/IN he/PRP has/VBZ destroyed/VBN mine/NN ,/, I/PRP have/VBP not/RB power/NN to/TO feel/VB for/IN him/PRP ./.
POStags
1 PRP, VBD, PRP, PRP$, NN, ,, CC, PRP, VBD, CC, VBD, PRP, TO, NN, :, CC, VBD, PRP, RB, TO, PRP, .
2 NNS, VBP, IN, PRP$, NNS, ,, NNP, ,, CC, IN, PRP, VBZ, VBN, NN, ,, PRP, VBP, RB, NN, TO, VB, IN, PRP, .
>
我维护的 tagger 包在这里可能有用,可以让生活更轻松。它有一些类似于 Python 的行为,我将在下面进行演示:
数据
# Example text: two sentences, one per line, held in a single string.
posText<- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."
获取包
# Install pacman only when it is missing. requireNamespace() checks
# availability without attaching the package, which is enough here because
# the only pacman call below is namespace-qualified.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Hand the three trinker repositories to pacman::p_load_gh().
pacman::p_load_gh(c(
  "trinker/termco",
  "trinker/tagger",
  "trinker/textshape"
))
标记
# Pass posText through split_sentence(), take the first element of what it
# returns, and tag that with tag_pos(). The bare `tagged` line prints it.
tagged <- tag_pos(split_sentence(posText)[[1]])
tagged
## [1] "I/PRP gave/VBD him/PRP my/PRP$ heart/NN ,/, and/CC he/PRP took/VBD and/CC pinched/VBD it/PRP to/TO death/NN ;/: and/CC flung/VBD it/PRP back/RB to/TO me/PRP ./."
## [2] "People/NNS feel/VBP with/IN their/PRP$ hearts/NNS ,/, Ellen/NNP ,/, and/CC since/IN he/PRP has/VBZ destroyed/VBN mine/NN ,/, I/PRP have/VBP not/RB power/NN to/TO feel/VB for/IN him/PRP ./."
实际输出是什么(pos 命名向量列表)
# c() shows the underlying form of `tagged`: as the printed output shows,
# a list of character vectors whose names are the POS tags.
c(tagged)
## [[1]]
## PRP VBD PRP PRP$ NN , CC PRP
## "I" "gave" "him" "my" "heart" "," "and" "he"
## VBD CC VBD PRP TO NN : CC
## "took" "and" "pinched" "it" "to" "death" ";" "and"
## VBD PRP RB TO PRP .
## "flung" "it" "back" "to" "me" "."
##
## [[2]]
## NNS VBP IN PRP$ NNS ,
## "People" "feel" "with" "their" "hearts" ","
## NNP , CC IN PRP VBZ
## "Ellen" "," "and" "since" "he" "has"
## VBN NN , PRP VBP RB
## "destroyed" "mine" "," "I" "have" "not"
## NN TO VB IN PRP .
## "power" "to" "feel" "for" "him" "."
选择标签(正则表达式也可用)
# Keep only the tokens tagged with one of the four listed noun tags.
select_tags(tagged, c("NN", "NNP", "NNPS", "NNS"))
## [1] "heart/NN death/NN"
## [2] "People/NNS hearts/NNS Ellen/NNP mine/NN power/NN"
基本 POS 类型
# Show the tags as basic categories (pronoun, verb, noun, ...), as seen in
# the printed output.
as_basic(tagged)
## [1] "I/pronoun gave/verb him/pronoun my/pronoun heart/noun ,/. and/conjunction he/pronoun took/verb and/conjunction pinched/verb it/pronoun to/preposition death/noun ;/. and/conjunction flung/verb it/pronoun back/adverb to/preposition me/pronoun ./."
## [2] "People/noun feel/verb with/preposition their/pronoun hearts/noun ,/. Ellen/noun ,/. and/conjunction since/preposition he/pronoun has/verb destroyed/verb mine/noun ,/. I/pronoun have/verb not/adverb power/noun to/preposition feel/verb for/preposition him/pronoun ./."
获取 POS 计数
# Tabulate how often each tag occurs per sentence; pretty = FALSE is passed
# explicitly (its effect on formatting is defined by the tagger package).
count_tags(tagged, pretty = FALSE)
## n.tokens , . : CC IN NN NNP NNS PRP PRP$ RB TO VB VBD VBN VBP VBZ id
## 1: 22 1 1 1 3 0 2 0 0 6 1 1 2 0 4 0 0 0 1
## 2: 24 3 1 0 1 3 2 1 2 3 1 1 1 1 0 1 2 1 2
仅获取 POS 标签
# The tags are stored as the names of each element, so names() pulls out
# just the tags for every sentence.
lapply(tagged, names)
## [[1]]
## [1] "PRP" "VBD" "PRP" "PRP$" "NN" "," "CC" "PRP" "VBD" "CC"
## [11] "VBD" "PRP" "TO" "NN" ":" "CC" "VBD" "PRP" "RB" "TO"
## [21] "PRP" "."
##
## [[2]]
## [1] "NNS" "VBP" "IN" "PRP$" "NNS" "," "NNP" "," "CC" "IN"
## [11] "PRP" "VBZ" "VBN" "NN" "," "PRP" "VBP" "RB" "NN" "TO"
## [21] "VB" "IN" "PRP" "."