在 R 中执行多个单词搜索的优雅方法是什么?
What is the elegant way to perform multiple word searches in R?
我正在构建一个单词搜索游戏,想找一种符合 R 习惯的写法来完成这个 reprex 的最后一步。对于 2 个单词来说这很简单,但我希望它能够处理 n 个单词。我觉得某个 *apply 函数应该能在这里派上用场,但不知为何我就是写不出来。
library(tidyverse)

# Sample word list (173,000 words in reality)
words <- data.frame(
  word = c("test", "word", "active", "angina", "endite", "endive", "engine",
           "entire", "alanine", "evening", "escape", "entered"),
  stringsAsFactors = FALSE
)
# Derive the length from the word itself: the hand-typed length vector listed
# "escape" (6 letters) as a 7-letter word.
words$word_length <- nchar(words$word)

# Find a 6 letter word with 2nd letter n and a 7 letter word with letters 4 and 6 n
find <- ".n.... ...n.n."
find_words <- unlist(str_split(find, " "))
# "\\b" is the regex word boundary. A single-backslash "\b" inside an R string
# is the backspace character, so the pattern would never match any word.
find_regex <- paste0("\\b", find_words, "\\b")

# Cross join (by = character()) of the matches for the first pattern with the
# matches for the second pattern; suffixes give columns word1 / word2.
words %>%
  filter(word_length == nchar(find_words[1])) %>%
  filter(str_detect(word, find_regex[1])) %>%
  full_join(
    words %>%
      filter(word_length == nchar(find_words[2])) %>%
      filter(str_detect(word, find_regex[2])),
    by = character(), suffix = c("1", "2")
  ) %>%
  select(word1, word2)
另一个问题是做同样的事情,但用数字代表未知字母(相同的数字代表相同的字母),因为这会显著减少匹配的数量。
# Target: a 6 letter word whose 2nd letter is n, paired with a 7 letter word
# whose 4th and 6th letters are n, where
#   - symbol 1: letters 1 and 6 of word one and letters 1 and 3 of word two
#     are all the same letter
#   - symbol 3: letter 4 of word one equals letter 5 of word two
find <- c('1n2341', '151n3n6')

# "Manual" solution: cross join the two single-word candidate sets, then keep
# only the pairs that satisfy the shared-letter constraints.
words %>%
  filter(word_length == 6, str_sub(word, 2, 2) == 'n') %>%
  full_join(
    words %>%
      filter(
        word_length == 7,
        str_sub(word, 4, 4) == 'n',
        str_sub(word, 6, 6) == 'n'
      ),
    by = character(),
    suffix = c('1', '2')
  ) %>%
  filter(
    # letter represented by '1'
    str_sub(word1, 1, 1) == str_sub(word1, 6, 6),
    str_sub(word1, 1, 1) == str_sub(word2, 1, 1),
    str_sub(word1, 1, 1) == str_sub(word2, 3, 3),
    # letter represented by '3'
    str_sub(word1, 4, 4) == str_sub(word2, 5, 5)
  ) %>%
  select(word1, word2)
对于第一个问题,您可以简单地使用 lapply() 或 map() 来遍历各个模式;str_detect() 本身已经对字符串向量化了。
然后,如果您想把这些结果组合成一个新的数据框,可以简单地使用 expand.grid() 来重现您的例子。但请注意,这种格式不一定是最合适的,具体取决于您之后想用它做什么。
# For each pattern, collect the words that match it, then build every
# combination of the per-pattern matches with expand.grid().
# The pattern is anchored with ^ and $ so it has to match the whole word:
# unanchored, the 6-character pattern '.n....' also matches inside the
# 7-letter word "entered".
map(find_words, ~ words$word[str_detect(words$word, paste0("^", .x, "$"))]) %>%
  expand.grid()
#      Var1    Var2
# 1  angina alanine
# 2  endite alanine
# 3  endive alanine
# 4  engine alanine
# 5  entire alanine
# 6  angina evening
# 7  endite evening
# 8  endive evening
# 9  engine evening
# 10 entire evening
对于进一步的问题,这里有一个分几个步骤的解决方案,但这只适用于 2 个单词(我使用第一个为第二个创建模式)。我认为对于更大的单词集,性能应该保持相当不错,但这需要测试。
library(tidyverse)

words <- tibble(
  word = c("alanins", "snoops", "test", "word", "active", "angina", "endite",
           "endive", "engine", "entire", "alanine", "evening", "escape",
           "entered"),
  word_length = nchar(word)
)

# Digits are placeholders for unknown letters (same digit = same letter);
# any other character is a literal letter that must appear at that position.
pattern1 <- "1n2341"
pattern2 <- "151n3n6"

# Candidates for the first word: right length AND agreeing with the literal
# letters of pattern1. Each digit becomes "." so only the literals constrain
# the match; without this step any 6-letter word would be kept regardless of
# the fixed 'n'.
pattern1_literals <- paste0("^", str_replace_all(pattern1, "[0-9]", "."), "$")
candidates1 <- words$word[
  words$word_length == nchar(pattern1) &
    str_detect(words$word, pattern1_literals)
]
has_consistent_names <- function(vec) {
  # Check that a named vector never maps one name to two different values.
  #
  # vec: a vector whose names may repeat.
  # Returns TRUE when, for every name, all elements carrying that name hold
  # the same value; FALSE as soon as one name is found with several values.
  labels <- names(vec)
  for (label in unique(labels)) {
    if (length(unique(vec[labels == label])) != 1) {
      return(FALSE)
    }
  }
  TRUE
}
# Split every first-word candidate into letters, name each letter with the
# pattern1 symbol at the same position, and keep only the candidates in which
# every symbol stands for a single letter.
candidates1_chars <- Filter(
  has_consistent_names,
  map(candidates1, function(w) {
    setNames(str_split(w, "")[[1]], str_split(pattern1, "")[[1]])
  })
)

# Second-word candidates: every word as long as pattern2.
candidates2 <- words$word[nchar(pattern2) == words$word_length]

# pattern2 as a vector of single symbols.
pattern2_chars <- str_split(pattern2, "")[[1]]
build_regex <- function(xx, ptrn) {
  # Turn a pattern into a regex using the letters already fixed by another word.
  #
  # xx:   named character vector; names are pattern symbols, values the
  #       letters they stand for.
  # ptrn: character vector holding one pattern symbol per element.
  #
  # Returns a single string. Digit placeholders found in xx are replaced by
  # their letter, digit placeholders missing from xx become "." (any
  # character), and every non-digit symbol is kept as the literal it is.
  if (is.null(xx)) {
    xx <- character(0)
  }
  # Only digits are placeholders. Looking literals up in xx as well would
  # either drop them (NA -> ".") or swap them for an unrelated letter.
  is_placeholder <- grepl("^[0-9]$", ptrn)
  pieces <- ptrn
  known <- unname(xx[ptrn[is_placeholder]])
  known[is.na(known)] <- "."
  pieces[is_placeholder] <- known
  paste(pieces, collapse = "")
}
# One regex per surviving first-word candidate, built from pattern2.
pattern2_regex <- map_chr(
  candidates1_chars,
  function(chars) build_regex(chars, pattern2_chars)
)

# Pair each first word (its letters pasted back together) with the list of
# second-word candidates its regex detects, then expand to one row per pair.
tibble(
  word1 = map_chr(candidates1_chars, function(chars) paste(chars, collapse = "")),
  word2 = map(pattern2_regex, function(rx) candidates2[str_detect(candidates2, rx)])
) %>%
  unnest(word2)
#> # A tibble: 6 x 2
#> word1 word2
#> <chr> <chr>
#> 1 angina alanins
#> 2 angina alanine
#> 3 endite evening
#> 4 endive evening
#> 5 engine evening
#> 6 entire evening
由 reprex 包 (v0.3.0) 创建于 2020-12-30
想法比较简单,但实现起来有不少实际问题:我先取出所有长度与第一个模式相符的单词(candidates1),然后把它们按字母拆开,这样就可以检查每个单词自身的字母是否符合 pattern1(在这个例子中,即第一个和最后一个字符是否相同)。
这样,我就从第一个单词得到了一个“数字 <-> 字母”的对应表,并用它为第二个单词生成正则表达式模式:借助命名向量,可以很方便地把数字转换成字母。如果某个数字出现在第二个单词的模式中、却没有出现在第一个单词的模式中,我会得到 NA,只需把它替换为 . 以匹配任意字符,然后把它们全部拼接在一起即可。
最后,只剩下用每个 candidate1 生成的模式去逐一检测实际的候选单词,找出匹配的那些。如果每个模式总是恰好只有一个匹配项,我可以使用 map_chr();但在这个例子中,有一个模式没有任何匹配,所以我把结果存成一个列表,以便做一些后处理(过滤掉 length(word2) 为 0 的行;如果长度 > 1,则只保留第一个元素;……)。
我正在构建一个单词搜索游戏,想找一种符合 R 习惯的写法来完成这个 reprex 的最后一步。对于 2 个单词来说这很简单,但我希望它能够处理 n 个单词。我觉得某个 *apply 函数应该能在这里派上用场,但不知为何我就是写不出来。
library(tidyverse)

# Sample word list (173,000 words in reality)
words <- data.frame(
  word = c("test", "word", "active", "angina", "endite", "endive", "engine",
           "entire", "alanine", "evening", "escape", "entered"),
  stringsAsFactors = FALSE
)
# Derive the length from the word itself: the hand-typed length vector listed
# "escape" (6 letters) as a 7-letter word.
words$word_length <- nchar(words$word)

# Find a 6 letter word with 2nd letter n and a 7 letter word with letters 4 and 6 n
find <- ".n.... ...n.n."
find_words <- unlist(str_split(find, " "))
# "\\b" is the regex word boundary. A single-backslash "\b" inside an R string
# is the backspace character, so the pattern would never match any word.
find_regex <- paste0("\\b", find_words, "\\b")

# Cross join (by = character()) of the matches for the first pattern with the
# matches for the second pattern; suffixes give columns word1 / word2.
words %>%
  filter(word_length == nchar(find_words[1])) %>%
  filter(str_detect(word, find_regex[1])) %>%
  full_join(
    words %>%
      filter(word_length == nchar(find_words[2])) %>%
      filter(str_detect(word, find_regex[2])),
    by = character(), suffix = c("1", "2")
  ) %>%
  select(word1, word2)
另一个问题是做同样的事情,但用数字代表未知字母(相同的数字代表相同的字母),因为这会显著减少匹配的数量。
# Target: a 6 letter word whose 2nd letter is n, paired with a 7 letter word
# whose 4th and 6th letters are n, where
#   - symbol 1: letters 1 and 6 of word one and letters 1 and 3 of word two
#     are all the same letter
#   - symbol 3: letter 4 of word one equals letter 5 of word two
find <- c('1n2341', '151n3n6')

# "Manual" solution: cross join the two single-word candidate sets, then keep
# only the pairs that satisfy the shared-letter constraints.
words %>%
  filter(word_length == 6, str_sub(word, 2, 2) == 'n') %>%
  full_join(
    words %>%
      filter(
        word_length == 7,
        str_sub(word, 4, 4) == 'n',
        str_sub(word, 6, 6) == 'n'
      ),
    by = character(),
    suffix = c('1', '2')
  ) %>%
  filter(
    # letter represented by '1'
    str_sub(word1, 1, 1) == str_sub(word1, 6, 6),
    str_sub(word1, 1, 1) == str_sub(word2, 1, 1),
    str_sub(word1, 1, 1) == str_sub(word2, 3, 3),
    # letter represented by '3'
    str_sub(word1, 4, 4) == str_sub(word2, 5, 5)
  ) %>%
  select(word1, word2)
对于第一个问题,您可以简单地使用 lapply() 或 map() 来遍历各个模式;str_detect() 本身已经对字符串向量化了。
然后,如果您想把这些结果组合成一个新的数据框,可以简单地使用 expand.grid() 来重现您的例子。但请注意,这种格式不一定是最合适的,具体取决于您之后想用它做什么。
# For each pattern, collect the words that match it, then build every
# combination of the per-pattern matches with expand.grid().
# The pattern is anchored with ^ and $ so it has to match the whole word:
# unanchored, the 6-character pattern '.n....' also matches inside the
# 7-letter word "entered".
map(find_words, ~ words$word[str_detect(words$word, paste0("^", .x, "$"))]) %>%
  expand.grid()
#      Var1    Var2
# 1  angina alanine
# 2  endite alanine
# 3  endive alanine
# 4  engine alanine
# 5  entire alanine
# 6  angina evening
# 7  endite evening
# 8  endive evening
# 9  engine evening
# 10 entire evening
对于进一步的问题,这里有一个分几个步骤的解决方案,但这只适用于 2 个单词(我使用第一个为第二个创建模式)。我认为对于更大的单词集,性能应该保持相当不错,但这需要测试。
library(tidyverse)

words <- tibble(
  word = c("alanins", "snoops", "test", "word", "active", "angina", "endite",
           "endive", "engine", "entire", "alanine", "evening", "escape",
           "entered"),
  word_length = nchar(word)
)

# Digits are placeholders for unknown letters (same digit = same letter);
# any other character is a literal letter that must appear at that position.
pattern1 <- "1n2341"
pattern2 <- "151n3n6"

# Candidates for the first word: right length AND agreeing with the literal
# letters of pattern1. Each digit becomes "." so only the literals constrain
# the match; without this step any 6-letter word would be kept regardless of
# the fixed 'n'.
pattern1_literals <- paste0("^", str_replace_all(pattern1, "[0-9]", "."), "$")
candidates1 <- words$word[
  words$word_length == nchar(pattern1) &
    str_detect(words$word, pattern1_literals)
]
has_consistent_names <- function(vec) {
  # Check that a named vector never maps one name to two different values.
  #
  # vec: a vector whose names may repeat.
  # Returns TRUE when, for every name, all elements carrying that name hold
  # the same value; FALSE as soon as one name is found with several values.
  labels <- names(vec)
  for (label in unique(labels)) {
    if (length(unique(vec[labels == label])) != 1) {
      return(FALSE)
    }
  }
  TRUE
}
# Split every first-word candidate into letters, name each letter with the
# pattern1 symbol at the same position, and keep only the candidates in which
# every symbol stands for a single letter.
candidates1_chars <- Filter(
  has_consistent_names,
  map(candidates1, function(w) {
    setNames(str_split(w, "")[[1]], str_split(pattern1, "")[[1]])
  })
)

# Second-word candidates: every word as long as pattern2.
candidates2 <- words$word[nchar(pattern2) == words$word_length]

# pattern2 as a vector of single symbols.
pattern2_chars <- str_split(pattern2, "")[[1]]
build_regex <- function(xx, ptrn) {
  # Turn a pattern into a regex using the letters already fixed by another word.
  #
  # xx:   named character vector; names are pattern symbols, values the
  #       letters they stand for.
  # ptrn: character vector holding one pattern symbol per element.
  #
  # Returns a single string. Digit placeholders found in xx are replaced by
  # their letter, digit placeholders missing from xx become "." (any
  # character), and every non-digit symbol is kept as the literal it is.
  if (is.null(xx)) {
    xx <- character(0)
  }
  # Only digits are placeholders. Looking literals up in xx as well would
  # either drop them (NA -> ".") or swap them for an unrelated letter.
  is_placeholder <- grepl("^[0-9]$", ptrn)
  pieces <- ptrn
  known <- unname(xx[ptrn[is_placeholder]])
  known[is.na(known)] <- "."
  pieces[is_placeholder] <- known
  paste(pieces, collapse = "")
}
# One regex per surviving first-word candidate, built from pattern2.
pattern2_regex <- map_chr(
  candidates1_chars,
  function(chars) build_regex(chars, pattern2_chars)
)

# Pair each first word (its letters pasted back together) with the list of
# second-word candidates its regex detects, then expand to one row per pair.
tibble(
  word1 = map_chr(candidates1_chars, function(chars) paste(chars, collapse = "")),
  word2 = map(pattern2_regex, function(rx) candidates2[str_detect(candidates2, rx)])
) %>%
  unnest(word2)
#> # A tibble: 6 x 2
#> word1 word2
#> <chr> <chr>
#> 1 angina alanins
#> 2 angina alanine
#> 3 endite evening
#> 4 endive evening
#> 5 engine evening
#> 6 entire evening
由 reprex 包 (v0.3.0) 创建于 2020-12-30
想法比较简单,但实现起来有不少实际问题:我先取出所有长度与第一个模式相符的单词(candidates1),然后把它们按字母拆开,这样就可以检查每个单词自身的字母是否符合 pattern1(在这个例子中,即第一个和最后一个字符是否相同)。
这样,我就从第一个单词得到了一个“数字 <-> 字母”的对应表,并用它为第二个单词生成正则表达式模式:借助命名向量,可以很方便地把数字转换成字母。如果某个数字出现在第二个单词的模式中、却没有出现在第一个单词的模式中,我会得到 NA,只需把它替换为 . 以匹配任意字符,然后把它们全部拼接在一起即可。
最后,只剩下用每个 candidate1 生成的模式去逐一检测实际的候选单词,找出匹配的那些。如果每个模式总是恰好只有一个匹配项,我可以使用 map_chr();但在这个例子中,有一个模式没有任何匹配,所以我把结果存成一个列表,以便做一些后处理(过滤掉 length(word2) 为 0 的行;如果长度 > 1,则只保留第一个元素;……)。