如何使用 map_dfr() 更高效地对数据集执行 filter()?
How do I filter() over a dataset with map_dfr() more efficiently?
我有一个单词对列表:
library(tidyverse)
word_pairs <- structure(list(V1 = c("cup", "cup", "cup"), V2 = c("kilo", "slice","bacon")), row.names = c(NA, -3L), class = "data.frame")
我有以下数据:
data <- structure(list(keyword_pair = c("cup-bacon", "cup-kilo", "cup-slice"
), kwe_1 = c("cup", "cup", "cup"), kwe_2 = c("bacon", "kilo", "slice"), cup = c(2L, 2L, 2L), kilo = c(7L, 7L, 7L), lot = c(3L,3L, 3L), pound = c(5L, 5L, 5L), slice = c(7L, 7L, 7L), bacon = c(4L,4L, 4L), bowl = c(3L, 3L, 3L), box = c(2L, 2L, 2L), fruit = c(2L, 2L, 2L), plate = c(4L, 4L, 4L), bag = c(2L, 2L, 2L), bunch = c(3L, 3L, 3L), chop = c(3L, 3L, 3L), ground = c(2L, 2L, 2L), lettuc = c(2L,2L, 2L), lean = c(2L, 2L, 2L), appl = c(4L, 4L, 4L), barbel = c(2L,2L, 2L), potato = c(2L, 2L, 2L), shoulder = c(2L, 2L, 2L), carrot = c(2L,2L, 2L), mango = c(2L, 2L, 2L), chicken = c(4L, 4L, 4L), press = c(3L,3L, 3L), strawberri = c(3L, 3L, 3L), pint = c(3L, 3L, 3L), sausag = c(2L,2L, 2L), orang = c(2L, 2L, 2L), up = c(2L, 2L, 2L), breast = c(2L,2L, 2L), head = c(2L, 2L, 2L), frozen = c(2L, 2L, 2L), peach = c(2L,2L, 2L), berri = c(2L, 2L, 2L), cherri = c(2L, 2L, 2L), flower = c(2L, 2L, 2L), tomato = c(2L, 2L, 2L), egg = c(2L, 2L, 2L)), row.names = c(NA, -3L), class = "data.frame")
我想从匹配单词对的数据中提取每一行(频率)的数值。
以下函数将执行此操作:
#' Extract the two keyword frequencies for one word pair
#'
#' Reads row `x` of the global `word_pairs`, keeps the rows of the global
#' `data` whose `kwe_1` / `kwe_2` equal that pair, and returns the identifying
#' columns together with the two frequency columns named after the words.
#'
#' @param x Row index into `word_pairs`.
#' @return The matching rows of `data` with the columns `keyword_pair` through
#'   `kwe_2`, followed by `freq_kwe_1` and `freq_kwe_2`.
my_function <- function(x) {
  # Resolve the pair once, outside the data mask.
  first_word <- word_pairs[x, 1]
  second_word <- word_pairs[x, 2]

  data %>%
    # `.env$` forces the local variables, even if `data` happens to contain a
    # column with the same name.
    filter(kwe_1 == .env$first_word & kwe_2 == .env$second_word) %>%
    # all_of() selects the column whose name equals the word exactly.
    # starts_with() would also pick up any column that merely begins with the
    # word, which shifts column positions. Renaming inside select() removes
    # the reliance on positions 4 and 5.
    select(
      keyword_pair:kwe_2,
      freq_kwe_1 = all_of(first_word),
      freq_kwe_2 = all_of(second_word)
    )
}
如果我把这个函数传入 map_dfr(),它会生成我想要的结果,但如果数据集很大,运行时间就会很长。
我希望有人能够回答的两个问题:
- 我怎样才能加快速度?
- 我需要学习什么原理才能自己弄清楚如何做到这一点?
# Apply my_function() to every row index of `word_pairs` and row-bind the
# results. seq_len() yields an empty sequence when `word_pairs` has no rows,
# whereas 1:nrow() would produce c(1, 0) and call my_function() on rows that
# do not exist.
seq_len(nrow(word_pairs)) %>%
  map_dfr(my_function)
#> keyword_pair kwe_1 kwe_2 freq_kwe_1 freq_kwe_2
#> 1 cup-kilo cup kilo 2 7
#> 2 cup-slice cup slice 2 7
#> 3 cup-bacon cup bacon 2 4
由 reprex 包 (v2.0.1) 创建于 2022-04-29
也许我遗漏了什么,但你可能根本不需要使用 purrr:
# For every row, look up the frequency columns whose names are stored in
# `kwe_1` and `kwe_2`, and return them as `freq_kwe_1` / `freq_kwe_2`.
data %>%
  # Row-wise evaluation so that `.` inside across() is a single keyword.
  rowwise() %>%
  transmute(
    keyword_pair,
    kwe_1,
    kwe_2,
    # get() resolves the keyword as a name. NOTE(review): get() also searches
    # enclosing environments, so a keyword with no matching column could
    # resolve to an unrelated object instead of failing. Confirm that every
    # keyword has a column in `data`.
    across(c(kwe_1, kwe_2), ~ get(.), .names = "freq_{.col}")
  ) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns again.
  ungroup()
keyword_pair kwe_1 kwe_2 freq_kwe_1 freq_kwe_2
<chr> <chr> <chr> <int> <int>
1 cup-bacon cup bacon 2 4
2 cup-kilo cup kilo 2 7
3 cup-slice cup slice 2 7
我有一个单词对列表:
library(tidyverse)
word_pairs <- structure(list(V1 = c("cup", "cup", "cup"), V2 = c("kilo", "slice","bacon")), row.names = c(NA, -3L), class = "data.frame")
我有以下数据:
data <- structure(list(keyword_pair = c("cup-bacon", "cup-kilo", "cup-slice"
), kwe_1 = c("cup", "cup", "cup"), kwe_2 = c("bacon", "kilo", "slice"), cup = c(2L, 2L, 2L), kilo = c(7L, 7L, 7L), lot = c(3L,3L, 3L), pound = c(5L, 5L, 5L), slice = c(7L, 7L, 7L), bacon = c(4L,4L, 4L), bowl = c(3L, 3L, 3L), box = c(2L, 2L, 2L), fruit = c(2L, 2L, 2L), plate = c(4L, 4L, 4L), bag = c(2L, 2L, 2L), bunch = c(3L, 3L, 3L), chop = c(3L, 3L, 3L), ground = c(2L, 2L, 2L), lettuc = c(2L,2L, 2L), lean = c(2L, 2L, 2L), appl = c(4L, 4L, 4L), barbel = c(2L,2L, 2L), potato = c(2L, 2L, 2L), shoulder = c(2L, 2L, 2L), carrot = c(2L,2L, 2L), mango = c(2L, 2L, 2L), chicken = c(4L, 4L, 4L), press = c(3L,3L, 3L), strawberri = c(3L, 3L, 3L), pint = c(3L, 3L, 3L), sausag = c(2L,2L, 2L), orang = c(2L, 2L, 2L), up = c(2L, 2L, 2L), breast = c(2L,2L, 2L), head = c(2L, 2L, 2L), frozen = c(2L, 2L, 2L), peach = c(2L,2L, 2L), berri = c(2L, 2L, 2L), cherri = c(2L, 2L, 2L), flower = c(2L, 2L, 2L), tomato = c(2L, 2L, 2L), egg = c(2L, 2L, 2L)), row.names = c(NA, -3L), class = "data.frame")
我想从匹配单词对的数据中提取每一行(频率)的数值。
以下函数将执行此操作:
#' Extract the two keyword frequencies for one word pair
#'
#' Reads row `x` of the global `word_pairs`, keeps the rows of the global
#' `data` whose `kwe_1` / `kwe_2` equal that pair, and returns the identifying
#' columns together with the two frequency columns named after the words.
#'
#' @param x Row index into `word_pairs`.
#' @return The matching rows of `data` with the columns `keyword_pair` through
#'   `kwe_2`, followed by `freq_kwe_1` and `freq_kwe_2`.
my_function <- function(x) {
  # Resolve the pair once, outside the data mask.
  first_word <- word_pairs[x, 1]
  second_word <- word_pairs[x, 2]

  data %>%
    # `.env$` forces the local variables, even if `data` happens to contain a
    # column with the same name.
    filter(kwe_1 == .env$first_word & kwe_2 == .env$second_word) %>%
    # all_of() selects the column whose name equals the word exactly.
    # starts_with() would also pick up any column that merely begins with the
    # word, which shifts column positions. Renaming inside select() removes
    # the reliance on positions 4 and 5.
    select(
      keyword_pair:kwe_2,
      freq_kwe_1 = all_of(first_word),
      freq_kwe_2 = all_of(second_word)
    )
}
如果我把这个函数传入 map_dfr(),它会生成我想要的结果,但如果数据集很大,运行时间就会很长。
我希望有人能够回答的两个问题:
- 我怎样才能加快速度?
- 我需要学习什么原理才能自己弄清楚如何做到这一点?
# Apply my_function() to every row index of `word_pairs` and row-bind the
# results. seq_len() yields an empty sequence when `word_pairs` has no rows,
# whereas 1:nrow() would produce c(1, 0) and call my_function() on rows that
# do not exist.
seq_len(nrow(word_pairs)) %>%
  map_dfr(my_function)
#> keyword_pair kwe_1 kwe_2 freq_kwe_1 freq_kwe_2
#> 1 cup-kilo cup kilo 2 7
#> 2 cup-slice cup slice 2 7
#> 3 cup-bacon cup bacon 2 4
由 reprex 包 (v2.0.1) 创建于 2022-04-29
也许我遗漏了什么,但你可能根本不需要使用 purrr:
# For every row, look up the frequency columns whose names are stored in
# `kwe_1` and `kwe_2`, and return them as `freq_kwe_1` / `freq_kwe_2`.
data %>%
  # Row-wise evaluation so that `.` inside across() is a single keyword.
  rowwise() %>%
  transmute(
    keyword_pair,
    kwe_1,
    kwe_2,
    # get() resolves the keyword as a name. NOTE(review): get() also searches
    # enclosing environments, so a keyword with no matching column could
    # resolve to an unrelated object instead of failing. Confirm that every
    # keyword has a column in `data`.
    across(c(kwe_1, kwe_2), ~ get(.), .names = "freq_{.col}")
  ) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns again.
  ungroup()
keyword_pair kwe_1 kwe_2 freq_kwe_1 freq_kwe_2
<chr> <chr> <chr> <int> <int>
1 cup-bacon cup bacon 2 4
2 cup-kilo cup kilo 2 7
3 cup-slice cup slice 2 7