按跨多个变量分隔的名称汇总数据
Summarizing data by name separated across multiple variables
我正在尝试计算每位球员的总进球数、主要助攻数和次要助攻数。我的问题是我想不通实现的逻辑,因为我想用来汇总的键(球员姓名)分散在三个变量(进球 goal、主要助攻 primary_assist 和次要助攻 secondary_assist)中。
这是我的可重现数据(来自 dput() 的输出,格式比较乱,请见谅)。
# Sample scoring data: 16 rows, each holding a goal scorer plus up to two
# assisting players (NA where an assist slot is empty), together with the
# team, game strength and season of the event.
# The object is assembled with structure() so that it carries the tibble
# classes without requiring the tibble package to be attached.
mydata <- structure(
  list(
    primary_assist = c(
      "Dmitry Gilyazitdinov", "Evgeny Orlov", "Anton Burdasov",
      "Sergei Kalinin", "Stanislav Solovyov", "Vasily Streltsov", NA,
      "Bogdan Potekhin", "Bogdan Potekhin", "Vasily Streltsov",
      "Vasily Streltsov", "Viktor Postnikov", "Danil Kaskov", NA, NA,
      "Artemy Panarin"
    ),
    secondary_assist = c(
      "Andrei Badrutdinov", NA, NA, NA, "Danil Gubarev", "Nikita Manukhov",
      NA, "Evgeny Grigorenko", "Daniil Apalkov", "Ivan Boiko", NA,
      "Viktor Antipin", "Vitaly Sychov", NA, NA, "Stanislav Levin"
    ),
    goal = c(
      "Vitaly Kropachyov", "Dmitry Kozlov", "Stanislav Solovyov",
      "Kirill Polyansky", "Anton Burdasov", "Ilya Solodov",
      "Alexander Antropov", "Daniil Apalkov", "Evgeny Grigorenko",
      "Alexander Antropov", "Alexander Antropov", "Evgeny Grigorenko",
      "Denis Belonogov", "Vitaly Sychov", "Alexander Streltsov",
      "Pyotr Kopyttsov"
    ),
    team = c(
      "Belye Medvedi", "Omskie Yastreby", "Belye Medvedi", "Omskie Yastreby",
      "Belye Medvedi", "Avto", "Avto", "Stalnye Lisy", "Stalnye Lisy",
      "Avto", "Avto", "Stalnye Lisy", "Avto", "Avto", "Avto",
      "Russkie Vityazi"
    ),
    game_strength = c(
      "PP", "EV", "EV", "EV", "EV", "PP", "SO", "EV",
      "PP", "PP", "EV", "PP", "PP", "EV", "PP", "EV"
    ),
    # Every row belongs to the same season.
    season = rep("2009-10", times = 16L)
  ),
  # c(NA, -n) is the compact encoding of automatic row names 1..n.
  row.names = c(NA, -16L),
  class = c("tbl_df", "tbl", "data.frame")
)
mydata
#> # A tibble: 16 x 6
#> primary_assist secondary_assist goal team game_strength season
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Dmitry Gilyazitdinov Andrei Badrutdin~ Vita~ Bely~ PP 2009-~
#> 2 Evgeny Orlov <NA> Dmit~ Omsk~ EV 2009-~
#> 3 Anton Burdasov <NA> Stan~ Bely~ EV 2009-~
#> 4 Sergei Kalinin <NA> Kiri~ Omsk~ EV 2009-~
#> 5 Stanislav Solovyov Danil Gubarev Anto~ Bely~ EV 2009-~
#> 6 Vasily Streltsov Nikita Manukhov Ilya~ Avto PP 2009-~
#> 7 <NA> <NA> Alex~ Avto SO 2009-~
#> 8 Bogdan Potekhin Evgeny Grigorenko Dani~ Stal~ EV 2009-~
#> 9 Bogdan Potekhin Daniil Apalkov Evge~ Stal~ PP 2009-~
#> 10 Vasily Streltsov Ivan Boiko Alex~ Avto PP 2009-~
#> 11 Vasily Streltsov <NA> Alex~ Avto EV 2009-~
#> 12 Viktor Postnikov Viktor Antipin Evge~ Stal~ PP 2009-~
#> 13 Danil Kaskov Vitaly Sychov Deni~ Avto PP 2009-~
#> 14 <NA> <NA> Vita~ Avto EV 2009-~
#> 15 <NA> <NA> Alex~ Avto PP 2009-~
#> 16 Artemy Panarin Stanislav Levin Pyot~ Russ~ EV 2009-~
所以,我想统计每个球员的进球数、主要助攻数和次要助攻数,最终每个球员占 1 行。假设名字 "Artemy Panarin" 在 goal 列中出现 1 次,在 primary_assist 列中出现 0 次,在 secondary_assist 列中出现 2 次,我的输出将如下所示:
# Hand-written illustration of the wanted layout: one row per player, with
# the team and a separate count column per event type. The counts are made
# up for the example and are not derived from mydata.
tibble::tibble(
  name = c("Artemy Panarin", "Stanislav Levin", "Danil Kaskov"),
  team = c("Russkie Vityazi", "Russkie Vityazi", "Avto"),
  goals = c(1, 1, 0),
  primary_assists = c(0, 0, 1),
  secondary_assists = c(2, 0, 0)
)
#> # A tibble: 3 x 5
#> name team goals primary_assists secondary_assists
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Artemy Panarin Russkie Vityazi 1.00 0 2.00
#> 2 Stanislav Levin Russkie Vityazi 1.00 0 0
#> 3 Danil Kaskov Avto 0 1.00 0
这有意义吗?有任何想法吗?首选 Tidyverse 解决方案。谢谢!
获得结果的一种方法是:在您的汇总策略之外,再使用 gather()/spread() 来重塑您的数据。
library(tidyverse)
# Count how often each player appears as goal scorer, primary assist and
# secondary assist, producing one row per player.
scoring_summary <- mydata %>%
  # Stack the three player-name columns into long form. na.rm = TRUE drops
  # the empty assist slots, so a missing name is not tallied as a player.
  gather("key", "player", primary_assist:goal, na.rm = TRUE) %>%
  # One row per (player, event type) with its frequency in column n; the
  # input is ungrouped, so the result is ungrouped as well.
  count(player, key) %>%
  # fill = 0 writes zero for event types a player never appears in, so no
  # blanket NA replacement over the whole table (which would also touch the
  # player-name column) is needed afterwards.
  spread(key, n, fill = 0)
scoring_summary
# A tibble: 27 x 4
player goal primary_assist secondary_assist
<chr> <dbl> <dbl> <dbl>
1 Alexander Antropov 3 0 0
2 Alexander Streltsov 1 0 0
3 Andrei Badrutdinov 0 0 1
4 Anton Burdasov 1 1 0
5 Artemy Panarin 0 1 0
6 Bogdan Potekhin 0 2 0
7 Daniil Apalkov 1 0 1
8 Danil Gubarev 0 0 1
9 Danil Kaskov 0 1 0
10 Denis Belonogov 1 0 0
count() 所做的事情与您最初尝试的 summarise(count(goals)) 相同。
我们可以用 gather 将数据转换为 'long' 格式,按 'name'、'team' 和 'key' 列(key 列来自 gather)分组,用 summarise 获取计数,再用 spread 转换回 'wide' 格式。
library(tidyverse)
# Per-player, per-team counts of goals, primary assists and secondary assists.
# na.rm = TRUE drops the empty assist slots while reshaping; without it each
# team with a missing assist contributes a spurious row whose name is NA.
gather(mydata, key, name, primary_assist:goal, na.rm = TRUE) %>%
  group_by(name, team, key) %>%
  summarise(n = n()) %>%
  # Drop the grouping left over from summarise() so the wide result is a
  # plain tibble.
  ungroup() %>%
  # fill = 0 gives zero for event types in which a player never appears.
  spread(key, n, fill = 0)
# A tibble: 27 x 5
# name team goal primary_assist secondary_assist
# <chr> <chr> <dbl> <dbl> <dbl>
# 1 Alexander Antropov Avto 3 0 0
# 2 Alexander Streltsov Avto 1 0 0
# 3 Andrei Badrutdinov Belye Medvedi 0 0 1
# 4 Anton Burdasov Belye Medvedi 1 1 0
# 5 Artemy Panarin Russkie Vityazi 0 1 0
# 6 Bogdan Potekhin Stalnye Lisy 0 2 0
# 7 Daniil Apalkov Stalnye Lisy 1 0 1
# 8 Danil Gubarev Belye Medvedi 0 0 1
# 9 Danil Kaskov Avto 0 1 0
#10 Denis Belonogov Avto 1 0 0
# ... with 17 more rows
您可以使用 gather 和 spread。先用 gather 把进球和助攻各列收集成一个 "key" 列,然后按 key 和球员分组计数,再用 spread 展开。之后您可以再将 NA 转换为 0。
library(tidyverse)
# Long form of the scoring data: one row per (event type, player) occurrence.
# na.rm = TRUE removes only the rows whose player name is missing; a
# whole-frame na.omit() would also discard events that merely have an NA in
# an unrelated column such as team or season.
mydata_tidy <- mydata %>%
  gather(
    key = "key", value = "player",
    primary_assist, secondary_assist, goal,
    na.rm = TRUE
  )

# Tally each player's appearances per event type and widen to one row per
# player. Event types a player never appears in are left as NA here.
mydata_tidy %>%
  count(player, key) %>%
  spread(key, n) %>%
  filter(player %in% c("Artemy Panarin", "Stanislav Levin", "Danil Kaskov"))
#> # A tibble: 3 x 4
#> player goal primary_assist secondary_assist
#> <chr> <int> <int> <int>
#> 1 Artemy Panarin NA 1 NA
#> 2 Danil Kaskov NA 1 NA
#> 3 Stanislav Levin NA NA 1
由 reprex 包 (v0.2.0) 创建于 2018-07-18。
我正在尝试计算每位球员的总进球数、主要助攻数和次要助攻数。我的问题是我想不通实现的逻辑,因为我想用来汇总的键(球员姓名)分散在三个变量(进球 goal、主要助攻 primary_assist 和次要助攻 secondary_assist)中。
这是我的可重现数据(来自 dput() 的输出,格式比较乱,请见谅)。
# Sample scoring data: 16 rows, each holding a goal scorer plus up to two
# assisting players (NA where an assist slot is empty), together with the
# team, game strength and season of the event.
# The object is assembled with structure() so that it carries the tibble
# classes without requiring the tibble package to be attached.
mydata <- structure(
  list(
    primary_assist = c(
      "Dmitry Gilyazitdinov", "Evgeny Orlov", "Anton Burdasov",
      "Sergei Kalinin", "Stanislav Solovyov", "Vasily Streltsov", NA,
      "Bogdan Potekhin", "Bogdan Potekhin", "Vasily Streltsov",
      "Vasily Streltsov", "Viktor Postnikov", "Danil Kaskov", NA, NA,
      "Artemy Panarin"
    ),
    secondary_assist = c(
      "Andrei Badrutdinov", NA, NA, NA, "Danil Gubarev", "Nikita Manukhov",
      NA, "Evgeny Grigorenko", "Daniil Apalkov", "Ivan Boiko", NA,
      "Viktor Antipin", "Vitaly Sychov", NA, NA, "Stanislav Levin"
    ),
    goal = c(
      "Vitaly Kropachyov", "Dmitry Kozlov", "Stanislav Solovyov",
      "Kirill Polyansky", "Anton Burdasov", "Ilya Solodov",
      "Alexander Antropov", "Daniil Apalkov", "Evgeny Grigorenko",
      "Alexander Antropov", "Alexander Antropov", "Evgeny Grigorenko",
      "Denis Belonogov", "Vitaly Sychov", "Alexander Streltsov",
      "Pyotr Kopyttsov"
    ),
    team = c(
      "Belye Medvedi", "Omskie Yastreby", "Belye Medvedi", "Omskie Yastreby",
      "Belye Medvedi", "Avto", "Avto", "Stalnye Lisy", "Stalnye Lisy",
      "Avto", "Avto", "Stalnye Lisy", "Avto", "Avto", "Avto",
      "Russkie Vityazi"
    ),
    game_strength = c(
      "PP", "EV", "EV", "EV", "EV", "PP", "SO", "EV",
      "PP", "PP", "EV", "PP", "PP", "EV", "PP", "EV"
    ),
    # Every row belongs to the same season.
    season = rep("2009-10", times = 16L)
  ),
  # c(NA, -n) is the compact encoding of automatic row names 1..n.
  row.names = c(NA, -16L),
  class = c("tbl_df", "tbl", "data.frame")
)
mydata
#> # A tibble: 16 x 6
#> primary_assist secondary_assist goal team game_strength season
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Dmitry Gilyazitdinov Andrei Badrutdin~ Vita~ Bely~ PP 2009-~
#> 2 Evgeny Orlov <NA> Dmit~ Omsk~ EV 2009-~
#> 3 Anton Burdasov <NA> Stan~ Bely~ EV 2009-~
#> 4 Sergei Kalinin <NA> Kiri~ Omsk~ EV 2009-~
#> 5 Stanislav Solovyov Danil Gubarev Anto~ Bely~ EV 2009-~
#> 6 Vasily Streltsov Nikita Manukhov Ilya~ Avto PP 2009-~
#> 7 <NA> <NA> Alex~ Avto SO 2009-~
#> 8 Bogdan Potekhin Evgeny Grigorenko Dani~ Stal~ EV 2009-~
#> 9 Bogdan Potekhin Daniil Apalkov Evge~ Stal~ PP 2009-~
#> 10 Vasily Streltsov Ivan Boiko Alex~ Avto PP 2009-~
#> 11 Vasily Streltsov <NA> Alex~ Avto EV 2009-~
#> 12 Viktor Postnikov Viktor Antipin Evge~ Stal~ PP 2009-~
#> 13 Danil Kaskov Vitaly Sychov Deni~ Avto PP 2009-~
#> 14 <NA> <NA> Vita~ Avto EV 2009-~
#> 15 <NA> <NA> Alex~ Avto PP 2009-~
#> 16 Artemy Panarin Stanislav Levin Pyot~ Russ~ EV 2009-~
所以,我想统计每个球员的进球数、主要助攻数和次要助攻数,最终每个球员占 1 行。假设名字 "Artemy Panarin" 在 goal 列中出现 1 次,在 primary_assist 列中出现 0 次,在 secondary_assist 列中出现 2 次,我的输出将如下所示:
# Hand-written illustration of the wanted layout: one row per player, with
# the team and a separate count column per event type. The counts are made
# up for the example and are not derived from mydata.
tibble::tibble(
  name = c("Artemy Panarin", "Stanislav Levin", "Danil Kaskov"),
  team = c("Russkie Vityazi", "Russkie Vityazi", "Avto"),
  goals = c(1, 1, 0),
  primary_assists = c(0, 0, 1),
  secondary_assists = c(2, 0, 0)
)
#> # A tibble: 3 x 5
#> name team goals primary_assists secondary_assists
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Artemy Panarin Russkie Vityazi 1.00 0 2.00
#> 2 Stanislav Levin Russkie Vityazi 1.00 0 0
#> 3 Danil Kaskov Avto 0 1.00 0
这有意义吗?有任何想法吗?首选 Tidyverse 解决方案。谢谢!
获得结果的一种方法是:在您的汇总策略之外,再使用 gather()/spread() 来重塑您的数据。
library(tidyverse)
# Count how often each player appears as goal scorer, primary assist and
# secondary assist, producing one row per player.
scoring_summary <- mydata %>%
  # Stack the three player-name columns into long form. na.rm = TRUE drops
  # the empty assist slots, so a missing name is not tallied as a player.
  gather("key", "player", primary_assist:goal, na.rm = TRUE) %>%
  # One row per (player, event type) with its frequency in column n; the
  # input is ungrouped, so the result is ungrouped as well.
  count(player, key) %>%
  # fill = 0 writes zero for event types a player never appears in, so no
  # blanket NA replacement over the whole table (which would also touch the
  # player-name column) is needed afterwards.
  spread(key, n, fill = 0)
scoring_summary
# A tibble: 27 x 4
player goal primary_assist secondary_assist
<chr> <dbl> <dbl> <dbl>
1 Alexander Antropov 3 0 0
2 Alexander Streltsov 1 0 0
3 Andrei Badrutdinov 0 0 1
4 Anton Burdasov 1 1 0
5 Artemy Panarin 0 1 0
6 Bogdan Potekhin 0 2 0
7 Daniil Apalkov 1 0 1
8 Danil Gubarev 0 0 1
9 Danil Kaskov 0 1 0
10 Denis Belonogov 1 0 0
count() 所做的事情与您最初尝试的 summarise(count(goals)) 相同。
我们可以用 gather 将数据转换为 'long' 格式,按 'name'、'team' 和 'key' 列(key 列来自 gather)分组,用 summarise 获取计数,再用 spread 转换回 'wide' 格式。
library(tidyverse)
# Per-player, per-team counts of goals, primary assists and secondary assists.
# na.rm = TRUE drops the empty assist slots while reshaping; without it each
# team with a missing assist contributes a spurious row whose name is NA.
gather(mydata, key, name, primary_assist:goal, na.rm = TRUE) %>%
  group_by(name, team, key) %>%
  summarise(n = n()) %>%
  # Drop the grouping left over from summarise() so the wide result is a
  # plain tibble.
  ungroup() %>%
  # fill = 0 gives zero for event types in which a player never appears.
  spread(key, n, fill = 0)
# A tibble: 27 x 5
# name team goal primary_assist secondary_assist
# <chr> <chr> <dbl> <dbl> <dbl>
# 1 Alexander Antropov Avto 3 0 0
# 2 Alexander Streltsov Avto 1 0 0
# 3 Andrei Badrutdinov Belye Medvedi 0 0 1
# 4 Anton Burdasov Belye Medvedi 1 1 0
# 5 Artemy Panarin Russkie Vityazi 0 1 0
# 6 Bogdan Potekhin Stalnye Lisy 0 2 0
# 7 Daniil Apalkov Stalnye Lisy 1 0 1
# 8 Danil Gubarev Belye Medvedi 0 0 1
# 9 Danil Kaskov Avto 0 1 0
#10 Denis Belonogov Avto 1 0 0
# ... with 17 more rows
您可以使用 gather 和 spread。先用 gather 把进球和助攻各列收集成一个 "key" 列,然后按 key 和球员分组计数,再用 spread 展开。之后您可以再将 NA 转换为 0。
library(tidyverse)
# Long form of the scoring data: one row per (event type, player) occurrence.
# na.rm = TRUE removes only the rows whose player name is missing; a
# whole-frame na.omit() would also discard events that merely have an NA in
# an unrelated column such as team or season.
mydata_tidy <- mydata %>%
  gather(
    key = "key", value = "player",
    primary_assist, secondary_assist, goal,
    na.rm = TRUE
  )

# Tally each player's appearances per event type and widen to one row per
# player. Event types a player never appears in are left as NA here.
mydata_tidy %>%
  count(player, key) %>%
  spread(key, n) %>%
  filter(player %in% c("Artemy Panarin", "Stanislav Levin", "Danil Kaskov"))
#> # A tibble: 3 x 4
#> player goal primary_assist secondary_assist
#> <chr> <int> <int> <int>
#> 1 Artemy Panarin NA 1 NA
#> 2 Danil Kaskov NA 1 NA
#> 3 Stanislav Levin NA NA 1
由 reprex 包 (v0.2.0) 创建于 2018-07-18。