以迭代方式将值附加到由循环创建的数据框中的所有记录
Iteratively Attach a Value to All Records in a Data Frame Created by a Loop
我正在从 basketball-reference.com 抓取篮球运动员职业生涯的单场比赛统计数据(这部分可以正常工作),但我想把球员姓名添加到结果数据框中与各场比赛对应的行上。例如,如果第一次循环抓取到 86 行,就应该把“Kareem Abdul-Jabbar”重复 86 次。我尝试用 cbind 把后续循环的姓名填充到名为“Player_Name”的现有列中,但 cbind 在每次循环中都会新建一列。任何有关如何将球员姓名放入同一列的建议都将不胜感激。
library(rvest)
library(dplyr)
# Create df of players to be scraped
#########################################################################
# Lookup table of player-seasons to scrape: 20 seasons for Kareem
# Abdul-Jabbar followed by 19 seasons for Karl Malone, one row per season.
players <- data.frame(
  player_name = rep(
    c("Kareem Abdul-Jabbar", "Karl Malone"),
    times = c(20, 19)
  ),
  player_id = rep(c("abdulka01", "malonka01"), times = c(20, 19)),
  initial = rep(c("a", "m"), times = c(20, 19)),
  year = c(seq(1970, 1989, by = 1), seq(1986, 2004, by = 1))
)
# Scrape data and stack in a df
#########################################################################
# Accumulator for the scraped game logs, starting from an empty tibble.
# NOTE(review): data_frame() is deprecated in recent tibble releases in
# favour of tibble() -- confirm against the installed version.
output <- data_frame()
# NOTE(review): 1:2 visits only the first two rows of `players`, not every
# player-season in the table.
for (i in 1:2){
# Build the game-log URL from columns 3 (initial), 2 (player_id) and 4 (year).
url <- paste0("https://www.basketball-reference.com/players/",
players[i,3],"/",players[i,2],"/gamelog/",players[i,4])
# Download and parse the page.
webpage <- read_html(url)
# Select the node(s) with id "pgl_basic" and convert them to tables.
temp <- webpage %>%
html_nodes("#pgl_basic") %>%
html_table()
# Player name for the current row (column 1).
player_name=players[i,1]
# Append the new rows, then cbind the name onto the whole accumulated result.
# This is the statement the question is about: per the question text it adds
# another player_name column on every pass instead of filling a single one.
output <- cbind(bind_rows(output, temp),player_name)
}
我不确定你想要的最终形式是什么样的,但你可以尝试更改代码的最后一部分
# Last statement of the loop body in the question (the line to replace):
output <- cbind(bind_rows(output, temp),player_name)
成为
# Label only the rows scraped in this iteration, then append them.
# Assigning output$player_name after the bind would overwrite the names
# stored by earlier iterations, leaving every row with the last player.
# html_table() on a node set returns a list of tables, so collapse it into a
# single data frame before adding the column.
temp <- bind_rows(temp)
temp$player_name <- rep(player_name, nrow(temp))
output <- bind_rows(output, temp)
有一种更简洁的方法可以通过函数式编程来解决这个问题。首先,我们在 tibble 中设置参数。
library(tidyverse)
library(glue)
# Scrape parameters for Kareem Abdul-Jabbar: the scalar fields are recycled
# across one row per season, 1970 through 1989.
kareem <- tibble(
  player_name = "Kareem Abdul-Jabbar",
  player_id = "abdulka01",
  initial = "a",
  year = 1970:1989
)

# Scrape parameters for Karl Malone, one row per season, 1986 through 2004.
karl <- tibble(
  player_name = "Karl Malone",
  player_id = "malonka01",
  initial = "m",
  year = 1986:2004
)
现在复制你的循环:
# Stack both players' parameters (one row per player-season), then add the
# game-log URL, the parsed page and the extracted tables as columns.
# The result name is stated up front with `<-` rather than a trailing `->`.
scrapped_data <- bind_rows(kareem, karl) %>%
  mutate(
    # glue() is vectorised over the columns, so no row-wise pmap_chr() is
    # needed; as.character() keeps `url` a plain character vector.
    url = as.character(glue(
      "https://www.basketball-reference.com/players/",
      "{initial}/{player_id}/gamelog/{year}"
    )),
    # One request per row; each element is the parsed HTML document.
    webpage = map(url, read_html),
    # html_table() is applied to the selected node set, so each element of
    # `temp` holds the table(s) matched by "#pgl_basic" on that page.
    temp = map(
      webpage,
      ~ html_nodes(.x, "#pgl_basic") %>%
        html_table()
    )
  )
请注意,lambda 表示法 ~ .x
只是指定函数的另一种方式:
# Square a numeric vector element-wise.
square <- function(v) {
  v^2
}
# Three equivalent ways to hand the squaring function to map_dbl(); each call
# squares every element of 1:4.
# 1. Formula lambda: `.x` stands for the current element.
map_dbl(1:4, ~ .x^2)
# 2. Explicit anonymous function.
map_dbl(1:4, function(.x) .x^2)
# 3. Named function (`square`, defined just above).
map_dbl(1:4, square)
您可以创建 URL 来抓取并使用 map_df
将它们组合成一个数据帧。
library(rvest)
library(tidyverse)
# One game-log URL per row of `players`.
urls <- with(
  players,
  sprintf(
    "https://www.basketball-reference.com/players/%s/%s/gamelog/%s",
    initial, player_id, year
  )
)

# Scrape every URL and stack the tables into one data frame. `.id` records
# which element of `urls` each row came from; because `urls` is unnamed this
# is the element's position, stored as text.
result <- map_df(
  urls,
  function(page_url) {
    html_table(html_nodes(read_html(page_url), "#pgl_basic"))
  },
  .id = "playername"
)

# Replace the positional index with the matching player's name.
result <- mutate(
  result,
  playername = players$player_name[as.numeric(playername)]
)
我正在从 basketball-reference.com 抓取篮球运动员职业生涯的单场比赛统计数据(这部分可以正常工作),但我想把球员姓名添加到结果数据框中与各场比赛对应的行上。例如,如果第一次循环抓取到 86 行,就应该把“Kareem Abdul-Jabbar”重复 86 次。我尝试用 cbind 把后续循环的姓名填充到名为“Player_Name”的现有列中,但 cbind 在每次循环中都会新建一列。任何有关如何将球员姓名放入同一列的建议都将不胜感激。
library(rvest)
library(dplyr)
# Create df of players to be scraped
#########################################################################
# Lookup table of player-seasons to scrape: 20 seasons for Kareem
# Abdul-Jabbar followed by 19 seasons for Karl Malone, one row per season.
players <- data.frame(
  player_name = rep(
    c("Kareem Abdul-Jabbar", "Karl Malone"),
    times = c(20, 19)
  ),
  player_id = rep(c("abdulka01", "malonka01"), times = c(20, 19)),
  initial = rep(c("a", "m"), times = c(20, 19)),
  year = c(seq(1970, 1989, by = 1), seq(1986, 2004, by = 1))
)
# Scrape data and stack in a df
#########################################################################
# Accumulator for the scraped game logs, starting from an empty tibble.
# NOTE(review): data_frame() is deprecated in recent tibble releases in
# favour of tibble() -- confirm against the installed version.
output <- data_frame()
# NOTE(review): 1:2 visits only the first two rows of `players`, not every
# player-season in the table.
for (i in 1:2){
# Build the game-log URL from columns 3 (initial), 2 (player_id) and 4 (year).
url <- paste0("https://www.basketball-reference.com/players/",
players[i,3],"/",players[i,2],"/gamelog/",players[i,4])
# Download and parse the page.
webpage <- read_html(url)
# Select the node(s) with id "pgl_basic" and convert them to tables.
temp <- webpage %>%
html_nodes("#pgl_basic") %>%
html_table()
# Player name for the current row (column 1).
player_name=players[i,1]
# Append the new rows, then cbind the name onto the whole accumulated result.
# This is the statement the question is about: per the question text it adds
# another player_name column on every pass instead of filling a single one.
output <- cbind(bind_rows(output, temp),player_name)
}
我不确定你想要的最终形式是什么样的,但你可以尝试更改代码的最后一部分
# Last statement of the loop body in the question (the line to replace):
output <- cbind(bind_rows(output, temp),player_name)
成为
# Label only the rows scraped in this iteration, then append them.
# Assigning output$player_name after the bind would overwrite the names
# stored by earlier iterations, leaving every row with the last player.
# html_table() on a node set returns a list of tables, so collapse it into a
# single data frame before adding the column.
temp <- bind_rows(temp)
temp$player_name <- rep(player_name, nrow(temp))
output <- bind_rows(output, temp)
有一种更简洁的方法可以通过函数式编程来解决这个问题。首先,我们在 tibble 中设置参数。
library(tidyverse)
library(glue)
# Scrape parameters for Kareem Abdul-Jabbar: the scalar fields are recycled
# across one row per season, 1970 through 1989.
kareem <- tibble(
  player_name = "Kareem Abdul-Jabbar",
  player_id = "abdulka01",
  initial = "a",
  year = 1970:1989
)

# Scrape parameters for Karl Malone, one row per season, 1986 through 2004.
karl <- tibble(
  player_name = "Karl Malone",
  player_id = "malonka01",
  initial = "m",
  year = 1986:2004
)
现在复制你的循环:
# Stack both players' parameters (one row per player-season), then add the
# game-log URL, the parsed page and the extracted tables as columns.
# The result name is stated up front with `<-` rather than a trailing `->`.
scrapped_data <- bind_rows(kareem, karl) %>%
  mutate(
    # glue() is vectorised over the columns, so no row-wise pmap_chr() is
    # needed; as.character() keeps `url` a plain character vector.
    url = as.character(glue(
      "https://www.basketball-reference.com/players/",
      "{initial}/{player_id}/gamelog/{year}"
    )),
    # One request per row; each element is the parsed HTML document.
    webpage = map(url, read_html),
    # html_table() is applied to the selected node set, so each element of
    # `temp` holds the table(s) matched by "#pgl_basic" on that page.
    temp = map(
      webpage,
      ~ html_nodes(.x, "#pgl_basic") %>%
        html_table()
    )
  )
请注意,lambda 表示法 ~ .x
只是指定函数的另一种方式:
# Square a numeric vector element-wise.
square <- function(v) {
  v^2
}
# Three equivalent ways to hand the squaring function to map_dbl(); each call
# squares every element of 1:4.
# 1. Formula lambda: `.x` stands for the current element.
map_dbl(1:4, ~ .x^2)
# 2. Explicit anonymous function.
map_dbl(1:4, function(.x) .x^2)
# 3. Named function (`square`, defined just above).
map_dbl(1:4, square)
您可以创建 URL 来抓取并使用 map_df
将它们组合成一个数据帧。
library(rvest)
library(tidyverse)
# One game-log URL per row of `players`.
urls <- with(
  players,
  sprintf(
    "https://www.basketball-reference.com/players/%s/%s/gamelog/%s",
    initial, player_id, year
  )
)

# Scrape every URL and stack the tables into one data frame. `.id` records
# which element of `urls` each row came from; because `urls` is unnamed this
# is the element's position, stored as text.
result <- map_df(
  urls,
  function(page_url) {
    html_table(html_nodes(read_html(page_url), "#pgl_basic"))
  },
  .id = "playername"
)

# Replace the positional index with the matching player's name.
result <- mutate(
  result,
  playername = players$player_name[as.numeric(playername)]
)