R 抓取 Rvest 获取文本
R scraping Rvest get text
我正在抓取这个 url:http://fcf.cat/acta/1920/futbol-11/tercera-divisio-nacional/grup-v/aa/vilafranca-fc-a/aa/igualada-cf-a
我想从页面的 'Targetes' 表格中获取客场球员的名字。
在这种情况下,我想获取这些名称:
我尝试使用 xpath 方法得到这个:
# Address of the match report page, parsed once and reused below.
url <- "http://fcf.cat/acta/1920/futbol-11/tercera-divisio-nacional/grup-v/aa/vilafranca-fc-a/aa/igualada-cf-a"
web_partit <- read_html(url)

# Local team: class attribute of the card node, then the player link text.
type_card_local_player <- html_attr(
  html_nodes(
    web_partit,
    xpath = "/html/body/main/div/div/div/div[2]/div[4]/table[4]/tbody/tr/td[2]/div[1]/div[1]/div"
  ),
  "class"
)
local_player <- html_text(
  html_nodes(
    web_partit,
    "#acta-header+ .p-0_ml .acta-table~ .acta-table+ .acta-table a"
  )
)

# Away team: same two extractions, addressed by absolute XPath.
type_card_away_player <- html_attr(
  html_nodes(
    web_partit,
    xpath = "/html/body/main/div/div/div/div[2]/div[6]/table[4]/tbody/tr/td[2]/div/div[1]/div"
  ),
  "class"
)
away_player <- html_text(
  html_nodes(
    web_partit,
    xpath = "/html/body/main/div/div/div/div[2]/div[6]/table[4]/tbody/tr/td[2]/a"
  )
)
和css选择器方法:
# Same extraction for the away players, using a CSS selector instead of XPath.
away_player <- html_text(
  html_nodes(web_partit, ".p-0_ml+ .p-0_ml .acta-table2+ .acta-table a")
)
在这两种情况下我得到的结果都是空的
我做错了什么?
谢谢!
这里有一个技巧可以帮助您找到正确的 table(在两个中)并让您进入下一步。您可能还需要进行一些数据清理。
# Keep only the <table> nodes whose markup mentions "Targetes", then parse
# each remaining node into a data frame.
html_table(
  Filter(
    function(tbl) grepl("Targetes", tbl),
    html_nodes(web_partit, "table")
  )
)
# [[1]]
# Targetes Targetes Targetes
# 1 16\n\t\t\t\t\t\t\t\t\t\tRIBEIRO FONSECA, LUCAS RIBEIRO FONSECA, LUCAS 17'
# 2 4\n\t\t\t\t\t\t\t\t\t\tFONTANILS LLACH, DAVID FONTANILS LLACH, DAVID 34'
# 3 17\n\t\t\t\t\t\t\t\t\t\tDE LA TORRE PEREZ, ANGEL DE LA TORRE PEREZ, ANGEL 75'
# 4 12\n\t\t\t\t\t\t\t\t\t\tBOADA CASANELLAS, ORIOL BOADA CASANELLAS, ORIOL 80'
# [[2]]
# Targetes Targetes Targetes
# 1 13\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\tBOSCH SOLÃ<U+0089>, PAU BOSCH SOLÃ<U+0089>, PAU 70'
# 2 5\n\t\t\t\t\t\t\t\t\t\tBARALDES ESCUDE, DAVID BARALDES ESCUDE, DAVID 74'
# 3 22\n\t\t\t\t\t\t\t\t\t\tGARCIA GONZALEZ, OSCAR GARCIA GONZALEZ, OSCAR 89'
(我不是 rvest 大师,所以也许有人会有更高效的方法。)
您可以在页面上看到所有 tables:
# List every <table> node found in the parsed page.
web_partit %>%
  html_nodes("table")
# {xml_nodeset (16)}
# [1] <table class="d-n d-b_impr" style="width:100%;margin-top:30px;margin-bottom:-200px;"><tr>\n<td class="w-33 tc">\n ...
# [2] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Titulars</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [3] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Suplents</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [4] <table class="acta-table">\n<thead><tr>\n<th colspan="2">Equip Tècnic</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<t ...
# [5] <table class="acta-table2">\n<thead><tr>\n<th colspan="4">Substitucions</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n ...
# [6] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Targetes</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [7] <table class="acta-table">\n<thead><tr>\n<th colspan="2">Àrbitres</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [8] <table class="acta-table">\n<thead><tr>\n<th colspan="4">Gols</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody><tr>\n<td cl ...
# [9] <table class="acta-table">\n<thead><tr class="th">\n<th class="tc uppercase">Estadi</th>\n ...
# [10] <table class="acta-table">\n<thead><tr class="th">\n<th class="tc uppercase">Comparativa</th>\n ...
# [11] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Titulars</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [12] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Suplents</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [13] <table class="acta-table">\n<thead><tr>\n<th colspan="2">Equip Tècnic</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n< ...
# [14] <table class="acta-table2">\n<thead><tr>\n<th colspan="4">Substitucions</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n ...
# [15] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Targetes</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [16] <table class="d-n d-b_impr" style="position:relative;bottom:0;left:0;right:0;width:100%;"><tr>\n<td class="w-33 tc"> ...
并且当我们对它调用 grepl 时,该节点会被隐式地用 as.character 转换,grepl 会在下面这样的字符串中查找模式:
# Show the raw markup of the 15th <table> node as a single string.
web_partit %>%
  html_nodes("table") %>%
  .[[15]] %>%
  as.character()
# [1] "<table class=\"acta-table\">\n<thead><tr>\n<th colspan=\"3\">Targetes</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n<td class=\"white oswald tc p-r z-0 w-44px\">\n\t\t\t\t\t\t\t\t\t\t<span class=\"num-samarreta-acta2\">13</span>\n\t\t\t\t\t\t\t\t\t\t<div class=\"samarreta-acta2\">\n<span class=\"p-a faf-base\" style=\"color:#000;\"></span>\n\t\t\t\t\t\t\t\t\t\t<span class=\"p-a faf-base shirt-border\" style=\"color:transparent;\"></span><td><a href=\"http://fcf.cat/jugador/1920/futbol-11/tercera-divisio-nacional/grup-v/35060669/37589706\">BOSCH SOLÃ<U+0089>, PAU</a></td>\n\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t\t<td class=\"p-0 tr\">\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stats\">\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stat-box\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"groga-s\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-minut-targeta\">70'</div>\n\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t</tr>\n<tr>\n<td class=\"white oswald tc p-r z-0 w-44px\">\n\t\t\t\t\t\t\t\t\t\t<span class=\"num-samarreta-acta2\">5</span>\n\t\t\t\t\t\t\t\t\t\t<div class=\"samarreta-acta2\">\n<span class=\"p-a faf-base\" style=\"color:#0884FF;\"></span><td><a href=\"http://fcf.cat/jugador/1920/futbol-11/tercera-divisio-nacional/grup-v/35060669/649805\">BARALDES ESCUDE, DAVID</a></td>\n\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t\t<td class=\"p-0 tr\">\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stats\">\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stat-box\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"groga-s\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-minut-targeta\">74'</div>\n\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t</tr>\n<tr>\n<td class=\"white oswald tc p-r z-0 w-44px\">\n\t\t\t\t\t\t\t\t\t\t<span 
class=\"num-samarreta-acta2\">22</span>\n\t\t\t\t\t\t\t\t\t\t<div class=\"samarreta-acta2\">\n<span class=\"p-a faf-base\" style=\"color:#0884FF;\"></span><td><a href=\"http://fcf.cat/jugador/1920/futbol-11/tercera-divisio-nacional/grup-v/35060669/647599\">GARCIA GONZALEZ, OSCAR</a></td>\n\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t\t<td class=\"p-0 tr\">\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stats\">\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stat-box\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"groga-s\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-minut-targeta\">89'</div>\n\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t</tr>\n</tbody>\n</table>"
你可以在此基础上做更多的清理工作。我不知道这种做法对其他细微差异是否稳健(例如,提取到的列更少或更多),但它确实清理了一些东西:
library(dplyr)

# Parse the tables that mention "Targetes", take the second one, and split
# the leading shirt number from the player name held in the first column.
web_partit %>%
  html_nodes("table") %>%
  Filter(function(a) grepl("Targetes", a), .) %>%
  html_table() %>%
  .[[2]] %>% # you need to find a way to "know" this definitively
  # The three columns share one header, so make the names unique first.
  as_tibble(., .name_repair = "unique") %>%
  mutate(
    # Backslashes must be doubled inside R string literals: "\\1" is the
    # first capture group and "\\s" is the whitespace class.
    num = gsub("^([0-9]+).*", "\\1", Targetes...1),
    Targetes...1 = gsub("^[0-9]+\\s*", "", Targetes...1)
  )
# New names:
# * Targetes -> Targetes...1
# * Targetes -> Targetes...2
# * Targetes -> Targetes...3
# # A tibble: 3 x 4
# Targetes...1 Targetes...2 Targetes...3 num
# <chr> <chr> <chr> <chr>
# 1 "BOSCH SOLÃ\u0089, PAU" "BOSCH SOLÃ\u0089, PAU" 70' 13
# 2 "BARALDES ESCUDE, DAVID" "BARALDES ESCUDE, DAVID" 74' 5
# 3 "GARCIA GONZALEZ, OSCAR" "GARCIA GONZALEZ, OSCAR" 89' 22
我正在抓取这个 url:http://fcf.cat/acta/1920/futbol-11/tercera-divisio-nacional/grup-v/aa/vilafranca-fc-a/aa/igualada-cf-a
我想从页面的 'Targetes' 表格中获取客场球员的名字。在这种情况下,我想获取这些名称:
我尝试使用 xpath 方法得到这个:
# Address of the match report page, parsed once and reused below.
url <- "http://fcf.cat/acta/1920/futbol-11/tercera-divisio-nacional/grup-v/aa/vilafranca-fc-a/aa/igualada-cf-a"
web_partit <- read_html(url)

# Local team: class attribute of the card node, then the player link text.
type_card_local_player <- html_attr(
  html_nodes(
    web_partit,
    xpath = "/html/body/main/div/div/div/div[2]/div[4]/table[4]/tbody/tr/td[2]/div[1]/div[1]/div"
  ),
  "class"
)
local_player <- html_text(
  html_nodes(
    web_partit,
    "#acta-header+ .p-0_ml .acta-table~ .acta-table+ .acta-table a"
  )
)

# Away team: same two extractions, addressed by absolute XPath.
type_card_away_player <- html_attr(
  html_nodes(
    web_partit,
    xpath = "/html/body/main/div/div/div/div[2]/div[6]/table[4]/tbody/tr/td[2]/div/div[1]/div"
  ),
  "class"
)
away_player <- html_text(
  html_nodes(
    web_partit,
    xpath = "/html/body/main/div/div/div/div[2]/div[6]/table[4]/tbody/tr/td[2]/a"
  )
)
和css选择器方法:
# Same extraction for the away players, using a CSS selector instead of XPath.
away_player <- html_text(
  html_nodes(web_partit, ".p-0_ml+ .p-0_ml .acta-table2+ .acta-table a")
)
在这两种情况下我得到的结果都是空的
我做错了什么?
谢谢!
这里有一个技巧可以帮助您找到正确的 table(在两个中)并让您进入下一步。您可能还需要进行一些数据清理。
# Keep only the <table> nodes whose markup mentions "Targetes", then parse
# each remaining node into a data frame.
html_table(
  Filter(
    function(tbl) grepl("Targetes", tbl),
    html_nodes(web_partit, "table")
  )
)
# [[1]]
# Targetes Targetes Targetes
# 1 16\n\t\t\t\t\t\t\t\t\t\tRIBEIRO FONSECA, LUCAS RIBEIRO FONSECA, LUCAS 17'
# 2 4\n\t\t\t\t\t\t\t\t\t\tFONTANILS LLACH, DAVID FONTANILS LLACH, DAVID 34'
# 3 17\n\t\t\t\t\t\t\t\t\t\tDE LA TORRE PEREZ, ANGEL DE LA TORRE PEREZ, ANGEL 75'
# 4 12\n\t\t\t\t\t\t\t\t\t\tBOADA CASANELLAS, ORIOL BOADA CASANELLAS, ORIOL 80'
# [[2]]
# Targetes Targetes Targetes
# 1 13\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\tBOSCH SOLÃ<U+0089>, PAU BOSCH SOLÃ<U+0089>, PAU 70'
# 2 5\n\t\t\t\t\t\t\t\t\t\tBARALDES ESCUDE, DAVID BARALDES ESCUDE, DAVID 74'
# 3 22\n\t\t\t\t\t\t\t\t\t\tGARCIA GONZALEZ, OSCAR GARCIA GONZALEZ, OSCAR 89'
(我不是 rvest 大师,所以也许有人会有更高效的方法。)
您可以在页面上看到所有 tables:
# List every <table> node found in the parsed page.
web_partit %>%
  html_nodes("table")
# {xml_nodeset (16)}
# [1] <table class="d-n d-b_impr" style="width:100%;margin-top:30px;margin-bottom:-200px;"><tr>\n<td class="w-33 tc">\n ...
# [2] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Titulars</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [3] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Suplents</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [4] <table class="acta-table">\n<thead><tr>\n<th colspan="2">Equip Tècnic</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<t ...
# [5] <table class="acta-table2">\n<thead><tr>\n<th colspan="4">Substitucions</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n ...
# [6] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Targetes</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [7] <table class="acta-table">\n<thead><tr>\n<th colspan="2">Àrbitres</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [8] <table class="acta-table">\n<thead><tr>\n<th colspan="4">Gols</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody><tr>\n<td cl ...
# [9] <table class="acta-table">\n<thead><tr class="th">\n<th class="tc uppercase">Estadi</th>\n ...
# [10] <table class="acta-table">\n<thead><tr class="th">\n<th class="tc uppercase">Comparativa</th>\n ...
# [11] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Titulars</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [12] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Suplents</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [13] <table class="acta-table">\n<thead><tr>\n<th colspan="2">Equip Tècnic</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n< ...
# [14] <table class="acta-table2">\n<thead><tr>\n<th colspan="4">Substitucions</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n ...
# [15] <table class="acta-table">\n<thead><tr>\n<th colspan="3">Targetes</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n ...
# [16] <table class="d-n d-b_impr" style="position:relative;bottom:0;left:0;right:0;width:100%;"><tr>\n<td class="w-33 tc"> ...
并且当我们对它调用 grepl 时,该节点会被隐式地用 as.character 转换,grepl 会在下面这样的字符串中查找模式:
# Show the raw markup of the 15th <table> node as a single string.
web_partit %>%
  html_nodes("table") %>%
  .[[15]] %>%
  as.character()
# [1] "<table class=\"acta-table\">\n<thead><tr>\n<th colspan=\"3\">Targetes</th>\n\t\t\t\t\t\t\t</tr></thead>\n<tbody>\n<tr>\n<td class=\"white oswald tc p-r z-0 w-44px\">\n\t\t\t\t\t\t\t\t\t\t<span class=\"num-samarreta-acta2\">13</span>\n\t\t\t\t\t\t\t\t\t\t<div class=\"samarreta-acta2\">\n<span class=\"p-a faf-base\" style=\"color:#000;\"></span>\n\t\t\t\t\t\t\t\t\t\t<span class=\"p-a faf-base shirt-border\" style=\"color:transparent;\"></span><td><a href=\"http://fcf.cat/jugador/1920/futbol-11/tercera-divisio-nacional/grup-v/35060669/37589706\">BOSCH SOLÃ<U+0089>, PAU</a></td>\n\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t\t<td class=\"p-0 tr\">\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stats\">\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stat-box\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"groga-s\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-minut-targeta\">70'</div>\n\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t</tr>\n<tr>\n<td class=\"white oswald tc p-r z-0 w-44px\">\n\t\t\t\t\t\t\t\t\t\t<span class=\"num-samarreta-acta2\">5</span>\n\t\t\t\t\t\t\t\t\t\t<div class=\"samarreta-acta2\">\n<span class=\"p-a faf-base\" style=\"color:#0884FF;\"></span><td><a href=\"http://fcf.cat/jugador/1920/futbol-11/tercera-divisio-nacional/grup-v/35060669/649805\">BARALDES ESCUDE, DAVID</a></td>\n\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t\t<td class=\"p-0 tr\">\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stats\">\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stat-box\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"groga-s\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-minut-targeta\">74'</div>\n\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t</tr>\n<tr>\n<td class=\"white oswald tc p-r z-0 w-44px\">\n\t\t\t\t\t\t\t\t\t\t<span 
class=\"num-samarreta-acta2\">22</span>\n\t\t\t\t\t\t\t\t\t\t<div class=\"samarreta-acta2\">\n<span class=\"p-a faf-base\" style=\"color:#0884FF;\"></span><td><a href=\"http://fcf.cat/jugador/1920/futbol-11/tercera-divisio-nacional/grup-v/35060669/647599\">GARCIA GONZALEZ, OSCAR</a></td>\n\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t\t<td class=\"p-0 tr\">\n\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stats\">\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-stat-box\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"groga-s\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t\t\t<div class=\"acta-minut-targeta\">89'</div>\n\t\t\t\t\t\t\t\t\t\t\t</div>\n\t\t\t\t\t\t\t\t\t\t</td>\n\t\t\t\t\t\t\t\t\t</tr>\n</tbody>\n</table>"
你可以在此基础上做更多的清理工作。我不知道这种做法对其他细微差异是否稳健(例如,提取到的列更少或更多),但它确实清理了一些东西:
library(dplyr)

# Parse the tables that mention "Targetes", take the second one, and split
# the leading shirt number from the player name held in the first column.
web_partit %>%
  html_nodes("table") %>%
  Filter(function(a) grepl("Targetes", a), .) %>%
  html_table() %>%
  .[[2]] %>% # you need to find a way to "know" this definitively
  # The three columns share one header, so make the names unique first.
  as_tibble(., .name_repair = "unique") %>%
  mutate(
    # Backslashes must be doubled inside R string literals: "\\1" is the
    # first capture group and "\\s" is the whitespace class.
    num = gsub("^([0-9]+).*", "\\1", Targetes...1),
    Targetes...1 = gsub("^[0-9]+\\s*", "", Targetes...1)
  )
# New names:
# * Targetes -> Targetes...1
# * Targetes -> Targetes...2
# * Targetes -> Targetes...3
# # A tibble: 3 x 4
# Targetes...1 Targetes...2 Targetes...3 num
# <chr> <chr> <chr> <chr>
# 1 "BOSCH SOLÃ\u0089, PAU" "BOSCH SOLÃ\u0089, PAU" 70' 13
# 2 "BARALDES ESCUDE, DAVID" "BARALDES ESCUDE, DAVID" 74' 5
# 3 "GARCIA GONZALEZ, OSCAR" "GARCIA GONZALEZ, OSCAR" 89' 22