Scraping a Wikipedia HTML table with images, text, and blank cells using R
The table I'm interested in is the Wikipedia table of Michelin-starred restaurants in New York City, where the number of stars is shown as an image.
I was able to scrape the table in two steps (first getting the words in the "Name" and "Borough" columns, then getting the alt tags of the images in the table body), but I'd like to know whether it can be done in one step. I was able to scrape the data with the rvest package.
Since the XML::readHTMLTable function could not read the Wikipedia page, I tried the htmltab package without success, because I couldn't figure out the function required for the bodyFun argument. To be honest, I'm new to web scraping... and to functions.
Questions I referenced:
Scraping html table with images using XML R package
Scraping html tables into R data frames using the XML package
Here is my code:
library(stringr)
library(rvest)
library(data.table)
url <- "http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City"
#Scrape the first two columns, restaurant name and borough
name.boro <- url %>% read_html() %>% html_nodes("table") %>% html_table(fill = TRUE)
name.boro <- as.data.table(name.boro[[1]])
name.boro[, 3:length(name.boro) := NULL]
135 * 13 #1,755 cells in first table
#scrape tables for img alt
#note that because I used the "td" node, entries for all cells in all tables were pulled
stars <- url %>% read_html() %>% html_nodes("td") %>% html_node("img") %>% html_attr("alt")
stars
#Make vector of numbers to index each column
df <- vector("list", 13)
for (i in 1:13){
df[[i]] <- seq(i, 1755, 13)
}
#Put everything together
Mich.Guide <- name.boro
Mich.Guide[, c("X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012", "X2013", "X2014", "X2015",
"X2016") := .(stars[unlist(df[3])], stars[unlist(df[4])], stars[unlist(df[5])],
stars[unlist(df[6])], stars[unlist(df[7])], stars[unlist(df[8])],
stars[unlist(df[9])], stars[unlist(df[10])], stars[unlist(df[11])],
stars[unlist(df[12])], stars[unlist(df[13])] )]
Thank you!
You could try the following approach:
require(rvest)

url <- "http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City"
doc <- read_html(url)

# Header row: the <th> cells of the first table row give the column names
col_names <- doc %>% html_nodes("#mw-content-text > table > tr:nth-child(1) > th") %>% html_text()
# Body rows: every <tr> except the header row
tbody <- doc %>% html_nodes("#mw-content-text > table > tr:not(:first-child)")

# For one row: text of the first two cells (name, borough), then the img alt
# attribute of the remaining cells (the star images; NA for blank cells)
extract_tr <- function(tr){
  scope <- tr %>% html_children()
  c(scope[1:2] %>% html_text(),
    scope[3:length(scope)] %>% html_node("img") %>% html_attr("alt"))
}

res <- tbody %>% sapply(extract_tr)
res <- as.data.frame(t(res), stringsAsFactors = FALSE)
colnames(res) <- col_names
Now you have the raw table. I'll leave parsing the columns to integers, and the column names, to you.
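For the part left as an exercise, a minimal sketch could look like this. It assumes the alt text of each star image contains the star count as a digit (inspect res first to confirm what the alt strings actually look like); str_extract keeps the first digit and returns NA for the blank cells.
library(stringr)

# Year columns start at the 3rd column; they hold img alt text or NA
star_cols <- 3:ncol(res)
res[star_cols] <- lapply(res[star_cols], function(col) {
  as.integer(str_extract(col, "[0-9]"))  # first digit of the alt text, NA otherwise
})
str(res)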
A slightly different approach:
library(rvest)
library(purrr)
library(stringi)

pg <- read_html("http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City")

# every data row (i.e. rows without <th> cells) of the wikitable
html_nodes(pg, xpath=".//table[contains(@class, 'wikitable')]/tr[not(th)]") %>%
  map_df(function(x) {

    # first two cells: restaurant name and borough
    r_name <- html_text(html_nodes(x, xpath=".//td[1]"))
    borough <- html_text(html_nodes(x, xpath=".//td[2]"))

    # cells 3-13: pull the star count out of the linked image href, one per year
    map(3:13, function(y) {
      stars <- html_attr(html_nodes(x, xpath=sprintf(".//td[%d]/a", y)), "href")
      if (length(stars)==0) {
        NA
      } else {
        stri_match_first_regex(stars, "Michelin-([[:digit:]])")[,2]
      }
    }) -> refs

    refs <- setNames(refs, c(2006:2016))

    as.data.frame(c(r_name=r_name, borough=borough, refs), stringsAsFactors=FALSE)

  }) -> michelin_nyc
dplyr::glimpse(michelin_nyc)
## Observations: 135
## Variables: 13
## $ r_name <chr> "Adour", "Ai Fiori", "Alain Ducasse at the...
## $ borough <chr> "Manhattan", "Manhattan", "Manhattan", "Ma...
## $ X2006 <chr> NA, NA, "3", NA, NA, NA, NA, "1", NA, NA, ...
## $ X2007 <chr> NA, NA, NA, NA, NA, NA, NA, "1", NA, NA, N...
## $ X2008 <chr> NA, NA, NA, NA, NA, NA, NA, "1", "1", NA, ...
## $ X2009 <chr> "2", NA, NA, NA, "1", "1", NA, "1", "1", N...
## $ X2010 <chr> "1", NA, NA, NA, NA, "2", NA, "1", "1", NA...
## $ X2011 <chr> "1", NA, NA, "1", NA, "2", NA, "1", "1", N...
## $ X2012 <chr> "1", "1", NA, "1", NA, NA, NA, "1", NA, NA...
## $ X2013 <chr> "1", "1", NA, "1", NA, NA, NA, "1", NA, "1...
## $ X2014 <chr> NA, "1", NA, "1", NA, NA, NA, "1", NA, "1"...
## $ X2015 <chr> NA, "1", NA, "1", NA, NA, "1", NA, NA, "2"...
## $ X2016 <chr> NA, "1", NA, "1", NA, NA, "1", NA, NA, "2"...
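If you'd rather have the year columns as integers than as character, a small optional follow-up in base R would be:
# convert the character star counts to integer; NA stays NA
michelin_nyc[3:13] <- lapply(michelin_nyc[3:13], as.integer)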
It can definitely also be done with the XML package, like this:
library(XML)
library(RCurl)
library(stringi)

pg <- getURL("https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City")
pg <- htmlParse(pg)

# same idea as above: every data row (rows without <th>) of the wikitable
rows <- getNodeSet(pg, "//table[contains(@class, 'wikitable')]/tr[not(th)]")

do.call(rbind, lapply(rows, function(x) {

  r_name <- xpathSApply(x, ".//td[1]", xmlValue)
  borough <- xpathSApply(x, ".//td[2]", xmlValue)

  # star count comes from the linked image href in each year column
  lapply(3:13, function(y) {
    stars <- xpathSApply(x, sprintf(".//td[%d]/a", y), xmlGetAttr, "href")
    if (length(stars)==0) {
      NA
    } else {
      stri_match_first_regex(stars, "Michelin-([[:digit:]])")[,2]
    }
  }) -> refs

  refs <- setNames(refs, c(2006:2016))

  as.data.frame(c(r_name=r_name, borough=borough, refs), stringsAsFactors=FALSE)

})) -> michelin_nyc
str(michelin_nyc)
## 'data.frame': 135 obs. of 13 variables:
## $ r_name : chr "Adour" "Ai Fiori" "Alain Ducasse at the Essex House" "Aldea" ...
## $ borough: chr "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
## $ X2006 : chr NA NA "3" NA ...
## $ X2007 : chr NA NA NA NA ...
## $ X2008 : chr NA NA NA NA ...
## $ X2009 : chr "2" NA NA NA ...
## $ X2010 : chr "1" NA NA NA ...
## $ X2011 : chr "1" NA NA "1" ...
## $ X2012 : chr "1" "1" NA "1" ...
## $ X2013 : chr "1" "1" NA "1" ...
## $ X2014 : chr NA "1" NA "1" ...
## $ X2015 : chr NA "1" NA "1" ...
## $ X2016 : chr NA "1" NA "1" ...
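Either way, if the goal is analysis you will probably want one row per restaurant and year. A sketch with dplyr/tidyr, assuming the X2006..X2016 column names shown above:
library(dplyr)
library(tidyr)

michelin_long <- michelin_nyc %>%
  pivot_longer(X2006:X2016, names_to = "year", values_to = "stars") %>%
  mutate(year = as.integer(sub("X", "", year)),
         stars = as.integer(stars)) %>%
  filter(!is.na(stars))   # keep only the years a restaurant was starred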