R中的网页抓取错误
Web-scraping error in R
我正在学习如何在 R 中进行网络抓取,我想通过一个使用内置表格(table)的页面来练习一下。我的最终目标是得到一个包含四个变量的数据框(姓名、政党、选区、指向个人网页的链接)。
# Load the scraping libraries (XML kept for parity with the original attempt).
library(rvest)
library(XML)

# Fetch and parse the MP listing page once, then inspect the document.
url <- "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0"
constituency <- read_html(url)
print(constituency)

# Extract the text of every table cell, prefix each entry with the page
# URL, and keep unique values — same steps as before, grouped per stage.
cell_text <- constituency %>% html_nodes("td") %>% html_text()
constituency_red <- unique(paste0(url, cell_text))
constituency_red
完成这些步骤后得到的输出看起来我的方向是正确的。但是,正如您在向右滚动时看到的那样,它仍然有点乱。关于我可以做些什么来清理它有什么想法吗?
[974] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0\r\n Poulter, Dr\r\n (Conservative)\r\n "
[975] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0Central Suffolk and North Ipswich"
[976] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0\r\n Pound, Stephen\r\n (Labour)\r\n "
[977] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0Ealing North"
在此之后我尝试了第二种方法。以下代码似乎为我提供了所有超链接的清晰列表。所以我想知道这是否可能是一个潜在的解决方法?
# Collect the href attribute of every anchor nested in a table cell,
# prefix the listing-page URL, and drop duplicates — written as one
# continuous pipeline instead of repeated reassignment.
constituency_links <-
  constituency %>%
  html_nodes("tr") %>%
  html_nodes("td") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  paste0(url, .) %>%
  unique()
constituency_links
我的第三次也是最后一次尝试是使用以下代码:
# Fetch every linked page; read_html is passed directly to lapply
# instead of being wrapped in an anonymous function.
all_constituency <- lapply(constituency_links, read_html)
all_constituency
当我运行这段代码时,速度慢了很多,然后我开始收到错误:Error in open.connection(x, "rb") : HTTP error 400。
所以我尝试把它改写成一个循环来运行。
# Same fetch as above, as a loop; the result list is indexed by URL.
# NOTE(review): this assumes `all_constituency` already exists as a list.
# The HTTP 400 is expected here — `constituency_links` was built by
# pasting the listing-page URL in front of each href, and the sample
# output later in this file shows those hrefs are already absolute, so
# every request URL is malformed. Drop the paste0() step to fix it.
for(i in constituency_links){
all_constituency[[i]] <- read_html(i)
}
我用这种方法得到了同样的错误信息。非常感谢任何有关如何提取和清理此信息的建议。
我们可以从获取带有议员姓名、党派和选区的文本字符串开始:
# One raw text string per table row; name, party and constituency end up
# concatenated with \r\n / \t separators, as the recorded output shows.
text <- constituency %>% html_nodes('table') %>% html_nodes('tr') %>% html_text()
head(text, 3)
# [1] "Surname, First name\r\n Constituency\r\n\t\r\n "
# [2] "A\r\n back to top\r\n "
# [3] "\r\n Abbott, Ms Diane\r\n (Labour)\r\n \r\n\t\tHackney North and Stoke Newington\r\n\t"
我们可以遍历 text
,解析每个元素并将字符串拆分为我们想要的字段(姓名、政党、选区):
# Parse each row string into the three fields we want. After splitting
# on "\r\n", the name sits in element 2, the party in 3 and the
# constituency in 5; header/navigation rows yield "" or NA there and
# are filtered out later.
dd <- lapply(text, function(x) {
out <- unlist(strsplit(x, "\r\n"))[c(2, 3, 5)] # Use "\r\n" to split the strings
# FIX: "\s" is not a valid escape in an R string literal and fails to
# parse — the regex backslash must be doubled ("\\s"). as.vector()
# strips the names that sapply() attaches to its result.
as.vector(sapply(out, function(x) sub("(\t)+|\\s+", "", x))) # Remove spaces and the "\t"
})
# [[1]]
# [1] "Constituency" "" NA
# [[2]]
# [1] "back to top" "" NA
# [[3]]
# [1] "Abbott, Ms Diane" "(Labour)"
# [3] "Hackney North and Stoke Newington"
现在,将 dd
放入数据框,过滤掉不相关的行(其中 party
为空白):
# Bind the parsed rows into a 3-column data frame and clean it up.
# FIX: spell out `ncol` (the original `nc` relied on partial argument
# matching), and double the backslashes in the regex — "\(" and "\1"
# are invalid escapes in R string literals and fail to parse.
df <- data.frame(matrix(unlist(dd), ncol = 3, byrow = TRUE), stringsAsFactors = FALSE)
names(df) <- c("name", "party", "con")
df$party <- sub("\\((.*)\\)", "\\1", df$party) # Strip the parentheses around the party label
df <- df[df$party != "", ] # Remove rows where party is blank
head(df, 3)
# name party con
# 3 Abbott, Ms Diane Labour Hackney North and Stoke Newington
# 4 Abrahams, Debbie Labour Oldham East and Saddleworth
# 5 Adams, Nigel Conservative Selby and Ainsty
我们现在可以处理链接了。当我们检查链接时,那些与 MPs 相关的链接中有单词 "biographies",因此我们使用它来过滤列表:
# Keep only the anchors pointing at MP biography pages: grab every href
# on the page, then subset with a logical mask.
all_hrefs <- html_attr(html_nodes(constituency, "a"), "href")
links <- all_hrefs[grepl("biographies", all_hrefs)]
head(links, 3)
# [1] "http://www.parliament.uk/biographies/commons/ms-diane-abbott/172"
# [2] "http://www.parliament.uk/biographies/commons/debbie-abrahams/4212"
# [3] "http://www.parliament.uk/biographies/commons/nigel-adams/4057"
并通过添加链接来完成我们的数据框:
# Attach the biography URLs as a fourth column.
# NOTE(review): this assumes `links` has exactly one entry per remaining
# row of `df`, in the same document order — confirm before relying on it.
df$links <- links
str(head(df, 3))
# 'data.frame': 3 obs. of 4 variables:
# $ name : chr "Abbott, Ms Diane" "Abrahams, Debbie" "Adams, Nigel"
# $ party: chr "Labour" "Labour" "Conservative"
# $ con : chr "Hackney North and Stoke Newington" "Oldham East and Saddleworth" "Selby and Ainsty"
# $ links: chr "http://www.parliament.uk/biographies/commons/ms-diane-abbott/172" "http://www.parliament.uk/biographies/commons/debbie-abrahams/4212" "http://www.parliament.uk/biographies/commons/nigel-adams/4057"
非常简单:
library(rvest)
library(stringi)
library(purrr)
library(dplyr)

pg <- read_html("http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0")

# The MP name/party cells carry ASP.NET-generated ids, so match on that prefix.
td_1 <- html_nodes(pg, xpath=".//td[contains(@id,'ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_rptMembers_ctl')]")

# FIX: the party regex needs doubled backslashes — "\(" is an invalid
# escape in an R string literal and fails to parse. Also assign with
# `<-` instead of the right-arrow `->`. (`data_frame()` is deprecated
# in current tibble/dplyr; `tibble()` is the drop-in replacement.)
df <- data_frame(mp_name=html_text(html_nodes(td_1, "a")),
href=html_attr(html_nodes(td_1, "a"), "href"),
party=map_chr(stri_match_all_regex(html_text(td_1), "\\((.*)\\)"), 2),
constituency=html_text(html_nodes(pg, xpath=".//tr/td[2]")))
glimpse(df)
## Observations: 649
## Variables: 4
## $ mp_name <chr> "Abbott, Ms Diane", "Abrahams, Debbie", "Adams, N...
## $ href <chr> "http://www.parliament.uk/biographies/commons/ms-...
## $ party <chr> "Labour", "Labour", "Conservative", "Conservative...
## $ constituency <chr> "Hackney North and Stoke Newington", "Oldham East...
我正在学习如何在 R 中进行网络抓取,我想通过一个使用内置表格(table)的页面来练习一下。我的最终目标是得到一个包含四个变量的数据框(姓名、政党、选区、指向个人网页的链接)。
# Load the scraping libraries (XML kept for parity with the original attempt).
library(rvest)
library(XML)

# Fetch and parse the MP listing page once, then inspect the document.
url <- "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0"
constituency <- read_html(url)
print(constituency)

# Extract the text of every table cell, prefix each entry with the page
# URL, and keep unique values — same steps as before, grouped per stage.
cell_text <- constituency %>% html_nodes("td") %>% html_text()
constituency_red <- unique(paste0(url, cell_text))
constituency_red
完成这些步骤后得到的输出看起来我的方向是正确的。但是,正如您在向右滚动时看到的那样,它仍然有点乱。关于我可以做些什么来清理它有什么想法吗?
[974] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0\r\n Poulter, Dr\r\n (Conservative)\r\n "
[975] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0Central Suffolk and North Ipswich"
[976] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0\r\n Pound, Stephen\r\n (Labour)\r\n "
[977] "http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0Ealing North"
在此之后我尝试了第二种方法。以下代码似乎为我提供了所有超链接的清晰列表。所以我想知道这是否可能是一个潜在的解决方法?
# Collect the href attribute of every anchor nested in a table cell,
# prefix the listing-page URL, and drop duplicates — written as one
# continuous pipeline instead of repeated reassignment.
constituency_links <-
  constituency %>%
  html_nodes("tr") %>%
  html_nodes("td") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  paste0(url, .) %>%
  unique()
constituency_links
我的第三次也是最后一次尝试是使用以下代码:
# Fetch every linked page; read_html is passed directly to lapply
# instead of being wrapped in an anonymous function.
all_constituency <- lapply(constituency_links, read_html)
all_constituency
当我运行这段代码时,速度慢了很多,然后我开始收到错误:Error in open.connection(x, "rb") : HTTP error 400。
所以我尝试把它改写成一个循环来运行。
# Same fetch as above, as a loop; the list is indexed by URL.
# NOTE(review): this assumes `all_constituency` already exists as a list.
# The HTTP 400 is expected here — `constituency_links` was built by
# pasting the listing-page URL in front of each href, and the sample
# output later in this file shows those hrefs are already absolute, so
# every request URL is malformed. Drop the paste0() step to fix it.
for(i in constituency_links){
all_constituency[[i]] <- read_html(i)
}
我用这种方法得到了同样的错误信息。非常感谢任何有关如何提取和清理此信息的建议。
我们可以从获取带有议员姓名、党派和选区的文本字符串开始:
# One raw text string per table row; name, party and constituency end up
# concatenated with \r\n / \t separators, as the recorded output shows.
text <- constituency %>% html_nodes('table') %>% html_nodes('tr') %>% html_text()
head(text, 3)
# [1] "Surname, First name\r\n Constituency\r\n\t\r\n "
# [2] "A\r\n back to top\r\n "
# [3] "\r\n Abbott, Ms Diane\r\n (Labour)\r\n \r\n\t\tHackney North and Stoke Newington\r\n\t"
我们可以遍历 text
,解析每个元素并将字符串拆分为我们想要的字段(姓名、政党、选区):
# Parse each row string into the three fields we want. After splitting
# on "\r\n", the name sits in element 2, the party in 3 and the
# constituency in 5; header/navigation rows yield "" or NA there and
# are filtered out later.
dd <- lapply(text, function(x) {
out <- unlist(strsplit(x, "\r\n"))[c(2, 3, 5)] # Use "\r\n" to split the strings
# FIX: "\s" is not a valid escape in an R string literal and fails to
# parse — the regex backslash must be doubled ("\\s"). as.vector()
# strips the names that sapply() attaches to its result.
as.vector(sapply(out, function(x) sub("(\t)+|\\s+", "", x))) # Remove spaces and the "\t"
})
# [[1]]
# [1] "Constituency" "" NA
# [[2]]
# [1] "back to top" "" NA
# [[3]]
# [1] "Abbott, Ms Diane" "(Labour)"
# [3] "Hackney North and Stoke Newington"
现在,将 dd
放入数据框,过滤掉不相关的行(其中 party
为空白):
# Bind the parsed rows into a 3-column data frame and clean it up.
# FIX: spell out `ncol` (the original `nc` relied on partial argument
# matching), and double the backslashes in the regex — "\(" and "\1"
# are invalid escapes in R string literals and fail to parse.
df <- data.frame(matrix(unlist(dd), ncol = 3, byrow = TRUE), stringsAsFactors = FALSE)
names(df) <- c("name", "party", "con")
df$party <- sub("\\((.*)\\)", "\\1", df$party) # Strip the parentheses around the party label
df <- df[df$party != "", ] # Remove rows where party is blank
head(df, 3)
# name party con
# 3 Abbott, Ms Diane Labour Hackney North and Stoke Newington
# 4 Abrahams, Debbie Labour Oldham East and Saddleworth
# 5 Adams, Nigel Conservative Selby and Ainsty
我们现在可以处理链接了。当我们检查链接时,那些与 MPs 相关的链接中有单词 "biographies",因此我们使用它来过滤列表:
# Keep only the anchors pointing at MP biography pages: grab every href
# on the page, then subset with a logical mask.
all_hrefs <- html_attr(html_nodes(constituency, "a"), "href")
links <- all_hrefs[grepl("biographies", all_hrefs)]
head(links, 3)
# [1] "http://www.parliament.uk/biographies/commons/ms-diane-abbott/172"
# [2] "http://www.parliament.uk/biographies/commons/debbie-abrahams/4212"
# [3] "http://www.parliament.uk/biographies/commons/nigel-adams/4057"
并通过添加链接来完成我们的数据框:
# Attach the biography URLs as a fourth column.
# NOTE(review): this assumes `links` has exactly one entry per remaining
# row of `df`, in the same document order — confirm before relying on it.
df$links <- links
str(head(df, 3))
# 'data.frame': 3 obs. of 4 variables:
# $ name : chr "Abbott, Ms Diane" "Abrahams, Debbie" "Adams, Nigel"
# $ party: chr "Labour" "Labour" "Conservative"
# $ con : chr "Hackney North and Stoke Newington" "Oldham East and Saddleworth" "Selby and Ainsty"
# $ links: chr "http://www.parliament.uk/biographies/commons/ms-diane-abbott/172" "http://www.parliament.uk/biographies/commons/debbie-abrahams/4212" "http://www.parliament.uk/biographies/commons/nigel-adams/4057"
非常简单:
library(rvest)
library(stringi)
library(purrr)
library(dplyr)

pg <- read_html("http://www.parliament.uk/mps-lords-and-offices/mps/?sort=0")

# The MP name/party cells carry ASP.NET-generated ids, so match on that prefix.
td_1 <- html_nodes(pg, xpath=".//td[contains(@id,'ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_rptMembers_ctl')]")

# FIX: the party regex needs doubled backslashes — "\(" is an invalid
# escape in an R string literal and fails to parse. Also assign with
# `<-` instead of the right-arrow `->`. (`data_frame()` is deprecated
# in current tibble/dplyr; `tibble()` is the drop-in replacement.)
df <- data_frame(mp_name=html_text(html_nodes(td_1, "a")),
href=html_attr(html_nodes(td_1, "a"), "href"),
party=map_chr(stri_match_all_regex(html_text(td_1), "\\((.*)\\)"), 2),
constituency=html_text(html_nodes(pg, xpath=".//tr/td[2]")))
glimpse(df)
## Observations: 649
## Variables: 4
## $ mp_name <chr> "Abbott, Ms Diane", "Abrahams, Debbie", "Adams, N...
## $ href <chr> "http://www.parliament.uk/biographies/commons/ms-...
## $ party <chr> "Labour", "Labour", "Conservative", "Conservative...
## $ constituency <chr> "Hackney North and Stoke Newington", "Oldham East...