R 中的 Web 抓取 - 获取 "Error in records[[x]] .... more elements supplied than there are to replace"
Webscraping In R - Getting "Error in records[[x]] .... more elements supplied than there are to replace"
请注意，我对 R 以及用 R 做网络抓取都还很陌生，因此在回复和解释时请考虑到这一点……
我正在尝试通过网络抓取入住日期、评论标题和评论
这是我生成我想使用的 URL 列表的地方:
library(rvest)
# Generate the list of review-page URLs to scrape.
webpage_list <- vector(mode = "list")
# The list starts out empty; print it to confirm.
webpage_list
# n takes the values 5, 10 and 15, so only those positions are assigned;
# the positions in between are left as NULL.
# NOTE(review): the URL string contains no {n} placeholder, so every
# iteration stores the same URL -- presumably the real URL interpolates n.
for(n in seq(from=5, to=15, by=5)){
webpage_list[[n]] <- glue::glue("https://www.sampleURL.com#REVIEWS")
}
# Drop the NULL placeholders so only the assigned URLs remain.
webpage_list[sapply(webpage_list,is.null)] <- NULL
webpage_list
然后我把列表转换为字符向量，并遍历其中的每个 URL，开始定位网页上我想要抓取的区域：
# Flatten the list of URLs into a plain vector and show its class.
webpage_list2 <- unlist(webpage_list)
class(webpage_list2)
# Visit each URL and select the nodes matching the two CSS classes.
for(i in seq_along(webpage_list2)){
webpage <- read_html(webpage_list2[i])
results <- webpage %>% html_nodes(".oETBfkHU , ._3hDPbqWO")
print(results)
# Building the dataset
# NOTE(review): `results` and `records` are overwritten on every iteration,
# so only the values from the last URL survive the loop. `records` is
# created here as a character vector, not a list.
records <- vector("character", length = (length(results)))
print(records)
}
到目前为止(我认为)似乎都在按照我的意愿工作
# For each index of `results`, re-read a page and pull out the three fields.
# NOTE(review): the loop runs over seq_along(results) but uses x to index
# webpage_list2, so the two lengths are assumed to line up -- confirm.
for (x in seq_along(results)) {
url <- read_html(webpage_list2[x])
# str_c()/str_sub() are called with the scraped text as their only argument.
# NOTE(review): these look like stringr functions; no library(stringr) call
# is visible in this snippet.
dateOfStay <- str_c(url %>%
html_nodes("._34Xs-BQm") %>%
html_text())
reviewTitle <- str_sub(url %>%
html_nodes(".glasR4aX")%>%
html_text())
review <- str_sub(url %>%
html_nodes(".IRsGHoPm") %>%
html_text())
# Store the fields for this page as one data frame in `records`.
# NOTE(review): `records` was created with vector("character", ...), so this
# assignment tries to put a data frame into a character vector element.
records[[x]] <- data_frame(dateOfStay = dateOfStay, reviewTitle = reviewTitle, review = review)#, reviewTitle = reviewTitle, review = review
}
# Build the final data frame by stacking the per-page data frames.
DF <- bind_rows(records)
由此我得到以下错误:
Error in records[[x]] <- data_frame(dateOfStay = dateOfStay, reviewTitle = reviewTitle, : more elements supplied than there are to replace
任何帮助我都将不胜感激。另请注意，我对 R 以及用 R 做网络抓取都还很陌生，因此在回复和解释时请考虑到这一点。
不需要实际抓取网页，就能找到问题所在。您正在尝试把一个数据框放进字符向量的单个元素里。数据框不是字符值，它包含多个元素，因此长度对不上，于是报错。有两种修复方法：把 records 创建为列表，或者把数据框包在一个列表里，使其成为单个元素。我建议把 records 创建为列表。
# Minimal reproduction: `records` is a character vector of length 3.
records <- vector("character", length = (3))
# Assigning a data frame to a single element of it fails:
records[[2]] <- data.frame(test = "A",test2 = "B")
# Error in records[[2]] <- data.frame(test = "A", test2 = "B") :
# more elements supplied than there are to replace
# Option 1: make `records` a list, so each element can hold a data frame.
# vector("list", length = 3) pre-allocates three NULL slots; note that
# list(length = 3) would instead create a one-element list named "length".
records <- vector("list", length = 3)
records[[2]] <- data.frame(test = "A", test2 = "B")
records
# [[1]]
# NULL
#
# [[2]]
#   test test2
# 1    A     B
#
# [[3]]
# NULL
# Option 2: keep the character vector, but wrap the data frame in a
# one-element list so that a single item is assigned.
records <- character(3)
records[[2]] <- list(
  data.frame(test = "A", test2 = "B")
)
# Printing `records` now gives:
# [[1]]
# [1] ""
#
# [[2]]
# [[2]][[1]]
#   test test2
# 1    A     B
#
#
# [[3]]
# [1] ""
请注意，我对 R 以及用 R 做网络抓取都还很陌生，因此在回复和解释时请考虑到这一点……
我正在尝试通过网络抓取入住日期、评论标题和评论
这是我生成我想使用的 URL 列表的地方:
library(rvest)
# Generate the list of review-page URLs to scrape.
webpage_list <- vector(mode = "list")
# The list starts out empty; print it to confirm.
webpage_list
# n takes the values 5, 10 and 15, so only those positions are assigned;
# the positions in between are left as NULL.
# NOTE(review): the URL string contains no {n} placeholder, so every
# iteration stores the same URL -- presumably the real URL interpolates n.
for(n in seq(from=5, to=15, by=5)){
webpage_list[[n]] <- glue::glue("https://www.sampleURL.com#REVIEWS")
}
# Drop the NULL placeholders so only the assigned URLs remain.
webpage_list[sapply(webpage_list,is.null)] <- NULL
webpage_list
然后我把列表转换为字符向量，并遍历其中的每个 URL，开始定位网页上我想要抓取的区域：
# Flatten the list of URLs into a plain vector and show its class.
webpage_list2 <- unlist(webpage_list)
class(webpage_list2)
# Visit each URL and select the nodes matching the two CSS classes.
for(i in seq_along(webpage_list2)){
webpage <- read_html(webpage_list2[i])
results <- webpage %>% html_nodes(".oETBfkHU , ._3hDPbqWO")
print(results)
# Building the dataset
# NOTE(review): `results` and `records` are overwritten on every iteration,
# so only the values from the last URL survive the loop. `records` is
# created here as a character vector, not a list.
records <- vector("character", length = (length(results)))
print(records)
}
到目前为止(我认为)似乎都在按照我的意愿工作
# For each index of `results`, re-read a page and pull out the three fields.
# NOTE(review): the loop runs over seq_along(results) but uses x to index
# webpage_list2, so the two lengths are assumed to line up -- confirm.
for (x in seq_along(results)) {
url <- read_html(webpage_list2[x])
# str_c()/str_sub() are called with the scraped text as their only argument.
# NOTE(review): these look like stringr functions; no library(stringr) call
# is visible in this snippet.
dateOfStay <- str_c(url %>%
html_nodes("._34Xs-BQm") %>%
html_text())
reviewTitle <- str_sub(url %>%
html_nodes(".glasR4aX")%>%
html_text())
review <- str_sub(url %>%
html_nodes(".IRsGHoPm") %>%
html_text())
# Store the fields for this page as one data frame in `records`.
# NOTE(review): `records` was created with vector("character", ...), so this
# assignment tries to put a data frame into a character vector element.
records[[x]] <- data_frame(dateOfStay = dateOfStay, reviewTitle = reviewTitle, review = review)#, reviewTitle = reviewTitle, review = review
}
# Build the final data frame by stacking the per-page data frames.
DF <- bind_rows(records)
由此我得到以下错误:
Error in records[[x]] <- data_frame(dateOfStay = dateOfStay, reviewTitle = reviewTitle, : more elements supplied than there are to replace
任何帮助我都将不胜感激。另请注意，我对 R 以及用 R 做网络抓取都还很陌生，因此在回复和解释时请考虑到这一点。
不需要实际抓取网页，就能找到问题所在。您正在尝试把一个数据框放进字符向量的单个元素里。数据框不是字符值，它包含多个元素，因此长度对不上，于是报错。有两种修复方法：把 records 创建为列表，或者把数据框包在一个列表里，使其成为单个元素。我建议把 records 创建为列表。
# Minimal reproduction: `records` is a character vector of length 3.
records <- vector("character", length = (3))
# Assigning a data frame to a single element of it fails:
records[[2]] <- data.frame(test = "A",test2 = "B")
# Error in records[[2]] <- data.frame(test = "A", test2 = "B") :
# more elements supplied than there are to replace
# Option 1: make `records` a list, so each element can hold a data frame.
# vector("list", length = 3) pre-allocates three NULL slots; note that
# list(length = 3) would instead create a one-element list named "length".
records <- vector("list", length = 3)
records[[2]] <- data.frame(test = "A", test2 = "B")
records
# [[1]]
# NULL
#
# [[2]]
#   test test2
# 1    A     B
#
# [[3]]
# NULL
# Option 2: keep the character vector, but wrap the data frame in a
# one-element list so that a single item is assigned.
records <- character(3)
records[[2]] <- list(
  data.frame(test = "A", test2 = "B")
)
# Printing `records` now gives:
# [[1]]
# [1] ""
#
# [[2]]
# [[2]][[1]]
#   test test2
# 1    A     B
#
#
# [[3]]
# [1] ""