rvest 数据抓取用 NA 替换缺失的 html_node
rvest data scraping replacing missing html_node with NA
大家好!
很长一段时间以来,我一直在尝试抓取数据并以某种方式用 NA 或其他任何东西替换丢失的 html_nodes。但是,我一直没有成功。
谁能帮我弄清楚该怎么做?或者去哪里查看以了解如何操作?
我目前的抓取代码如下:
library("rvest")

# Accumulators: each one receives one page's worth of rows per iteration.
header_bind <- NULL
price <- NULL
runtime <- NULL
ebay <- NULL  # initialised by the script but not used anywhere below

pages <- 2
# Listing URL; the page number is appended right after "_pgn=".
base_url <- "https://www.ebay.com/b/Cell-Phones-Smartphones/9355/bn_320094?LH_BIN=1&LH_ItemCondition=1000&rt=nc&_from=R40&_pgn="

for (page in seq_len(pages)) {
  webpage <- read_html(paste0(base_url, page))

  # Item titles.
  header_Text <- html_text(html_nodes(webpage, ".s-item__title"))
  header_bind <- rbind(header_bind, as.data.frame(header_Text))

  # Item prices.
  text_prim <- html_text(html_nodes(webpage, ".s-item__price"))
  price <- rbind(price, as.data.frame(text_prim))

  # Amount sold/left. Some items have no ".NEGATIVE" node, and html_nodes()
  # simply skips them, so this vector can be shorter than the two above.
  text_runtime <- html_text(html_nodes(webpage, ".NEGATIVE"))
  runtime <- rbind(runtime, as.data.frame(text_runtime))

  # Progress marker: a 0 is printed after every finished page.
  print(0)
}
P.S. 我知道这段代码看起来很糟糕,但我每天都在学习如何写出更好的代码。
这段代码为价格和产品名称各输出 48 条观测值(obs),但销售数量(或剩余数量)只得到 43 条。
我试图通过查看类似的 stackoverflow 帖子来了解其他人是如何做到这一点的,但是,我不知何故无法理解他们的想法。
我的想法是用下面这段代码为缺少节点的元素填入 NA,但它似乎不起作用:
# Text of every ".NEGATIVE" node found on the page.
negative_nodes <- html_nodes(webpage, ".NEGATIVE")
text_runtime <- html_text(negative_nodes)

# The length check looks at the result as a whole: NA is substituted only
# when the page has no ".NEGATIVE" node at all, never for individual items.
if (length(text_runtime) == 0) {
  text_runtime <- NA
}
这段代码同样只返回 43 个元素,并且没有在节点缺失的位置填入任何 NA。
您应该先用 `html_nodes` 提取所有 `.s-item__details` 节点,然后用 `html_node`(不带 s)从每个节点中提取 `.NEGATIVE` 或 `.s-item__hotness`。`html_node` 会为每个输入节点返回一个结果,找不到匹配时该位置为缺失值(NA),因此结果的长度与 `.s-item__details` 节点的数量一致。
library("rvest")

pages <- 1
# Listing URL; the page number is appended right after "_pgn=".
base_url <- "https://www.ebay.com/b/Cell-Phones-Smartphones/9355/bn_320094?LH_BIN=1&LH_ItemCondition=1000&rt=nc&_from=R40&_pgn="

# One data frame per page; they are bound together once, after the loop,
# instead of growing `output` with rbind() on every iteration.
page_results <- vector("list", pages)

for (page in seq_len(pages)) {
  webpage <- read_html(paste0(base_url, page))

  # Item titles and prices, one entry per matching node on the page.
  header_text <- html_text(html_nodes(webpage, ".s-item__title"))
  text_prim <- html_text(html_nodes(webpage, ".s-item__price"))

  # Amount sold / watching, which is missing for some items.
  # html_nodes() collects the per-item detail containers; html_node()
  # (singular) then yields exactly one result per container, with a missing
  # entry where nothing matches, so the vector stays aligned with `item`.
  item <- html_nodes(webpage, ".s-item__details")
  text_runtime <- html_text(html_node(item, ".s-item__hotness"))
  text_runtime[is.na(text_runtime)] <- "0"

  # NOTE(review): data.frame() needs all three vectors to have the same
  # length; this assumes the page has as many titles and prices as
  # ".s-item__details" nodes -- verify against the live markup.
  page_results[[page]] <- data.frame(header_text, text_prim, text_runtime)

  # Progress marker: a 0 is printed after every finished page.
  print(0)
}

# Columns: header_text, text_prim, text_runtime.
output <- do.call(rbind, page_results)
output
# Print the combined table. The commented lines below show the result of one
# run; rows 6-41 are elided ("# ..."). A text_runtime of "0" is the
# placeholder the script writes where the value was missing.
output
# header_text text_prim text_runtime
# 1 Google Nexus 5X H791 32GB (FACTORY UNLOCKED) 5.2" HD - Mint Green LG .88 42 sold
# 2 Motorola Moto Z3 Play 32GB - Unlocked - Deep Indigo - Brand New - XT1929-4 7.02 7 watching
# 3 LG V20 -Brand New - H915 - Unlocked - Ships Express Canada 2.16 5 watching
# 4 Samsung Galaxy J3 Unlocked 5" 16GB GSM 4G LTE Android Smartphone Black SM-J320W8 .89 0
# 5 New ListingSamsung Galaxy A30s SM-A307GN/DS Dual Sim (FACTORY UNLOCKED) 6.4" 64GB 4GB RAM 2.43 0
# ...
# ...
# 42 Black phone 2 - 32GB - Black (Unlocked) Smartphone (Rest of World Version) 8.65 5 watching
# 43 Sagem MC939 9.00 0
# 44 Nokia 6220 classic 9.00 0
# 45 New ListingSmart Mini Wireless HD Dual WiFi Pocket Projector 2G RAM 16G ROM Android 7.1 3.35 0
# 46 nokia 7260 9.25 0
# 47 smartphone 0.00 0
大家好!
很长一段时间以来,我一直在尝试抓取数据并以某种方式用 NA 或其他任何东西替换丢失的 html_nodes。但是,我一直没有成功。
谁能帮我弄清楚该怎么做?或者去哪里查看以了解如何操作?
我目前的抓取代码如下:
library("rvest")

# Accumulators: each one receives one page's worth of rows per iteration.
header_bind <- NULL
price <- NULL
runtime <- NULL
ebay <- NULL  # initialised by the script but not used anywhere below

pages <- 2
# Listing URL; the page number is appended right after "_pgn=".
base_url <- "https://www.ebay.com/b/Cell-Phones-Smartphones/9355/bn_320094?LH_BIN=1&LH_ItemCondition=1000&rt=nc&_from=R40&_pgn="

for (page in seq_len(pages)) {
  webpage <- read_html(paste0(base_url, page))

  # Item titles.
  header_Text <- html_text(html_nodes(webpage, ".s-item__title"))
  header_bind <- rbind(header_bind, as.data.frame(header_Text))

  # Item prices.
  text_prim <- html_text(html_nodes(webpage, ".s-item__price"))
  price <- rbind(price, as.data.frame(text_prim))

  # Amount sold/left. Some items have no ".NEGATIVE" node, and html_nodes()
  # simply skips them, so this vector can be shorter than the two above.
  text_runtime <- html_text(html_nodes(webpage, ".NEGATIVE"))
  runtime <- rbind(runtime, as.data.frame(text_runtime))

  # Progress marker: a 0 is printed after every finished page.
  print(0)
}
P.S. 我知道这段代码看起来很糟糕,但我每天都在学习如何写出更好的代码。
这段代码为价格和产品名称各输出 48 条观测值(obs),但销售数量(或剩余数量)只得到 43 条。
我试图通过查看类似的 stackoverflow 帖子来了解其他人是如何做到这一点的,但是,我不知何故无法理解他们的想法。我的想法是用下面这段代码为缺少节点的元素填入 NA,但它似乎不起作用:
# Text of every ".NEGATIVE" node found on the page.
negative_nodes <- html_nodes(webpage, ".NEGATIVE")
text_runtime <- html_text(negative_nodes)

# The length check looks at the result as a whole: NA is substituted only
# when the page has no ".NEGATIVE" node at all, never for individual items.
if (length(text_runtime) == 0) {
  text_runtime <- NA
}
这段代码同样只返回 43 个元素,并且没有在节点缺失的位置填入任何 NA。
您应该先用 `html_nodes` 提取所有 `.s-item__details` 节点,然后用 `html_node`(不带 s)从每个节点中提取 `.NEGATIVE` 或 `.s-item__hotness`。`html_node` 会为每个输入节点返回一个结果,找不到匹配时该位置为缺失值(NA),因此结果的长度与 `.s-item__details` 节点的数量一致。
library("rvest")

pages <- 1
# Listing URL; the page number is appended right after "_pgn=".
base_url <- "https://www.ebay.com/b/Cell-Phones-Smartphones/9355/bn_320094?LH_BIN=1&LH_ItemCondition=1000&rt=nc&_from=R40&_pgn="

# One data frame per page; they are bound together once, after the loop,
# instead of growing `output` with rbind() on every iteration.
page_results <- vector("list", pages)

for (page in seq_len(pages)) {
  webpage <- read_html(paste0(base_url, page))

  # Item titles and prices, one entry per matching node on the page.
  header_text <- html_text(html_nodes(webpage, ".s-item__title"))
  text_prim <- html_text(html_nodes(webpage, ".s-item__price"))

  # Amount sold / watching, which is missing for some items.
  # html_nodes() collects the per-item detail containers; html_node()
  # (singular) then yields exactly one result per container, with a missing
  # entry where nothing matches, so the vector stays aligned with `item`.
  item <- html_nodes(webpage, ".s-item__details")
  text_runtime <- html_text(html_node(item, ".s-item__hotness"))
  text_runtime[is.na(text_runtime)] <- "0"

  # NOTE(review): data.frame() needs all three vectors to have the same
  # length; this assumes the page has as many titles and prices as
  # ".s-item__details" nodes -- verify against the live markup.
  page_results[[page]] <- data.frame(header_text, text_prim, text_runtime)

  # Progress marker: a 0 is printed after every finished page.
  print(0)
}

# Columns: header_text, text_prim, text_runtime.
output <- do.call(rbind, page_results)
output
# Print the combined table. The commented lines below show the result of one
# run; rows 6-41 are elided ("# ..."). A text_runtime of "0" is the
# placeholder the script writes where the value was missing.
output
# header_text text_prim text_runtime
# 1 Google Nexus 5X H791 32GB (FACTORY UNLOCKED) 5.2" HD - Mint Green LG .88 42 sold
# 2 Motorola Moto Z3 Play 32GB - Unlocked - Deep Indigo - Brand New - XT1929-4 7.02 7 watching
# 3 LG V20 -Brand New - H915 - Unlocked - Ships Express Canada 2.16 5 watching
# 4 Samsung Galaxy J3 Unlocked 5" 16GB GSM 4G LTE Android Smartphone Black SM-J320W8 .89 0
# 5 New ListingSamsung Galaxy A30s SM-A307GN/DS Dual Sim (FACTORY UNLOCKED) 6.4" 64GB 4GB RAM 2.43 0
# ...
# ...
# 42 Black phone 2 - 32GB - Black (Unlocked) Smartphone (Rest of World Version) 8.65 5 watching
# 43 Sagem MC939 9.00 0
# 44 Nokia 6220 classic 9.00 0
# 45 New ListingSmart Mini Wireless HD Dual WiFi Pocket Projector 2G RAM 16G ROM Android 7.1 3.35 0
# 46 nokia 7260 9.25 0
# 47 smartphone 0.00 0