使用 rselenium 和 rvest 通过 while 循环从跨多个页面的多个表中提取数据

Using rselenium and rvest to pull data from multiple tables across multiple pages with while loop

我正在编写一段 R 代码,它会登录一个受密码保护的网站,进入该网站的特定页面,然后从特定的 table 中收集数据。该 table 显示的是指定日期的销售数据。问题在于,某些日期的数据不止一 "page"(必须点击"下一页"按钮才能翻页)。因此,对于每一天,我都必须抓取多个页面上的 table,并且要从开始日期起逐日抓取多天的数据。

例如,我打开显示 01/01/2020 销售数据的页面。假设这张 table 共有三页数据。代码应当抓取当天全部三页的 table 数据,然后修改日期输入,转到 01/02/2020 的页面,并重复同样的操作,直到今天为止。

现在,我已经完成了大部分工作,但是我遇到了这个烦人的问题

Error in UseMethod("html_table") : no applicable method for 'html_table' applied to an object of class "xml_missing"

错误。它存在于以下循环函数中:

###loop to collect all data for each day when there are multiple pages
# Excerpt of the scraping loop. It relies on objects created elsewhere in the
# script: `remDr` (the RSelenium client), `start.date.date` and `end.date.date`.
#set it so we can input custom date ranges
# (clicks the button of the date dropdown, then the 9th entry of its list)
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/button')$clickElement()
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/ul/li[9]/a')$clickElement()

#set up final dataframe; the rows collected for each day are appended to it
items_table_final.df <- data.frame()


# day currently being scraped; advanced by one at the end of every iteration
date <- start.date.date


#loop start for cycling through days
while (date <= end.date.date){


#create text version of the date (month-day-year) to enter into the webpage
# NOTE(review): `date` is presumably already a Date here, in which case the
# inner as.Date() returns it unchanged and its `format` argument is not used.
date.char <- format(as.Date(date, format = "%d-%m-%Y"), "%m-%d-%Y")



#fill in the date range: start and end are both set to the same day
# The first call on each field sends the key codes labelled control and delete
# around "a" -- presumably select-all followed by delete, to clear the field --
# and the second call types the new date.
remDr$findElement("name", "reportDateStart")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
remDr$findElement("name", "reportDateStart")$sendKeysToElement(list(date.char))

remDr$findElement("name", "reportDateEnd")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
remDr$findElement("name", "reportDateEnd")$sendKeysToElement(list(date.char))

#setup and or clear temporary data frame that holds the rows of the current day
items_table.df <- data.frame("Menu Item" = character(),
                            "Menu Group" = character(),
                            "Menu" = character(),
                            "Item Quantity" = integer(),
                            "Net Amount" = integer(),
                            stringsAsFactors = FALSE)

#go to the data for the selected date range
remDr$findElement("id", "update-btn")$clickElement()


# page counter; the inner loop gives up after 100 pages
pages <- 1


 #loop start for cycling through pages within a specified day
 while (pages <= 100){
   #fills a second temp data frame with data from the displayed page
   # html_node() yields an `xml_missing` object when "table#top-items" is not
   # in the page source, and html_table() has no method for that class (the
   # error quoted above).
   # NOTE(review): nothing waits for the page to finish loading between the
   # preceding click and this read.
   items_html <- read_html(remDr$getPageSource()[[1]])
   items_table_new <- items_html %>%
       rvest::html_node("table#top-items") %>%
       rvest::html_table(fill = TRUE)


     #test if the page loop needs to stop: the row count of this page is
     #compared with the row count match_df() returns for the rows collected so
     #far against this page (presumably equal when the page repeats known data)
     if(nrow(items_table_new) == nrow(match_df(items_table.df, items_table_new))){

       break

     } else {
       #add the new data to the earlier temp data frame IF it isnt a match to something already there
       items_table.df <- rbind(items_table.df, items_table_new)

       #hit the next page arrow button
       # NOTE(review): findElement() is expected to raise an error when the
       # page has no "Next" link (e.g. a single page of results) -- confirm.
       remDr$findElement("link text", "Next →")$clickElement()
     }
   pages <- pages + 1
}

#add the new data to the final data frame 
items_table_final.df <- rbind(items_table_final.df, items_table.df)

# move on to the next calendar day
date <- date + 1
}

当我执行 traceback() 时,我得到以下输出:

9: rvest::html_table(., fill = TRUE)
8: function_list[[k]](value)
7: withVisible(function_list[[k]](value))
6: freduce(value, `_function_list`)
5: `_fseq`(`_lhs`)
4: eval(quote(`_fseq`(`_lhs`)), env, env)
3: eval(quote(`_fseq`(`_lhs`)), env, env)
2: withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
1: items_html %>% rvest::html_node("table#top-items") %>% rvest::html_table(fill = TRUE)

所以这让我觉得从 table 获取数据的那段代码有问题。但是当我手动运行它时,没有问题。事实上,如果我手动逐步执行整个循环,不会遇到任何错误。我甚至可以单独运行嵌套的内层循环(即用于翻页的循环,其中包含被认为有问题的代码),也完全正常。只有运行外层循环时才会出问题。

我已经使用网站上空的和填充的 table 数据对此进行了测试。我已确认 table 具有一致的名称。我已确认数据已从网页中正确获取并保存到我指定的数据框中。

如有任何想法或建议,我们将不胜感激!您将在下方找到完整代码(网页和密码数据已删除)。

library(RSelenium)
library(rvest)
library(tidyverse)
library(plyr)


#### adjustable variables ####
# First day to collect, written as MONTH-DAY-YEAR using digits only: at least
# two digits for the month and the day, four digits for the year.
start.date <- "01-01-2020"

# Last day to collect. Defaults to the day the program runs; to use another
# end date, replace it with a string in the same format as `start.date`.
end.date <- Sys.Date()

# Date versions of both bounds, which the scraping loop compares against.
start.date.date <- as.Date(start.date, "%m-%d-%Y")
end.date.date <- as.Date(end.date, "%m-%d-%Y")



####data retrieval code####
# Start a Selenium server with a Chrome browser and keep the client handle in
# `remDr`, which the rest of the script drives.
# `chromever` pins the chromedriver version; if you are running Chrome 84, put
# "latest" (lower case) instead.
# `verbose = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned.
rD <- rsDriver(chromever = "83.0.4103.39", verbose = FALSE)
remDr <- rD$client

# navigate to login page
remDr$navigate("**LOGIN WEB PAGE LINK**")

# fill in login info and submit
remDr$findElement("id", "email")$sendKeysToElement(list("**LOGIN DETAIL:USERNAME**"))
remDr$findElement("id", "password")$sendKeysToElement(list("**LOGIN DETAIL: PASSWORD**"))
remDr$findElement("id", "log-in")$clickElement()

# go to the page that displays the data table
remDr$navigate("**WEB PAGE THAT HAS THE DATA TABLE DISPLAYED**")



###loop to collect all data for each day when there are multiple pages
# This is the loop that raises the html_table() / `xml_missing` error quoted in
# the question. It drives the `remDr` client on the report page.
#set it so we can input custom date ranges
# (clicks the button of the date dropdown, then the 9th entry of its list)
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/button')$clickElement()
remDr$findElement(using = 'xpath', '//*[@id="date-dropdown-container"]/ul/li[9]/a')$clickElement()

#set up final dataframe; the rows collected for each day are appended to it
items_table_final.df <- data.frame()


# day currently being scraped; advanced by one at the end of every iteration
date <- start.date.date


#loop start for cycling through days
while (date <= end.date.date){


  #create text version of the date (month-day-year) to enter into the webpage
  # `date` starts as start.date.date, a Date, so the inner as.Date() returns it
  # unchanged and its `format` argument is not used.
  date.char <- format(as.Date(date, format = "%d-%m-%Y"), "%m-%d-%Y")



  #fill in the date range: start and end are both set to the same day
  # The first call on each field sends the key codes labelled control and
  # delete around "a" -- presumably select-all followed by delete, to clear the
  # field -- and the second call types the new date.
  remDr$findElement("name", "reportDateStart")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
  remDr$findElement("name", "reportDateStart")$sendKeysToElement(list(date.char))

  remDr$findElement("name", "reportDateEnd")$sendKeysToElement(sendKeys = list(control = "\uE009", "a", delete = "\uE017"))
  remDr$findElement("name", "reportDateEnd")$sendKeysToElement(list(date.char))

  #setup and or clear temporary data frame that holds the rows of the current day
  items_table.df <- data.frame("Menu Item" = character(),
                               "Menu Group" = character(),
                               "Menu" = character(),
                               "Item Quantity" = integer(),
                               "Net Amount" = integer(),
                               stringsAsFactors = FALSE)

  #go to the data for the selected date range
  remDr$findElement("id", "update-btn")$clickElement()


  # page counter; the inner loop gives up after 100 pages
  pages <- 1


    #loop start for cycling through pages within a specified day
    while (pages <= 100){
      #fills a second temp data frame with data from the displayed page
      # html_node() yields an `xml_missing` object when "table#top-items" is
      # not in the page source, and html_table() has no method for that class.
      # NOTE(review): nothing waits for the page to finish loading between the
      # preceding click and this read.
      items_html <- read_html(remDr$getPageSource()[[1]])
      items_table_new <- items_html %>%
          rvest::html_node("table#top-items") %>%
          rvest::html_table(fill = TRUE)


        #test if the page loop needs to stop: the row count of this page is
        #compared with the row count match_df() returns for the rows collected
        #so far against this page (presumably equal when the page repeats
        #known data)
        if(nrow(items_table_new) == nrow(match_df(items_table.df, items_table_new))){

          break

        } else {
          #add the new data to the earlier temp data frame IF it isnt a match to something already there
          items_table.df <- rbind(items_table.df, items_table_new)

          #hit the next page arrow button
          # NOTE(review): findElement() is expected to raise an error when the
          # page has no "Next" link (e.g. a single page of results) -- confirm.
          remDr$findElement("link text", "Next →")$clickElement()
        }
      pages <- pages + 1
  }

  #add the new data to the final data frame 
  items_table_final.df <- rbind(items_table_final.df, items_table.df)

  # move on to the next calendar day
  date <- date + 1
}

我刚刚解决了!原来有两个问题。首先,代码在页面加载完成之前就执行了从 table 收集数据的那一行,因此此时带有该 ID 的 table 实际上还不存在,无法从中收集数据。为了解决这个问题,我只是添加了一条 Sys.sleep(5) 命令,让程序等待 5 秒。第二个问题是,如果某个页面的 table 为空,或者 table 只有一页,就不存在用于翻页的 "Next" 元素。所以我添加了一个 try 来跳过这个错误,让循环继续运行并走到上面 while 语句中的计数器,因为这样只需要大约 2 秒。我把更正后的循环贴在这里,供遇到类似问题的人参考!

# Corrected day-by-day scraping loop.
#
# Expects these objects to exist already (the rest of the script creates
# them): `remDr` (RSelenium client on the report page with the custom date
# range enabled), `date` (Date of the first day to scrape), `end.date.date`
# (last day to scrape) and `items_table_final.df` (accumulator for all rows).
#
# Compared with the failing loop it
#   * waits for the report to refresh after pressing the update button and
#     then polls until "table#top-items" exists, instead of letting
#     html_table() fail on an `xml_missing` node;
#   * treats a missing "Next" link (empty table or a single page) as the last
#     page instead of hiding every error behind try(silent = TRUE);
#   * tolerates a table with zero rows when tagging rows with the date;
#   * warns when the per-day page cap is reached, because rows may be missing.

# Seconds to wait for the report to refresh after pressing the update button.
report_refresh_wait <- 5
# Largest number of result pages read for a single day.
max_pages_per_day <- 20

# Empty the date input called `field_name`, then type `value` into it.
fill_date_field <- function(driver, field_name, value) {
  # The element is looked up again before each call, exactly as the original
  # loop did, so a re-rendered input cannot leave a stale reference behind.
  driver$findElement("name", field_name)$sendKeysToElement(
    sendKeys = list(control = "\uE009", "a", delete = "\uE017")
  )
  driver$findElement("name", field_name)$sendKeysToElement(list(value))
  invisible(NULL)
}

# Return the currently displayed "table#top-items" as a data frame.
#
# The page source is re-read every `poll_interval` seconds until the table is
# present. If it has not appeared after `timeout` seconds an error naming the
# selector is raised.
read_items_table <- function(driver, timeout = 10, poll_interval = 0.5) {
  deadline <- Sys.time() + timeout
  repeat {
    page <- read_html(driver$getPageSource()[[1]])
    table_node <- rvest::html_node(page, "table#top-items")
    if (!inherits(table_node, "xml_missing")) {
      return(rvest::html_table(table_node, fill = TRUE))
    }
    if (Sys.time() >= deadline) {
      stop("table#top-items did not appear within ", timeout, " seconds",
           call. = FALSE)
    }
    Sys.sleep(poll_interval)
  }
}

#loop start for cycling through days
while (date <= end.date.date) {

  #create text version of the date (month-day-year) to enter into the webpage;
  #`date` is already a Date, so it can be formatted directly
  date.char <- format(date, "%m-%d-%Y")

  #fill in the date range: start and end are both set to the same day
  fill_date_field(remDr, "reportDateStart", date.char)
  fill_date_field(remDr, "reportDateEnd", date.char)

  #setup and or clear temporary data frame
  items_table.df <- data.frame("Menu Item" = character(),
                               "Menu Group" = character(),
                               "Menu" = character(),
                               "Item Quantity" = integer(),
                               "Net Amount" = integer(),
                               stringsAsFactors = FALSE)

  #go to the data for the selected date range
  remDr$findElement("id", "update-btn")$clickElement()

  #add a system pause to avoid an error where the page is not yet loaded
  Sys.sleep(report_refresh_wait)

  #loop start for cycling through pages within a specified day
  hit_page_cap <- TRUE
  for (pages in seq_len(max_pages_per_day)) {
    #fills a second temp data frame with data from the displayed page
    items_table_new <- read_items_table(remDr)

    #add the date of the data to the dataframe; rep() keeps this valid when
    #the table has no rows
    items_table_new$date <- rep(date.char, nrow(items_table_new))

    #test if the page loop needs to stop: every row on this page is already
    #in the rows collected for this day
    if (nrow(items_table_new) == nrow(match_df(items_table.df, items_table_new))) {
      hit_page_cap <- FALSE
      break
    }

    #add the new data to the earlier temp data frame
    items_table.df <- rbind(items_table.df, items_table_new)

    #look for the next page arrow button; findElements() returns an empty
    #list when there is none (empty table or only one page)
    next_links <- remDr$findElements("link text", "Next →")
    if (length(next_links) == 0) {
      hit_page_cap <- FALSE
      break
    }

    # NOTE(review): if the site swaps pages without a full page load, the next
    # read may still see the previous page and end the day early -- confirm
    # against the real site and add a wait here if needed.
    clicked <- tryCatch(
      {
        next_links[[1]]$clickElement()
        TRUE
      },
      error = function(e) {
        warning("Could not open the next page for ", date.char, ": ",
                conditionMessage(e), call. = FALSE)
        FALSE
      }
    )
    if (!clicked) {
      hit_page_cap <- FALSE
      break
    }
  }

  if (hit_page_cap) {
    warning("Stopped after ", max_pages_per_day, " pages for ", date.char,
            "; rows may be missing", call. = FALSE)
  }

  #add the new data to the final data frame; done once per day so that rows
  #already collected survive an error on a later day
  items_table_final.df <- rbind(items_table_final.df, items_table.df)

  date <- date + 1
}