使用嵌套循环将变量保存到 dataframe/list
Saving variables to a dataframe/list with a nested loop
我目前正在从事一个从网站源代码中抓取信息的项目。这是我得到的代码:
require(dplyr)
require(tidyverse)
require(stringi)
require(stringr)
require(rvest)
require(purrr)
library(data.table)
datalist = list()
# Looping through all pages on the website
for (page in 1:91){
# Constructing the URL to download the html source code from
WebUrl <- paste0("https://www.apple.com/newsroom/archive/?page=",
page)
download.file(WebUrl,
destfile = paste0("tempdir/Source code", page, ".txt"))
# Grabbing the relevant node from the source code and converting it to a df
webpages_df <- webpages %>%
html_nodes("a") %>%
map(html_attrs) %>%
map_df(~as.list(.))
# Removing NA values from the "aria-label" column where the relevant string is and
# renaming the column
headlines <- as.data.frame(webpages_df$`aria-label`) %>%
filter(!is.na(webpages_df$`aria-label`)) %>%
setnames(old = "webpages_df$`aria-label`", new = "Strings")
# Removing the not relevant strings
# Regex is matching for any word with a number, comma and four digits behind. E.g
# September 1, 2021
headlines <- headlines %>%
filter(grepl("([A-Z]\w+\s[0-9][0-9]?,+\s[0-9][0-9][0-9][0-9]?)", Strings))
# Looping over the rows to extract the different variables and store them
# Each variable is created with regex to extract the relevant information
# The goal for the loop is to extract the values from a source file for a given
# node with relevant information
for (r in 1:nrow(headlines)){
dates <- stri_extract_all(headlines$Strings, regex = "([A-Z]\w+\s[0-9][0-9]?,+\s[0- 9][0-9][0-9][0-9]?)")
category <- stri_extract_all(headlines$Strings, regex = "([A-Z][A-Z][A-Z][A-Z][A-Z]\s\w+|[A-Z][A-Z][A-Z]\w+)")
titles <- str_remove_all(headlines$Strings, pattern = "([A-Z]\w+\s[0-9][0-9]?,+\s[0-9][0-9][0-9][0-9]) (-\s[A-Z]?\w+.[A-Z]+..)" )
article.url <- webpages_df %>%
filter(grepl( pattern = "(/[a-z]?\w[0-9]+/[0-9]+/[a-z]?\w+.\w+[a-z]?)", href))
article.url <- paste0("https://www.apple.com", article.url$href)
tempmatrix <- matrix(c(dates, category, titles, article.url), ncol = 4)
datalist[[r]] <- rbind(tempmatrix, datalist)
}
}
这可以将页面上的所有各种源代码下载到设置的目录中。但我似乎无法让嵌套循环工作。我的目标是遍历每个源代码文件并创建变量日期、类别、标题和 url,然后将其附加到循环外的列表中。稍后将其转换为具有上面列出的列的结构化数据框。
虽然这个代码块不起作用,但我可以在没有嵌套循环和 tempmatrix/datalist[[r]]
的情况下让它工作。这样的结果只是我想要的结构中的最后一个文件信息。
非常感谢 input/tips 如何解决我手头的问题。我是 R 的新手,所以我的代码可能效率低下。
尝试在你的内部循环中迭代构建一个 data.frame,然后在你的外部循环的每次迭代中将该数据帧附加到一个列表中。如果这不起作用,请告诉我。
for (page in 1:91){
intermediate <- c()
...
for (r in 1:nrow(headlines)){
...
intermediate <- rbind(tempmatrix, intermediate)
}
datalist[[page]] <- intermediate
}
我目前正在从事一个从网站源代码中抓取信息的项目。这是我得到的代码:
require(dplyr)
require(tidyverse)
require(stringi)
require(stringr)
require(rvest)
require(purrr)
library(data.table)
datalist = list()
# Looping through all pages on the website
for (page in 1:91){
# Constructing the URL to download the html source code from
WebUrl <- paste0("https://www.apple.com/newsroom/archive/?page=",
page)
download.file(WebUrl,
destfile = paste0("tempdir/Source code", page, ".txt"))
# Grabbing the relevant node from the source code and converting it to a df
webpages_df <- webpages %>%
html_nodes("a") %>%
map(html_attrs) %>%
map_df(~as.list(.))
# Removing NA values from the "aria-label" column where the relevant string is and
# renaming the column
headlines <- as.data.frame(webpages_df$`aria-label`) %>%
filter(!is.na(webpages_df$`aria-label`)) %>%
setnames(old = "webpages_df$`aria-label`", new = "Strings")
# Removing the not relevant strings
# Regex is matching for any word with a number, comma and four digits behind. E.g
# September 1, 2021
headlines <- headlines %>%
filter(grepl("([A-Z]\w+\s[0-9][0-9]?,+\s[0-9][0-9][0-9][0-9]?)", Strings))
# Looping over the rows to extract the different variables and store them
# Each variable is created with regex to extract the relevant information
# The goal for the loop is to extract the values from a source file for a given
# node with relevant information
for (r in 1:nrow(headlines)){
dates <- stri_extract_all(headlines$Strings, regex = "([A-Z]\w+\s[0-9][0-9]?,+\s[0- 9][0-9][0-9][0-9]?)")
category <- stri_extract_all(headlines$Strings, regex = "([A-Z][A-Z][A-Z][A-Z][A-Z]\s\w+|[A-Z][A-Z][A-Z]\w+)")
titles <- str_remove_all(headlines$Strings, pattern = "([A-Z]\w+\s[0-9][0-9]?,+\s[0-9][0-9][0-9][0-9]) (-\s[A-Z]?\w+.[A-Z]+..)" )
article.url <- webpages_df %>%
filter(grepl( pattern = "(/[a-z]?\w[0-9]+/[0-9]+/[a-z]?\w+.\w+[a-z]?)", href))
article.url <- paste0("https://www.apple.com", article.url$href)
tempmatrix <- matrix(c(dates, category, titles, article.url), ncol = 4)
datalist[[r]] <- rbind(tempmatrix, datalist)
}
}
这可以将页面上的所有各种源代码下载到设置的目录中。但我似乎无法让嵌套循环工作。我的目标是遍历每个源代码文件并创建变量日期、类别、标题和 url,然后将其附加到循环外的列表中。稍后将其转换为具有上面列出的列的结构化数据框。
虽然这个代码块不起作用,但我可以在没有嵌套循环和 tempmatrix/datalist[[r]]
的情况下让它工作。这样的结果只是我想要的结构中的最后一个文件信息。
非常感谢 input/tips 如何解决我手头的问题。我是 R 的新手,所以我的代码可能效率低下。
尝试在你的内部循环中迭代构建一个 data.frame,然后在你的外部循环的每次迭代中将该数据帧附加到一个列表中。如果这不起作用,请告诉我。
for (page in 1:91){
intermediate <- c()
...
for (r in 1:nrow(headlines)){
...
intermediate <- rbind(tempmatrix, intermediate)
}
datalist[[page]] <- intermediate
}