调整雅虎股票数据网络抓取以循环日期

Adjusting Yahoo Stock Data Web Scraping to Loop over Dates

我正在使用一个与此类似的脚本(原文中的链接已缺失)。简而言之,代理问题(或其他问题)导致 API 超时,因此我不得不直接解析雅虎财经数据的网址,而不是使用 quantmod 来获取历史股票数据。由于 Yahoo Finance 在你向下滚动之前只加载 100 行(即使你把日期范围设置得比这更大),我需要让这个 "for loop" 循环遍历我创建的 100 天时间段列表。开始日期采用 Yahoo Finance 使用的整数格式。

以下是按 100 天递增的 df 示例,该列表会不断变化/增长。证券列表同样是从文件中导入并动态变化的,但我在下面用 "symbols" 作为示例。

在下面的代码中,我希望 dateGroup[1,1] 和 dateGroup[1,2] 能自动先取 dateGroup 第一行的值,然后取第二行的值,依此类推,最后构建一个包含所有值的数据框。

# One row per request window: (start, end) in the integer date format that
# Yahoo Finance uses for the period1/period2 URL parameters.
dateGroup <- as.data.frame(matrix(
    c(1509519600, 1518073200,
      1518159600, 1526713200,
      1526799600, 1535353200,
      1535439600, 1543993200,
      1544079600, 1550732400),
    ncol = 2, byrow = TRUE,
    dimnames = list(NULL, c("start", "end"))
))

for (s in symbols) {
    # Build the history URL for this symbol; only the first row of dateGroup
    # (a single start/end window) is used here.
    url <- paste0("https://finance.yahoo.com/quote/", s,
                  "/history?period1=", dateGroup[1, 1],
                  "&period2=", dateGroup[1, 2],
                  "&interval=1d&filter=history&frequency=1d")
    webpage <- readLines(url, warn = FALSE)
    html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
    tableNodes <- getNodeSet(html, "//table")

    # Parse the first <table> on the page and tag every row with its symbol.
    df <- readHTMLTable(tableNodes[[1]],
                        header = c("Date", "Open", "High", "Low", "Close",
                                   "Adj. Close", "Volume"))
    df["symbol"] <- s

    # Store the result in a variable named after the symbol.
    assign(s, df)
}

# Collect the per-symbol data frames (variables named after each symbol)
# and stack them by row into one data frame.
symboldatalist <- cbind(mget(symbols))
symboldata <- do.call(rbind, symboldatalist)

# Move the last column (symbol) to the front. seq_len() avoids the precedence
# trap in `1:ncol(x)-1`, which parses as `(1:ncol(x)) - 1` and yields a 0 index.
symboldata <- symboldata[, c(ncol(symboldata), seq_len(ncol(symboldata) - 1))]

# "[Location]" is a placeholder for the output file path.
write.table(symboldata, "[Location]", sep = ",", row.names = FALSE, col.names = TRUE)

任何帮助都会很棒。谢谢!

我尝试过几种办法。我试过创建一个 url 矩阵(以股票代码作为列标题,并使用第 1 列和第 2 列中的日期),然后逐个抓取这些网址。我还尝试编写程序在 Yahoo Finance 页面上自动滚动,但遇到了同样的超时错误。

可以考虑使用 mapply 或其不做简化的包装函数 Map,对开始日期、结束日期以及相应的股票代码逐元素迭代。此外,避免使用 assign/get,而是构建一个数据框列表,最后统一用 rbind 合并:

library(XML)
...
# One row per request window; start/end are the values later placed into the
# period1/period2 parameters of the Yahoo Finance URL.
dateGroup <- as.data.frame(matrix(
    c(1509519600, 1518073200,
      1518159600, 1526713200,
      1526799600, 1535353200,
      1535439600, 1543993200,
      1544079600, 1550732400),
    ncol = 2, byrow = TRUE,
    dimnames = list(NULL, c("start", "end"))
))

# CROSS JOIN ALL SYMBOLS WITH EACH DATE PAIRING
# (by = NULL asks merge() for the Cartesian product explicitly)
dt_grp_sym <- merge(dateGroup, data.frame(symbols = symbols), by = NULL)

# DEFINED METHOD FOR HTML PROCESSING
#' Download and parse one Yahoo Finance price-history page
#'
#' Builds the history URL for a symbol and date window, prints it, reads the
#' page, and parses the first HTML table on it into a data frame.
#'
#' @param sym Ticker symbol inserted into the URL path.
#' @param sd Window start, inserted as the `period1` query parameter.
#' @param ed Window end, inserted as the `period2` query parameter.
#' @return A data frame built from the first table on the page, with an
#'   added `symbol` column holding `sym`.
proc_html <- function(sym, sd, ed) {
    url <- paste0('https://finance.yahoo.com/quote/', sym, '/history?period1=',
                  sd, '&period2=', ed, '&interval=1d&filter=history&frequency=1d')
    print(url)

    webpage <- readLines(url, warn = FALSE)
    html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
    tableNodes <- getNodeSet(html, "//table")

    # Fail with a descriptive message rather than "subscript out of bounds"
    # when the downloaded page contains no <table> element.
    if (length(tableNodes) == 0) {
        stop("No HTML table found at ", url, call. = FALSE)
    }

    transform(readHTMLTable(tableNodes[[1]],
                            header = c("Date", "Open", "High", "Low",
                                       "Close", "Adj. Close", "Volume")),
              symbol = sym)
}

# ITERATE ELEMENTWISE THROUGH EVERY ROW of dt_grp_sym
df_list <- Map(proc_html,
               sym = dt_grp_sym$symbols,
               sd = dt_grp_sym$start,
               ed = dt_grp_sym$end)

# Stack the per-request data frames into a single data frame
final_df <- do.call(rbind, df_list)

以美国一级(Class I)铁路公司为例进行演示:

# Ticker symbols used for the demonstration
symbols <- c("UNP", "CSX", "NSC", "CNI", "KSU")

# One row per request window (start, end)
dateGroup <- as.data.frame(matrix(
  c(1509519600, 1518073200,
    1518159600, 1526713200,
    1526799600, 1535353200,
    1535439600, 1543993200,
    1544079600, 1550732400),
  ncol = 2, byrow = TRUE,
  dimnames = list(NULL, c("start", "end"))
))

# Pair every symbol with every date window (Cartesian product)
dt_grp_sym <- merge(dateGroup, data.frame(symbols = symbols), by = NULL)

# CALLING SAME ABOVE FUNCTION
df_list <- mapply(proc_html,
                  dt_grp_sym$symbols, dt_grp_sym$start, dt_grp_sym$end,
                  SIMPLIFY = FALSE)
final_df <- do.call(rbind, df_list)

输出

# Show the first rows (head) of final_df separately for each symbol value.
by(final_df, final_df$symbol, head)

# final_df$symbol: CNI
#              Date  Open  High   Low Close Adj..Close    Volume symbol
# 998  Feb 08, 2018 76.08 76.16 74.11 74.45      72.79 1,508,100    CNI
# 999  Feb 07, 2018 76.86 77.23 76.01 76.17      74.48 1,645,400    CNI
# 1000 Feb 06, 2018 76.21 77.42 74.81 77.14      75.42 2,293,300    CNI
# 1001 Feb 05, 2018 78.00 78.70 77.12 77.17      75.45 1,711,000    CNI
# 1002 Feb 02, 2018 79.17 79.24 78.17 78.46      76.71 1,331,400    CNI
# 1003 Feb 01, 2018 79.91 80.54 79.24 79.82      78.04 1,231,500    CNI
# ------------------------------------------------------------------------------ 
# final_df$symbol: CSX
#             Date  Open  High   Low Close Adj..Close     Volume symbol
# 333 Feb 08, 2018 52.91 53.16 50.46 50.47      49.80  7,798,100    CSX
# 334 Feb 07, 2018 53.38 54.36 52.94 52.97      52.26  6,496,200    CSX
# 335 Feb 06, 2018 51.27 54.00 50.12 53.82      53.10 10,563,700    CSX
# 336 Feb 05, 2018 54.89 55.04 51.96 51.99      51.30  9,070,200    CSX
# 337 Feb 02, 2018 56.19 56.35 55.20 55.25      54.51  9,275,800    CSX
# 338 Feb 01, 2018 56.10 57.10 56.04 56.58      55.83  4,079,100    CSX
# ------------------------------------------------------------------------------ 
# final_df$symbol: KSU
#              Date   Open   High    Low  Close Adj..Close    Volume symbol
# 1330 Feb 08, 2018 107.17 107.64 103.50 103.53     102.15 1,434,600    KSU
# 1331 Feb 07, 2018 106.59 108.27 106.59 107.10     105.67 1,326,800    KSU
# 1332 Feb 06, 2018 103.11 108.02 102.07 107.32     105.89 1,459,400    KSU
# 1333 Feb 05, 2018 109.73 110.44 105.12 105.18     103.77 1,272,100    KSU
# 1334 Feb 02, 2018 112.06 112.85 110.03 110.15     108.68 1,051,900    KSU
# 1335 Feb 01, 2018 112.80 114.00 112.17 112.87     111.36 1,011,200    KSU
# ------------------------------------------------------------------------------ 
# final_df$symbol: NSC
#             Date   Open   High    Low  Close Adj..Close    Volume symbol
# 665 Feb 08, 2018 142.62 143.27 136.87 136.89     134.22 2,657,200    NSC
# 666 Feb 07, 2018 142.09 144.45 141.37 142.68     139.89 1,464,500    NSC
# 667 Feb 06, 2018 136.99 143.45 134.55 143.05     140.26 2,455,000    NSC
# 668 Feb 05, 2018 144.74 146.73 138.18 138.61     135.90 2,508,900    NSC
# 669 Feb 02, 2018 147.15 147.85 144.61 145.03     142.20 1,774,600    NSC
# 670 Feb 01, 2018 149.28 150.35 147.90 148.47     145.57 1,427,000    NSC
# ------------------------------------------------------------------------------ 
# final_df$symbol: UNP
#           Date   Open   High    Low  Close Adj..Close     Volume symbol
# 1 Feb 08, 2018 128.70 128.70 124.81 124.86     122.27  6,325,100    UNP
# 2 Feb 07, 2018 130.34 131.82 128.94 128.96     126.29  5,053,000    UNP
# 3 Feb 06, 2018 122.28 131.50 121.50 131.15     128.43 15,734,300    UNP
# 4 Feb 05, 2018 128.59 131.78 124.13 124.14     121.57  6,744,400    UNP
# 5 Feb 02, 2018 131.66 131.73 127.22 129.36     126.68  8,181,200    UNP
# 6 Feb 01, 2018 132.51 133.74 131.86 132.38     129.64  5,597,600    UNP