使用 R 抓取网站的 Tableau 仪表板
Scraping a website's Tableau dashboard using R
我正在尝试从 this site and put it into RStudio. Also, this is the Tableau URL I found from the HTML: https://results.mo.gov/t/COVID19/views/Demographics/Public-Demographics?:embed=y&:showVizHome=no&:host_url=https%3A%2F%2Fresults.mo.gov%2F&:embed_code_version=3&:tabs=no&:toolbar=no&:showAppBanner=false&:%E2%80%98iframeSizedToWindow%E2%80%99=%E2%80%98true%E2%80%99&:dataDetails=no&:display_spinner=no&:loadOrderID=0
的密苏里州 COVID 人口统计数据中抓取数据
我一直在使用此论坛 here 获取有关如何将 public 画面仪表板抓取到 R 中的指导,但它似乎没有为我的数据框提供任何行或列.
这是我尝试重新创建的代码:
library(rvest)
library(rjson)
library(httr)
library(stringr)
tableauHost <- "https://results.mo.gov"
url <- httr::modify_url(tableauHost,
path = "/t/COVID19/views/Demographics/Public-Demographics",
query = list(":embed" = "y",
":showVizHome" = "no",
":host_url" = "https://results.mo.gov/",
":embed_code_version" = 3,
":tabs" = "no",
":toolbar" = "no",
":showAppBanner" = "false",
":'iframeSizedToWindow'" = "'true'",
":dataDetails" = "no",
":display_spinner" = "no",
":loadOrderID" = 0)
)
body <- read_html(url)
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(tableauHost, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\d+;(\{.*\})\d+;(\{.*\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheet <- "+ PCR by age"
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["localBaseColumnName"]]) == FALSE) {
result[[i]] <- list(
localBaseColumnName = t[["localBaseColumnName"]],
valueIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
buildOffset = function(x){
return(if(x>=0) 0 else -1)
}
data_index <- 1
name_index <- 1
frameData <- list();
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == "cstring"){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[abs(it)+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$localBaseColumnName, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
vector[j] <- t$dataValues[abs(it) + buildOffset(it) + 1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$localBaseColumnName, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
columnToKeep = c('[Student Aid Program Type]-value','[Student Aid Program]-value', '[:Measure Names]-alias', '[Multiple Values]-alias')
df <- NULL
for(i in 1:length(frameNames)){
if (frameNames[i] %in% columnToKeep){
df[frameNames[i]] <- frameData[i]
}
}
options(width = 1200) #for readability
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
但它只是给我这个作为输出
print(df)
data frame with 0 columns and 0 rows
我知道我们为数据框定义列的 columnToKeep
部分与我要查找的数据不匹配,那么不正确的列数会影响这个吗?我一直在尝试破译原始代码并使用 Missouri COVID 的仪表板实现它,但每次看到它都让我非常困惑。如有任何帮助,我们将不胜感激!!
问题是它需要以不同的方式处理 aliasIndices
。流程如下:
select
下的工作表
data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap
在presModelHolder$genVizDataPresModel$paneColumnsData
下:
- 获取
vizDataColumns
下的所有列,注意fieldCaption
作为列名
- 注意所有列的
paneIndices
和 columnIndices
- 对于每个列,在
paneColumnsList
下,将行分配给指定的 columnIndices 和 vizPaneColumns
下的值给出 valueIndices
和 aliasIndices
的索引字典中的实际数据
在字典中(在 data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments
下),您有完整的数据。您只需要匹配您之前提取的索引。为了匹配这些索引,您需要让列的 dataType
知道字典中要 select 的对象,然后:
- 对于
valueIndices
只需在列表中分配索引
- 对于
aliasIndices
,如果值大于 0,则分配索引,如果它是负数,则将列表中的索引与 dataType
"cstring" 匹配
我不确定这个算法是否适用于所有画面数据,但它对我测试过的一些数据效果很好:
library(rvest)
library(rjson)
library(httr)
library(stringr)
#replace the hostname and the path if necessary
host_url <- "https://results.mo.gov"
path <- "/t/COVID19/views/Demographics/Public-Demographics"
body <- read_html(modify_url(host_url,
path = path,
query = list(":embed" = "y",":showVizHome" = "no")
))
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\d+;(\{.*\})\d+;(\{.*\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)
for(i in 1:length(worksheets)){
print(paste("[",i,"] ",worksheets[i], sep=""))
}
selected <- readline(prompt="select worksheet by index: ");
worksheet <- worksheets[as.integer(selected)]
print(paste("you selected :", worksheet, sep=" "))
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["fieldCaption"]]) == FALSE) {
paneIndex <- t$paneIndices
columnIndex <- t$columnIndices
if (length(t$paneIndices) > 1){
paneIndex <- t$paneIndices[1]
}
if (length(t$columnIndices) > 1){
columnIndex <- t$columnIndices[1]
}
result[[i]] <- list(
fieldCaption = t[["fieldCaption"]],
valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
cstring <- list();
for(t in dataFull) {
if(t$dataType == "cstring"){
cstring <- t
break
}
}
data_index <- 1
name_index <- 1
frameData <- list()
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == index["dataType"]){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[it+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
if (it >= 0){
vector[j] <- t$dataValues[it+1]
} else {
vector[j] <- cstring$dataValues[abs(it)]
}
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
df <- NULL
lengthList <- c()
for(i in 1:length(frameNames)){
lengthList[i] <- length(frameData[[i]])
}
max <- max(lengthList)
for(i in 1:length(frameNames)){
if (length(frameData[[i]]) < max){
len <- length(frameData[[i]])
frameData[[i]][(len+1):max]<-""
}
df[frameNames[[i]]] <- frameData[i]
}
options(width = 1200)
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
我用 R 和 Python 脚本创建了一个存储库 here
我正在尝试从 this site and put it into RStudio. Also, this is the Tableau URL I found from the HTML: https://results.mo.gov/t/COVID19/views/Demographics/Public-Demographics?:embed=y&:showVizHome=no&:host_url=https%3A%2F%2Fresults.mo.gov%2F&:embed_code_version=3&:tabs=no&:toolbar=no&:showAppBanner=false&:%E2%80%98iframeSizedToWindow%E2%80%99=%E2%80%98true%E2%80%99&:dataDetails=no&:display_spinner=no&:loadOrderID=0
的密苏里州 COVID 人口统计数据中抓取数据我一直在使用此论坛 here 获取有关如何将 public 画面仪表板抓取到 R 中的指导,但它似乎没有为我的数据框提供任何行或列.
这是我尝试重新创建的代码:
library(rvest)
library(rjson)
library(httr)
library(stringr)
tableauHost <- "https://results.mo.gov"
url <- httr::modify_url(tableauHost,
path = "/t/COVID19/views/Demographics/Public-Demographics",
query = list(":embed" = "y",
":showVizHome" = "no",
":host_url" = "https://results.mo.gov/",
":embed_code_version" = 3,
":tabs" = "no",
":toolbar" = "no",
":showAppBanner" = "false",
":'iframeSizedToWindow'" = "'true'",
":dataDetails" = "no",
":display_spinner" = "no",
":loadOrderID" = 0)
)
body <- read_html(url)
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(tableauHost, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\d+;(\{.*\})\d+;(\{.*\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheet <- "+ PCR by age"
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["localBaseColumnName"]]) == FALSE) {
result[[i]] <- list(
localBaseColumnName = t[["localBaseColumnName"]],
valueIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
buildOffset = function(x){
return(if(x>=0) 0 else -1)
}
data_index <- 1
name_index <- 1
frameData <- list();
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == "cstring"){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[abs(it)+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$localBaseColumnName, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
vector[j] <- t$dataValues[abs(it) + buildOffset(it) + 1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$localBaseColumnName, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
columnToKeep = c('[Student Aid Program Type]-value','[Student Aid Program]-value', '[:Measure Names]-alias', '[Multiple Values]-alias')
df <- NULL
for(i in 1:length(frameNames)){
if (frameNames[i] %in% columnToKeep){
df[frameNames[i]] <- frameData[i]
}
}
options(width = 1200) #for readability
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
但它只是给我这个作为输出
print(df)
data frame with 0 columns and 0 rows
我知道我们为数据框定义列的 columnToKeep
部分与我要查找的数据不匹配,那么不正确的列数会影响这个吗?我一直在尝试破译原始代码并使用 Missouri COVID 的仪表板实现它,但每次看到它都让我非常困惑。如有任何帮助,我们将不胜感激!!
问题是它需要以不同的方式处理 aliasIndices
。流程如下:
select
下的工作表data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap
在presModelHolder$genVizDataPresModel$paneColumnsData
下:
- 获取
vizDataColumns
下的所有列,注意fieldCaption
作为列名 - 注意所有列的
paneIndices
和columnIndices
- 对于每个列,在
paneColumnsList
下,将行分配给指定的 columnIndices 和vizPaneColumns
下的值给出valueIndices
和aliasIndices
的索引字典中的实际数据
在字典中(在 data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments
下),您有完整的数据。您只需要匹配您之前提取的索引。为了匹配这些索引,您需要让列的 dataType
知道字典中要 select 的对象,然后:
- 对于
valueIndices
只需在列表中分配索引 - 对于
aliasIndices
,如果值大于 0,则分配索引,如果它是负数,则将列表中的索引与dataType
"cstring" 匹配
我不确定这个算法是否适用于所有画面数据,但它对我测试过的一些数据效果很好:
library(rvest)
library(rjson)
library(httr)
library(stringr)
#replace the hostname and the path if necessary
host_url <- "https://results.mo.gov"
path <- "/t/COVID19/views/Demographics/Public-Demographics"
body <- read_html(modify_url(host_url,
path = path,
query = list(":embed" = "y",":showVizHome" = "no")
))
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\d+;(\{.*\})\d+;(\{.*\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)
for(i in 1:length(worksheets)){
print(paste("[",i,"] ",worksheets[i], sep=""))
}
selected <- readline(prompt="select worksheet by index: ");
worksheet <- worksheets[as.integer(selected)]
print(paste("you selected :", worksheet, sep=" "))
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["fieldCaption"]]) == FALSE) {
paneIndex <- t$paneIndices
columnIndex <- t$columnIndices
if (length(t$paneIndices) > 1){
paneIndex <- t$paneIndices[1]
}
if (length(t$columnIndices) > 1){
columnIndex <- t$columnIndices[1]
}
result[[i]] <- list(
fieldCaption = t[["fieldCaption"]],
valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
cstring <- list();
for(t in dataFull) {
if(t$dataType == "cstring"){
cstring <- t
break
}
}
data_index <- 1
name_index <- 1
frameData <- list()
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == index["dataType"]){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[it+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
if (it >= 0){
vector[j] <- t$dataValues[it+1]
} else {
vector[j] <- cstring$dataValues[abs(it)]
}
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
df <- NULL
lengthList <- c()
for(i in 1:length(frameNames)){
lengthList[i] <- length(frameData[[i]])
}
max <- max(lengthList)
for(i in 1:length(frameNames)){
if (length(frameData[[i]]) < max){
len <- length(frameData[[i]])
frameData[[i]][(len+1):max]<-""
}
df[frameNames[[i]]] <- frameData[i]
}
options(width = 1200)
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
我用 R 和 Python 脚本创建了一个存储库 here