从 OpenStreetMap 获取大学名称

Get universities names from OpenStreetMap

我正在尝试从嵌入到此网站 https://collegecrisis.shinyapps.io/dashboard/ 的 OpenStreetMap 地图中获取大学名称。

OpenStreetMap 获取大学名称

我尝试使用 Python 的 Selenium 库来自动执行此任务:将鼠标依次悬停在每所大学上,逐一取下它们的名字。这看起来没问题,但深入检查后,我发现了一些错误数据。我认为这是因为脚本试图悬停在大学密集的区域时,实际悬停到了另一所大学上并取了它的名字。我想过先放大地图、取名称、再缩小,但这确实需要很长时间,并且可能因为某次放大或缩小未成功而导致运行时错误。

本人对地图了解不多,所以想请教一下有没有什么办法可以一次性把地图上标注的大学名字取下来

如果有人需要我试过的代码,那就是这个:

from selenium import webdriver
from bs4 import BeautifulSoup
import lxml
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

# Set up the Chrome driver; PATH must point at the local chromedriver binary.
PATH = "/Applications/chromedriver"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(10)  # seconds

# Lower-cased tooltip texts, one per marker whose tooltip could be read.
nodelist = []

try:
    driver.get("https://collegecrisis.shinyapps.io/dashboard/")

    # Collect every element carrying the "leaflet-interactive" class.
    nodes = driver.find_elements_by_class_name("leaflet-interactive")

    for node in nodes:
        # Hover (not click) over the element so that its tooltip is rendered.
        ActionChains(driver).move_to_element(node).perform()
        sleep(.5)

        tooltip = BeautifulSoup(driver.page_source, 'lxml').find(
            class_=lambda value: value and 'leaflet-tooltip leaflet-zoom-animated' in value
        )
        # find() returns None when no tooltip is present in the page source;
        # skip that element instead of failing on `None.text`.
        if tooltip is None:
            continue
        nodelist.append(tooltip.text.lower())
finally:
    # Always release the browser process, even if scraping fails midway.
    driver.quit()

灵感来自 this one

此服务使用 http 流式传输。它只会在以下端点上打开一个 http 连接:

POST https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_streaming

并且它将使用以下端点发送命令:

POST https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_send

您可以在 Chrome 开发控制台的网络选项卡中查看查找 xhr_streaming 的结果。

token 是从另一个 http 调用中检索到的:

GET https://collegecrisis.shinyapps.io/dashboard/{workerIDFull}__token__

workerID 存在于原始页面本身中

一些名为 singletons 的参数是必需的,它们也位于原始页面中的 script 标记中,如下所示:

<script type="application/shiny-singletons">fafb5589cb5a9f24485f3df0511b50d5cd0c7497,603e796bcfc2ab3685167d58c426f64c15a95192</script>

以下脚本:

  • 从原始页面中抓取所需的元素
  • 使用 workerID 获取令牌
  • 在新线程中启动 POST /xhr_streaming
  • 在 POST /xhr_send 上发送“打开频道命令”,即 '["0#0|o|"]'
  • 使用之前抓取的 singletons 值和一个大的静态 JSON 配置,发送“初始化命令”

完整代码:

import requests
from bs4 import BeautifulSoup
import re
import time
from random import choice
from string import ascii_letters,digits
from threading import Thread
from time import sleep
import json

# Fetch the dashboard page once; the session keeps cookies for the later calls.
session = requests.Session()
r = session.get("https://collegecrisis.shinyapps.io/dashboard/")
r.raise_for_status()  # fail early with a clear HTTP error instead of a parse error
soup = BeautifulSoup(r.content, "lxml")

# Singleton hashes embedded in the page; sent back later in the init message.
singletons = soup.find("script", {"type":"application/shiny-singletons"}).text

# The <base href> carries the worker path; the worker id follows "_w_".
workerIDFull = soup.find("base")["href"]
worker_match = re.search(r'_w_(\w+)', workerIDFull)  # raw string: "\w" is a regex class
if worker_match is None:
    raise RuntimeError(f"no worker id found in base href {workerIDFull!r}")
workerID = worker_match.group(1)
timestamp = int(round(time.time() * 1000))  # current time in milliseconds

# Retrieve the token used in the streaming / send endpoint URLs.
r = session.get(f"https://collegecrisis.shinyapps.io/dashboard/{workerIDFull}__token__",
    params = {
        "_": timestamp
})
r.raise_for_status()
token = r.text

# Random path components for the xhr_streaming / xhr_send URLs.
random_token = ''.join(choice(ascii_letters) for _ in range(18))
random_token2 = ''.join(choice(ascii_letters) for _ in range(8))
random_num = ''.join(choice(digits) for _ in range(3))

def _first_add_circles_labels(data):
    """Return ``args[8]`` of the first ``addCircles`` call in ``homeMap``, or None.

    ``data`` is one decoded server message. Messages that do not carry
    ``values.homeMap.x.calls`` (or whose ``values`` is not a mapping) yield
    None instead of raising.
    """
    try:
        for call in data["values"]["homeMap"]["x"]["calls"]:
            if call["method"] == "addCircles":
                # NOTE(review): index 8 is presumably the marker label list — confirm.
                return call["args"][8]
    except (KeyError, IndexError, TypeError):
        return None
    return None


def getData():
    """Open the xhr_streaming connection and print the map labels it delivers.

    Every non-empty raw line is echoed. Lines starting with ``a`` are parsed
    as a JSON array of messages; each message is split into at most three
    ``|``-separated fields and the third field is parsed as JSON. When that
    payload contains an ``addCircles`` call for ``homeMap``, its ``args[8]``
    is printed. Lines or messages that cannot be parsed are skipped.
    """
    stream_url = f"https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_streaming"
    r = requests.Request("POST", stream_url).prepare()

    # The context manager releases the streaming connection on exit or error.
    with session.send(r, stream=True) as resp:
        for line in resp.iter_lines():
            if not line:
                continue
            print(line)

            # Decode as UTF-8 and let json.loads undo the string escaping;
            # 'unicode_escape' would mangle any non-ASCII characters.
            text = line.decode("utf-8", errors="replace")
            if not text.startswith("a"):
                continue
            try:
                messages = json.loads(text[1:])
            except ValueError:
                continue
            if not isinstance(messages, list):
                continue

            for message in messages:
                if not isinstance(message, str):
                    continue
                # maxsplit=2 keeps any "|" inside the JSON payload intact.
                parts = message.split("|", 2)
                if len(parts) < 3:
                    continue
                try:
                    data = json.loads(parts[2])
                except ValueError:
                    continue
                labels = _first_add_circles_labels(data)
                if labels is not None:
                    print(labels)

def openChannel():
    """POST the '["0#0|o|"]' frame to the xhr_send endpoint."""
    send_url = f"https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_send"
    request_headers = {"Content-Type":"text/plain;charset=UTF-8"}
    session.post(send_url, data='["0#0|o|"]', headers=request_headers)

def sendInit():
    """POST the "init" message to the xhr_send endpoint.

    The message is a static client-state dictionary (the ``*_hidden`` flags
    and URL fields below) plus the ``singletons`` value read from the page.
    It is JSON-encoded, escaped for embedding inside a JSON string, and
    wrapped in a ``["1#0|m|...""]`` frame.
    """
    client_state = {
        "sidebarItemExpanded": None,
        "sidebarCollapsed": True,
        "resetAll:shiny.action": 0,
        "fallResetAll:shiny.action": 0,
        "lawResetAll:shiny.action": 0,
        # Outputs reported as not hidden.
        ".clientdata_output_authModal_hidden": False,
        ".clientdata_output_homefullOnlineVB_hidden": False,
        ".clientdata_output_homepOnlineVB_hidden": False,
        ".clientdata_output_homeHybridVB_hidden": False,
        ".clientdata_output_homepPersonVB_hidden": False,
        ".clientdata_output_homePersonVB_hidden": False,
        ".clientdata_output_homeTBDVB_hidden": False,
        ".clientdata_output_homeOtherVB_hidden": False,
        ".clientdata_output_homeTotalShownVB_hidden": False,
        ".clientdata_output_homeMap_hidden": False,
        # Outputs reported as hidden.
        ".clientdata_output_graphStateFilter_hidden": True,
        ".clientdata_output_fallBarGraph_hidden": True,
        ".clientdata_output_covidAthleticGraph_hidden": True,
        ".clientdata_output_schoolCovidPlot_hidden": True,
        ".clientdata_output_intlFilter_hidden": True,
        ".clientdata_output_intlGraph_hidden": True,
        ".clientdata_output_facultyBarGraph_hidden": True,
        ".clientdata_output_stateTrendsGraph_hidden": True,
        ".clientdata_output_covidHeatmap_hidden": True,
        ".clientdata_output_announceHeatmap_hidden": True,
        ".clientdata_output_onlineHeatmap_hidden": True,
        ".clientdata_output_springBreak_hidden": True,
        ".clientdata_output_peerInstPicker_hidden": True,
        ".clientdata_output_statusFilter_hidden": True,
        ".clientdata_output_rankcatFilter_hidden": True,
        ".clientdata_output_hospitalFilter_hidden": True,
        ".clientdata_output_covidFilter_hidden": True,
        ".clientdata_output_campusTypeFilter_hidden": True,
        ".clientdata_output_sectorFilter_hidden": True,
        ".clientdata_output_ccbasicFilter_hidden": True,
        ".clientdata_output_divisionFilter_hidden": True,
        ".clientdata_output_conferenceFilter_hidden": True,
        ".clientdata_output_sizeSlider_hidden": True,
        ".clientdata_output_resHallSlider_hidden": True,
        ".clientdata_output_sportsRevenueSlider_hidden": True,
        ".clientdata_output_intlSlider_hidden": True,
        ".clientdata_output_onlineVB_hidden": True,
        ".clientdata_output_announcedVB_hidden": True,
        ".clientdata_output_noDecisionVB_hidden": True,
        ".clientdata_output_totalVB_hidden": True,
        ".clientdata_output_dateSlider_hidden": True,
        ".clientdata_output_springMap_hidden": True,
        ".clientdata_output_fallPeerInstPicker_hidden": True,
        ".clientdata_output_fallStatusFilter_hidden": True,
        ".clientdata_output_fallRankcatFilter_hidden": True,
        ".clientdata_output_fallFacultyFilter_hidden": True,
        ".clientdata_output_fallHospitalFilter_hidden": True,
        ".clientdata_output_fallCovidFilter_hidden": True,
        ".clientdata_output_fallCampusTypeFilter_hidden": True,
        ".clientdata_output_fallSectorFilter_hidden": True,
        ".clientdata_output_fallCcbasicFilter_hidden": True,
        ".clientdata_output_fallStaffFilter_hidden": True,
        ".clientdata_output_fallDivisionFilter_hidden": True,
        ".clientdata_output_fallConferenceFilter_hidden": True,
        ".clientdata_output_fallSizeSlider_hidden": True,
        ".clientdata_output_fallResHallSlider_hidden": True,
        ".clientdata_output_fallSportsRevenueSlider_hidden": True,
        ".clientdata_output_fallIntlSlider_hidden": True,
        ".clientdata_output_fallfullOnlineVB_hidden": True,
        ".clientdata_output_fallpOnlineVB_hidden": True,
        ".clientdata_output_fallHybridVB_hidden": True,
        ".clientdata_output_fallpPersonVB_hidden": True,
        ".clientdata_output_fallPersonVB_hidden": True,
        ".clientdata_output_fallTBDVB_hidden": True,
        ".clientdata_output_fallOtherVB_hidden": True,
        ".clientdata_output_fallTotalShownVB_hidden": True,
        ".clientdata_output_fallMap_hidden": True,
        ".clientdata_output_greFilter_hidden": True,
        ".clientdata_output_modelFilter_hidden": True,
        ".clientdata_output_planFilter_hidden": True,
        ".clientdata_output_videoPlatformFilter_hidden": True,
        ".clientdata_output_lawSectorFilter_hidden": True,
        ".clientdata_output_lawMinoritySlider_hidden": True,
        ".clientdata_output_lawLSATtwofiveSlider_hidden": True,
        ".clientdata_output_lawLSATmedianSlider_hidden": True,
        ".clientdata_output_lawLSATsevenfiveSlider_hidden": True,
        ".clientdata_output_lawAcceptanceSlider_hidden": True,
        ".clientdata_output_lawFYSlider_hidden": True,
        ".clientdata_output_lawFullOnlineVB_hidden": True,
        ".clientdata_output_lawPartialOnlineVB_hidden": True,
        ".clientdata_output_lawHybridVB_hidden": True,
        ".clientdata_output_lawPersonVB_hidden": True,
        ".clientdata_output_lawNDVB_hidden": True,
        ".clientdata_output_lawTotalVB_hidden": True,
        ".clientdata_output_lawMap_hidden": True,
        ".clientdata_output_intlOnlineVB_hidden": True,
        ".clientdata_output_intlHybridVB_hidden": True,
        ".clientdata_output_intlInPersonVB_hidden": True,
        ".clientdata_output_intlCovidVB_hidden": True,
        ".clientdata_output_intlTBDVB_hidden": True,
        ".clientdata_output_intlTotalVB_hidden": True,
        ".clientdata_output_intlMap_hidden": True,
        # Client / URL description.
        ".clientdata_pixelratio": 1,
        ".clientdata_url_protocol": "https:",
        ".clientdata_url_hostname": "collegecrisis.shinyapps.io",
        ".clientdata_url_port": "",
        ".clientdata_url_pathname": "/dashboard/",
        ".clientdata_url_search": "",
        ".clientdata_url_hash_initial": "",
        ".clientdata_url_hash": "",
        ".clientdata_singletons": singletons,
        ".clientdata_allowDataUriScheme": True,
    }
    init_json = json.dumps({"method": "init", "data": client_state})

    # Encoding the JSON text once more and dropping the surrounding quotes
    # yields the escaped form needed inside the frame's string element.
    escaped = json.dumps(init_json)[1:-1]
    frame = f'["1#0|m|{escaped}"]'

    send_url = f"https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_send"
    session.post(
        send_url,
        data=frame,
        headers={"Content-Type":"text/plain;charset=UTF-8"},
    )

# Start the streaming reader in the background, give it a second to connect,
# then send the open-channel and init commands and wait for the reader to end.
thread = Thread(target=getData)
thread.start()
sleep(1)
for send_command in (openChannel, sendInit):
    send_command()
thread.join()

如果您需要来自地图的更多数据,请检查字段 data["values"]["homeMap"]["x"]["calls"]。

run this on repl.it

此代码有效,但即使经过优化,它也会花费大量时间来通过打开和关闭弹出窗口获取大学名称。在这种情况下,Bertrand 的解决方案应该比使用 Selenium 更好!

/**
 * Opens the dashboard in headless Chrome, clicks every map marker in turn
 * and prints the first bold element of the popup that appears.
 *
 * <p>Markers are the "leaflet-interactive" elements whose "stroke-linecap"
 * attribute contains "round". Pointer events are disabled on all markers and
 * re-enabled only on the one being clicked, so a neighbouring marker cannot
 * receive the click. After each popup is read it is closed before moving on.
 * Finally the text of every marker element and the marker count are printed.
 *
 * @param args unused
 */
public static void main(String[] args) {

    System.setProperty("webdriver.chrome.driver", "chromedriver.exe");

    ChromeOptions options = new ChromeOptions();
    options.addArguments("--headless", "--window-size=600,600");

    WebDriver driver = new ChromeDriver(options);

    try {
        driver.get("https://collegecrisis.shinyapps.io/dashboard/");

        JavascriptExecutor js = (JavascriptExecutor) driver;

        // findElements is declared to return List, so hold the results as
        // List instead of casting them to ArrayList.
        java.util.List<WebElement> markers = new ArrayList<WebElement>();

        // Poll until at least one marker has been found on the page.
        while (markers.isEmpty()) {
            java.util.List<WebElement> paths = driver.findElements(By.className("leaflet-interactive"));

            for (int i = 0; i < paths.size(); i++) {
                System.out.println(i);

                String lineCap = paths.get(i).getAttribute("stroke-linecap");
                if (lineCap != null && lineCap.contains("round")) {
                    markers.add(paths.get(i));
                    System.out.println(markers.get(0));
                }
            }
        }

        // Make every marker ignore pointer events before the first click.
        for (WebElement marker : markers) {
            js.executeScript("arguments[0].style.pointerEvents = 'none';", marker);
        }

        for (WebElement marker : markers) {
            // Only the current marker may receive the click.
            js.executeScript("arguments[0].style.pointerEvents = 'auto';", marker);
            js.executeScript("arguments[0].scrollIntoView(true);", marker);

            try {
                marker.click();
            } catch (Exception e) {
                // Native click failed; fall back to a script-driven click.
                js.executeScript("arguments[0].click();", marker);
            }

            System.out.println(driver.findElement(By.className("leaflet-popup-content")).findElement(By.xpath("b[1]")).getText());

            // Locate the popup pane with one selector query instead of
            // reading the class attribute of every <div> on the page.
            // findElement throws if the pane is missing, rather than leaving
            // a null reference to be dereferenced later.
            WebElement popupPane = driver.findElement(By.cssSelector("div.leaflet-pane.leaflet-popup-pane"));

            // Click the close button until the popup pane has no descendants.
            while (!popupPane.findElements(By.xpath(".//*")).isEmpty()) {
                try {
                    driver.findElement(By.className("leaflet-popup-close-button")).click();
                } catch (org.openqa.selenium.WebDriverException e) {
                    // The button may already be gone or not clickable yet;
                    // the loop condition re-checks the pane.
                }
            }

            js.executeScript("arguments[0].style.pointerEvents = 'none'", marker);
        }

        for (WebElement marker : markers) {
            System.out.println(marker.getText());
        }

        System.out.println(Integer.toString(markers.size()));
    } finally {
        // Always shut the browser down, even when scraping fails midway.
        driver.quit();
    }
}