将 Web 浏览器自动化转换为 XmlHTTPRequest

Converting Web Browser Automation to XmlHTTP Request

我创建了一个从 Brief profiles (BP) 抓取相关信息的宏,可以在以下位置搜索:https://echa.europa.eu/information-on-chemicals

这可以使用对 Brief Profile 的 URL 的 XMLHTTP 请求,并且工作正常。

我现在想创建一个宏来搜索同一个网站以找到简介的 URL(href)。

作为 VBA 的初学者,我已经使用浏览器成功实现了这一点,但我希望将其转换为 XML HTTP 请求以提高效率。

使用 IE 浏览器自动化:

Sub Gethref()

Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
IE.Visible = True
IE.navigate "https://echa.europa.eu/information-on-chemicals"

'Check Website is ready for search and set HTMLDoc to IE.Document for elements
Do While IE.readyState <> READYSTATE_COMPLETE
Loop

Set HTMLDoc = IE.document

'Set value of Searchbox to keyword
Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
HTMLInput.Value = "Potassium mercaptoacetate"

'Search for Result
Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
HTMLButton.Click

'Check page has loaded
Do While IE.readyState = READYSTATE_COMPLETE or IE.Busy
Loop
Set HTMLDoc = IE.document

'Find Desired href
Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
Debug.Print HTMLhref.getAttribute("href")

End Sub

这应该将 Potassium mercaptoacetate 的 href 打印为 https://echa.europa.eu/brief-profile/-/briefprofile/100.000.602

我已经开始尝试使用 XML HTTP 尽可能多地转换,但我 运行 转换成我不太了解的问题

使用 XML HTTP 请求(不工作)

Sub Gethref()

    Dim XMLPage As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLInput As MSHTML.IHTMLElement
    Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
    XMLPage.Open "GET", "https://echa.europa.eu/information-on-chemicals", False
    XMLPage.send
    
'Set value of Searchbox to keyword
    Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
    HTMLInput.Value

'Search for Result
    Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
    HTMLButton.Click

'Check page has loaded
HTMLDoc.body.innerHTML = IE.document.responseText

'Find Desired href
    Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
    Debug.Print HTMLhref.getAttribute("href")

End Sub

我会在这方面取得进展时进行更新,但如果有人能提供帮助,那就太好了。

好的,这应该可以了。结果表明,您需要使用适当的参数发出 post HTTP 请求,以获得包含所需 links 的所需响应。

Public Sub GetContent()
    Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
    Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
    Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
    
    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set MyDict = CreateObject("Scripting.Dictionary")
    Set Ws = ThisWorkbook.Worksheets("Sheet1")

    searchKeyword = "Acetone"
    
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
    MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
    MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"

    payload = ""
    For Each DictKey In MyDict
        payload = IIf(Len(DictKey) = 0, WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)), _
        payload & "&" & WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)))
    Next DictKey
    
    With oHttp
        .Open "POST", Url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With
    
    With oHtml.querySelectorAll("table.table > tbody > tr > td > a.substanceNameLink")
        For I = 0 To .Length - 1
            R = R + 1: Ws.Cells(R, 1) = .item(I).getAttribute("href")
        Next I
    End With
End Sub

如果您只对第一个 link 感兴趣,请尝试以下而不是最后一个 with 块:

MsgBox oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")

或者您可以直接从开发工具中复制这些参数并使用它们:

Public Sub GetContent()
    Const Url = "https://echa.europa.eu/search-for-chemicals?"
    Dim oHttp As Object, oHtml As HTMLDocument
    Dim payload$, Ws As Worksheet, urlSuffix$
    
    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set Ws = ThisWorkbook.Worksheets("Sheet1")
    
    urlSuffix = "p_auth=69hDou3E&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=" & _
                "_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=" & _
                "doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals" & _
                "%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview" & _
                "%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"

    payload = "_disssimplesearchhomepage_WAR_disssearchportlet_formDate=1621042609544&_disssimplesearch_WAR_disssearchportlet_searchOccurred=" & _
              "true&_disssimplesearch_WAR_disssearchportlet_sskeywordKey=Acetone&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer" & _
              "=true&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox=on"
    
    With oHttp
        .Open "POST", Url & urlSuffix, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With
    
    Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
End Sub