如何从网页中解析 span 中的值?

How can I parse the value in a span from a web page?

我正在尝试从电子商务网站上抓取畅销产品的名称列表。然而结果是空的。想知道缺少什么。输出是: Visiting:  https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/ End of scraping:  https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/

代码:

package main

import (
    "encoding/csv"
    "fmt"
    "log"
    "os"

    "github.com/gocolly/colly"
)

func main() {
    fetchURL := "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/"
    fileName := "results.csv"
    file, err := os.Create(fileName)
    if err != nil {
        log.Fatal("ERROR: Could not create file %q: %s\n", fileName, err)
        return
    }
    defer file.Close()
    writer := csv.NewWriter(file)
    defer writer.Flush()


    writer.Write([]string{"Sl. No."})


    c := colly.NewCollector()


    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting: ", r.URL)
    })

    c.OnHTML(`.a-section a-spacing-none aok-relative`, func(e *colly.HTMLElement) {
        number := e.ChildText(".zg-badge-text")
        name := e.ChildText(".p13n-sc-truncated")

        writer.Write([]string{
            number,
            name,

    })


    c.Visit(fetchURL)
    fmt.Println("End of scraping: ", fetchURL)
}

您需要在请求中添加 User-Agent header,才能得到返回的数据。此外,p13n-sc-truncated 似乎是自动生成的 class 名称。您可以使用以下示例:

package main

import (
    "log"
    "strings"
    "github.com/gocolly/colly"
)

// AmazonData is one entry collected from the best-sellers list.
type AmazonData struct {
    Index int    // position of the entry, taken from a running counter that starts at 1
    Link  string // href attribute of the anchor found inside the list item
    Title string // whitespace-trimmed text of the div element(s) inside that anchor
}

// main visits the Amazon India electronics best-sellers page, collects one
// AmazonData entry (running index, link href, trimmed title text) for every
// "li .zg-item" element under #zg-ordered-list, and logs the collected slice.
// If the visit itself fails, the error is logged and the program exits.
func main() {
    const targetURL = "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/"

    c := colly.NewCollector()

    var data []AmazonData
    count := 1

    c.OnHTML(`#zg-ordered-list`, func(e *colly.HTMLElement) {
        e.ForEach("li .zg-item", func(_ int, elem *colly.HTMLElement) {
            link := elem.DOM.Find("a")
            // The second result of Attr only reports whether the attribute
            // exists; a missing href simply yields an empty Link.
            linkHref, _ := link.Attr("href")
            data = append(data, AmazonData{
                Index: count,
                Link:  linkHref,
                Title: strings.TrimSpace(link.Find("div").Text()),
            })
            count++
        })
        log.Println(data)
    })

    // Send a browser-like User-Agent with every outgoing request.
    c.OnRequest(func(r *colly.Request) {
        r.Headers.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")
    })

    // Report a failed visit instead of dropping the error: without this
    // check a failed request would end the program with no output at all.
    if err := c.Visit(targetURL); err != nil {
        log.Fatalf("visiting %s: %v", targetURL, err)
    }
}