Why is Beautiful Soup not finding this element with multiple classes?

I'm trying to use the query soup.find_all("li", {"class" : "first_result"}) to select an element that looks like

<li class="result first_result">

The element is definitely on the page, but it doesn't show up when I run my script. For the record, I also tried soup.find_all("li", {"class" : "result first_result"}), but still nothing.

What am I doing wrong?

EDIT: At alecxe's request, I've posted the code I have so far. I'm using Python 3.4 on 64-bit Windows 7, in case that's the culprit. The specific part this question is about is at the very bottom, under ###METACRITIC STUFF###.

from bs4 import BeautifulSoup
from urllib3 import poolmanager
import csv
import requests
import sys
import os
import codecs
import re
import html5lib
import math
import time
from random import randint

connectBuilder = poolmanager.PoolManager()

inputstring = sys.argv[1]  #argv string MUST use double quotes

inputarray = re.split(r'\s+', inputstring)

##########################KAT STUFF########################


katstring = ""

for item in inputarray: katstring += (item + "+")
katstring=katstring[:-1]
#kataddress = "https://kat.cr/usearch/?q=" + katstring    #ALL kat
kataddress = "https://kat.cr/usearch/" + inputstring + " category:tv/?field=seeders&sorder=desc"    #JUST TV kat
#print(kataddress)
numSeedsArray = []
numLeechArray = []


r = requests.get(kataddress)
soup = BeautifulSoup(r.content, "html5lib")
totalpages = [h2.find('span') for h2 in soup.findAll('h2')][0].text #get a string that looks like 'house of cards results 1-25 from 178'
totalpages = int(totalpages[-4:]) #slice off everything but the total # of results
totalpages = math.floor(totalpages/25)

#print("totalpages= "+str(totalpages))
iteration=0
savedpage = ""

def getdata(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    global numSeedsArray
    global numLeechArray


    tds = soup.findAll("td", { "class" : "green center" })
    numSeedsArray += [int(td.text) for td in tds]   
    tds = soup.findAll("td", { "class" : "red lasttd center"})
    numLeechArray += [int(td.text) for td in tds] 
    #print(numSeedsArray)



def getnextpage(url):
    global iteration
    global savedpage
    #print("url examined= "+url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    nextpagelinks = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton" })
    nextpagelinks = [link.get('href') for link in nextpagelinks]
    #print(nextpagelinks)

    activepage = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton active" })
    #print("activepage= " +activepage[0].text)
    currentpagenum = activepage[0].text
    #print("currentpagenum= "+currentpagenum)
    if len(currentpagenum)==1 and iteration>1:
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==1 and iteration<=1:
        nextpage = str(nextpagelinks[0][:-28])+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        savedpage = str(nextpagelinks[0][:-28])
        #print("savedpage= "+savedpage )
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==2:
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)

    return nextpage   



if totalpages<2:
    while iteration < totalpages-1: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
else:
    while iteration < 2: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
    # print(str(sum(numSeedsArray)))
    # print(str(sum(numLeechArray)))

print(str(sum(numLeechArray)+sum(numSeedsArray)))

def getgoogdata(title):
    title = re.sub(r' ', '+', title)
    url = 'https://www.google.com/search?q=' +title+ '&ie=utf-8&oe=utf-8'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    resultnum = soup.find("div", {"id": "resultStats"}).text[:-14]

    s2 = resultnum.replace(',', '')
    resultnum = re.findall(r'\b\d+\b', s2)
    print(resultnum)

getgoogdata(inputstring)



####################METACRITIC STUFF#########################
metainputstring = ""
for item in inputarray:
    metainputstring += item + " "
metainputstring = metainputstring[:-1]
metacriticaddress = "http://www.metacritic.com/search/tv/" + metainputstring + "/results"

print (metacriticaddress)

r = requests.get(metacriticaddress)
soup = BeautifulSoup(r.content, "html5lib")
first_result = soup.find_all("li", attrs={"class" : "first_result"})

# first_result = soup.select("li.result.first_result")
print(first_result)

Quoting the documentation:

It’s very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, “class”, is a reserved word in Python. Using class as a keyword argument will give you a syntax error. As of Beautiful Soup 4.1.2, you can search by CSS class using the keyword argument class_

So, you need to write instead: soup.find_all("li", class_="first_result").
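As a quick illustration, here is a minimal, self-contained sketch (the HTML snippet is invented for demonstration; it is not the actual Metacritic markup):

from bs4 import BeautifulSoup

html = '<ul><li class="result first_result">Game of Thrones</li><li class="result">Something else</li></ul>'
soup = BeautifulSoup(html, "html5lib")

# class_ matches any element whose class attribute contains "first_result",
# even when other classes such as "result" are also present
print(soup.find_all("li", class_="first_result"))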

If you're using a pre-4.1.2 version of Beautiful Soup, or if you insist on passing a dictionary, you need to pass the dictionary in the attrs argument: soup.find_all("li", attrs={"class" : "first_result"}).

Your first attempt (soup.find_all("li", {"class" : "first_result"})) was almost correct, but you need to name the argument you are passing the dictionary to (in this case, the argument name is attrs), and then call it like soup.find_all("li", attrs={"class" : "first_result"}).

However, I'd suggest using a CSS selector for this instead, since you're matching against more than one class. You can do that with the soup's .select() method:

results = soup.select("li.result.first_result")

Note that .select() always returns a list, so if there is only one element, don't forget to access it as results[0].
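Continuing the toy snippet above, a short usage sketch:

results = soup.select("li.result.first_result")
print(len(results))           # 1 -- a list, even for a single match
print(results[0].get_text())  # Game of Thrones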

All the other answers are unrelated to your actual issue.

You need to pretend to be a real browser to be able to see the search results:

r = requests.get(metacriticaddress, headers={
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
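If you want to confirm that the User-Agent header is what makes the difference, one quick check (assuming, as this answer does, that Metacritic rejects non-browser clients) is to compare status codes with and without it:

import requests

metacriticaddress = "http://www.metacritic.com/search/tv/game%20of%20thrones/results"
ua = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"}

print(requests.get(metacriticaddress).status_code)              # expected: an error code without a browser UA
print(requests.get(metacriticaddress, headers=ua).status_code)  # expected: 200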

Proof (searching for Game of Thrones, of course):

>>> from bs4 import BeautifulSoup
>>> 
>>> import requests
>>> 
>>> metacriticaddress = "http://www.metacritic.com/search/tv/game%20of%20thrones/results"
>>> r = requests.get(metacriticaddress, headers={
...     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
... })
>>> soup = BeautifulSoup(r.content, "html5lib")
>>> first_result = soup.find_all("li", class_="first_result")
>>> 
>>> print(first_result[0].find("h3", class_="product_title").get_text(strip=True))
Game of Thrones