Selenium 中的 get_attribute("href") 返回 None

Question

import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Start a Chrome session driven by Selenium.
browser = webdriver.Chrome()
# The following segment builds a list of news article URLs from a keyword search.
# CNN is used as the example search page.

base_url = u'https://www.cnn.com/search?q=politics&size=200'

browser.get(base_url)
# NOTE(review): presumably a fixed pause to let the results render -- confirm.
time.sleep(1)

# Find the container that holds every news article.
main_news_container = browser.find_element_by_class_name('cnn-search__results-list')
# Elements carrying the headline class; these are the elements whose "href" is read below.
headlines_list = main_news_container.find_elements_by_class_name('cnn-search__result-headline')
# Alternative lookup of the 'a' elements, currently disabled:
#text_sections = main_news_container.find_elements_by_xpath("//a[@href]")
# NOTE(review): text_sections is only assigned in the commented-out line above,
# so this print raises NameError as written.
print(len(text_sections))
headlines = [];
links = [];
for elem in headlines_list:
    # Disabled filter / debug output:
    #if "/2020/" in elem.get_attribute("href"):
    #print(elem.get_attribute("href"))
    # Record the element's href attribute (the value reported as None in the question).
    links.append(elem.get_attribute("href"))
    #print(elem.text)
    # Record the element's visible text, used as the headline.
    headlines.append(elem.text)


# Find the body-text elements inside main_news_container.
body_elements = main_news_container.find_elements_by_class_name("cnn-search__result-body")

# Example of reading one body element's text:
#print(body_elements[1].text)
# Print the first collected link and the number of headlines and body elements.
print(links[0])
print(len(headlines))
print(len(body_elements))

我正在尝试从 headlines_list 的元素中获取 href，但它返回 None。不过，我仍然可以通过以下方式获取标题文本：

elem.text

This is the inspect output of the web page source

Answer 1

您正在尝试获取 h3 的 href 属性，因为该 class 位于 h3 上；而 h3 本身没有 href 属性，所以 get_attribute("href") 返回 None。您需要在 h3 元素中找到 a 元素。

我更改了包含您的标题的行以包含 a 元素：

# Select the <a> directly inside each headline <h3>; the plural find_elements_*
# returns a list, which the loop over headlines_list requires.
headlines_list = main_news_container.find_elements_by_css_selector("h3[class='cnn-search__result-headline']>a")

这是你的代码

import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Open a Chrome session. NOTE: the find_element(s)_by_* helpers used below are
# the Selenium 3 style API, kept to match the question's code.
browser = webdriver.Chrome()

# Collect article URLs, headlines and body snippets for a keyword search,
# using the CNN search page as the example.
base_url = u'https://www.cnn.com/search?q=politics&size=200'

browser.get(base_url)
# NOTE(review): presumably a fixed pause to let the results render -- confirm.
time.sleep(1)

# Container that holds every search result.
main_news_container = browser.find_element_by_class_name('cnn-search__results-list')

# The headline class sits on the <h3>, which has no href of its own; the link
# is the <a> directly inside it. The plural find_elements_* returns a list, so
# it can be iterated (the singular form returns one element and cannot).
headlines_list = main_news_container.find_elements_by_css_selector(
    "h3[class='cnn-search__result-headline']>a"
)

# One URL and one headline text per matched <a>, in page order.
links = [elem.get_attribute("href") for elem in headlines_list]
headlines = [elem.text for elem in headlines_list]

# Body-text elements inside the results container.
body_elements = main_news_container.find_elements_by_class_name("cnn-search__result-body")

# Guard the sample print: links is empty when nothing matched.
if links:
    print(links[0])
print(len(headlines))
print(len(body_elements))

Selenium 中的 get_attribute("href") 返回 None

get_attribute("href") returns None in Selenium

python

selenium

attributes

href

web-scraping