我如何使用存储在一个变量中的 chromedriver 打开许多链接
how i can open up many links with chromedriver that are stored in one variable
所以我现在写了这段代码来网络抓取 cnn 并获取关于特定主题的文章:
# Scrape the CNN search page for a user-supplied topic and expose the
# parsed HTML as the module-level `soup` used by the helper functions below.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

# input() already returns a str, so the extra str()/f-string round-trips
# of the original were redundant.
search = input('What News are you looking for today? ')
url = f'https://edition.cnn.com/search?q={search}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
time.sleep(4)  # crude fixed wait for the JS-rendered results to appear
soup = BeautifulSoup(driver.page_source, 'html.parser')
#driver.close()

# JSON file where {title: url} pairs are persisted between runs.
SAVED_DATA = "data.json"
def save_data(filepath, data):
    """Write *data* to *filepath* as JSON, replacing any existing content."""
    with open(filepath, "w") as out_file:
        json.dump(data, out_file)
def load_data(filepath):
    """Return the JSON object stored at *filepath*, or {} when the file
    is missing or unreadable/invalid.

    A missing file is the normal first-run case, so it is treated as
    "no data yet" rather than an error.
    """
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    # Narrow except: the original bare `except:` would also silently
    # swallow unrelated bugs (e.g. KeyboardInterrupt, NameError).
    except (OSError, json.JSONDecodeError):
        return {}
def only_get_title():
    """Return the headline text of the FIRST search result only.

    NOTE: the `return` sits inside the loop, so the function exits on
    the first iteration -- this is exactly the bug the question asks
    about. Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        return link.text
def get_href():
    """Return the absolute URL of the FIRST search result only.

    The `return` inside the loop exits after one iteration, so later
    results are never reached (the bug discussed in this question).
    Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...")
        return 'https:' + link.get('href')
def store():
    """Persist the first result as a {title: url} entry in SAVED_DATA."""
    saved = load_data(SAVED_DATA)
    key = only_get_title()
    value = get_href()
    saved[key] = value
    save_data(SAVED_DATA, saved)
    print("News saved!")
# Script entry point: scrape once and save the (first) result.
if __name__ == '__main__':
    store()
我的问题是：abs_url 中会依次得到许多链接（CNN 上关于该主题的不同文章）。我想逐一访问这些链接并保存数据，但代码只会处理 abs_url 中的第一个链接，而不会处理其余的。如代码所示，我该怎样遍历每个链接，并把它们都保存到我的 json 文件中？
你在 for 循环内部使用了 return，所以函数在处理第一个链接后就退出了。
你应该把所有链接添加到一个列表中，并在 for 循环结束之后再 return 这个列表。
def get_href():
    """Collect the absolute URLs of ALL search results into a list.

    Appending inside the loop and returning *after* it fixes the
    original early-return bug.
    """
    all_results = []
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...").
        # The unused `title` local from the original version was dropped.
        all_results.append('https:' + h3.get('href'))
    return all_results
获取标题的函数也有同样的问题：
def only_get_title():
    """Return a list with the headline text of every search result."""
    return [h3.text
            for h3 in soup.select('h3.cnn-search__result-headline > a')]
之后你需要用 for 循环配合 zip() 把标题和链接组成 (title, url) 对：
def store():
    """Pair every title with its URL and persist them all as JSON."""
    saved = load_data(SAVED_DATA)
    titles = only_get_title()
    urls = get_href()
    for headline, link in zip(titles, urls):
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
但在一个函数中获取标题和 url 并在添加到列表时创建对可能会更简单、更易读
def get_articles():
    """Return a list of (title, absolute_url) pairs, one per result."""
    results = []
    for anchor in soup.select('h3.cnn-search__result-headline > a'):
        pair = (anchor.text, 'https:' + anchor.get('href'))
        results.append(pair)
    return results
def store():
    """Save every (title, url) pair produced by get_articles()."""
    saved = load_data(SAVED_DATA)
    for headline, link in get_articles():
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
当你想从文章中获取更多详细信息时，这种写法也更安全：如果某篇文章缺少某些信息，你可以填入 None 或默认值；而使用分开的函数时可能会跳过空元素，导致之后 zip() 组成错误的配对（元组）。
所以我现在写了这段代码来网络抓取 cnn 并获取关于特定主题的文章:
# Scrape the CNN search page for a user-supplied topic and expose the
# parsed HTML as the module-level `soup` used by the helper functions below.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

# input() already returns a str, so the extra str()/f-string round-trips
# of the original were redundant.
search = input('What News are you looking for today? ')
url = f'https://edition.cnn.com/search?q={search}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
time.sleep(4)  # crude fixed wait for the JS-rendered results to appear
soup = BeautifulSoup(driver.page_source, 'html.parser')
#driver.close()

# JSON file where {title: url} pairs are persisted between runs.
SAVED_DATA = "data.json"
def save_data(filepath, data):
    """Write *data* to *filepath* as JSON, replacing any existing content."""
    with open(filepath, "w") as out_file:
        json.dump(data, out_file)
def load_data(filepath):
    """Return the JSON object stored at *filepath*, or {} when the file
    is missing or unreadable/invalid.

    A missing file is the normal first-run case, so it is treated as
    "no data yet" rather than an error.
    """
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    # Narrow except: the original bare `except:` would also silently
    # swallow unrelated bugs (e.g. KeyboardInterrupt, NameError).
    except (OSError, json.JSONDecodeError):
        return {}
def only_get_title():
    """Return the headline text of the FIRST search result only.

    NOTE: the `return` sits inside the loop, so the function exits on
    the first iteration -- this is exactly the bug the question asks
    about. Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        return link.text
def get_href():
    """Return the absolute URL of the FIRST search result only.

    The `return` inside the loop exits after one iteration, so later
    results are never reached (the bug discussed in this question).
    Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...")
        return 'https:' + link.get('href')
def store():
    """Persist the first result as a {title: url} entry in SAVED_DATA."""
    saved = load_data(SAVED_DATA)
    key = only_get_title()
    value = get_href()
    saved[key] = value
    save_data(SAVED_DATA, saved)
    print("News saved!")
# Script entry point: scrape once and save the (first) result.
if __name__ == '__main__':
    store()
我的问题是：abs_url 中会依次得到许多链接（CNN 上关于该主题的不同文章）。我想逐一访问这些链接并保存数据，但代码只会处理 abs_url 中的第一个链接，而不会处理其余的。如代码所示，我该怎样遍历每个链接，并把它们都保存到我的 json 文件中？
你在 for 循环内部使用了 return，所以函数在处理第一个链接后就退出了。
你应该把所有链接添加到一个列表中，并在 for 循环结束之后再 return 这个列表。
def get_href():
    """Collect the absolute URLs of ALL search results into a list.

    Appending inside the loop and returning *after* it fixes the
    original early-return bug.
    """
    all_results = []
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...").
        # The unused `title` local from the original version was dropped.
        all_results.append('https:' + h3.get('href'))
    return all_results
获取标题的函数也有同样的问题：
def only_get_title():
    """Return a list with the headline text of every search result."""
    return [h3.text
            for h3 in soup.select('h3.cnn-search__result-headline > a')]
之后你需要用 for 循环配合 zip() 把标题和链接组成 (title, url) 对：
def store():
    """Pair every title with its URL and persist them all as JSON."""
    saved = load_data(SAVED_DATA)
    titles = only_get_title()
    urls = get_href()
    for headline, link in zip(titles, urls):
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
但在一个函数中获取标题和 url 并在添加到列表时创建对可能会更简单、更易读
def get_articles():
    """Return a list of (title, absolute_url) pairs, one per result."""
    results = []
    for anchor in soup.select('h3.cnn-search__result-headline > a'):
        pair = (anchor.text, 'https:' + anchor.get('href'))
        results.append(pair)
    return results
def store():
    """Save every (title, url) pair produced by get_articles()."""
    saved = load_data(SAVED_DATA)
    for headline, link in get_articles():
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
当你想从文章中获取更多详细信息时，这种写法也更安全：如果某篇文章缺少某些信息，你可以填入 None 或默认值；而使用分开的函数时可能会跳过空元素，导致之后 zip() 组成错误的配对（元组）。