我如何使用存储在一个变量中的 chromedriver 打开许多链接
how i can open up many links with chromedriver that are stored in one variable
所以我现在写了这段代码来网络抓取 cnn 并获取关于特定主题的文章:
# Scrape the CNN search page for a user-supplied topic and expose the
# parsed HTML as the module-level `soup` used by the helper functions below.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

# input() already returns a str, so the extra str()/f-string round-trips
# of the original were redundant.
search = input('What News are you looking for today? ')
url = f'https://edition.cnn.com/search?q={search}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
time.sleep(4)  # crude fixed wait for the JS-rendered results to appear
soup = BeautifulSoup(driver.page_source, 'html.parser')
#driver.close()

# JSON file where {title: url} pairs are persisted between runs.
SAVED_DATA = "data.json"
def save_data(filepath, data):
    """Write *data* to *filepath* as JSON, replacing any existing content."""
    with open(filepath, "w") as out_file:
        json.dump(data, out_file)
def load_data(filepath):
    """Return the JSON object stored at *filepath*, or {} when the file
    is missing or unreadable/invalid.

    A missing file is the normal first-run case, so it is treated as
    "no data yet" rather than an error.
    """
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    # Narrow except: the original bare `except:` would also silently
    # swallow unrelated bugs (e.g. KeyboardInterrupt, NameError).
    except (OSError, json.JSONDecodeError):
        return {}
def only_get_title():
    """Return the headline text of the FIRST search result only.

    NOTE: the `return` sits inside the loop, so the function exits on
    the first iteration -- this is exactly the bug the question asks
    about. Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        return link.text
def get_href():
    """Return the absolute URL of the FIRST search result only.

    The `return` inside the loop exits after one iteration, so later
    results are never reached (the bug discussed in this question).
    Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...")
        return 'https:' + link.get('href')
def store():
    """Persist the first result as a {title: url} entry in SAVED_DATA."""
    saved = load_data(SAVED_DATA)
    key = only_get_title()
    value = get_href()
    saved[key] = value
    save_data(SAVED_DATA, saved)
    print("News saved!")
# Script entry point: scrape once and save the (first) result.
if __name__ == '__main__':
    store()
我的问题是：abs_url 中会依次得到许多链接（CNN 上关于该主题的不同文章）。我想逐一访问这些链接并保存数据，但代码只会处理 abs_url 中的第一个链接，而不会处理其余的。如代码所示，我该怎样遍历每个链接，并把它们都保存到我的 json 文件中？
你在 for 循环内部使用了 return，所以函数在处理第一个链接后就退出了。
你应该把所有链接添加到一个列表中，并在 for 循环结束之后再 return 这个列表。
def get_href():
    """Collect the absolute URLs of ALL search results into a list.

    Appending inside the loop and returning *after* it fixes the
    original early-return bug.
    """
    all_results = []
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...").
        # The unused `title` local from the original version was dropped.
        all_results.append('https:' + h3.get('href'))
    return all_results
获取标题的函数也有同样的问题：
def only_get_title():
    """Return a list with the headline text of every search result."""
    return [h3.text
            for h3 in soup.select('h3.cnn-search__result-headline > a')]
之后你需要用 for 循环配合 zip() 把标题和链接组成 (title, url) 对：
def store():
    """Pair every title with its URL and persist them all as JSON."""
    saved = load_data(SAVED_DATA)
    titles = only_get_title()
    urls = get_href()
    for headline, link in zip(titles, urls):
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
但在一个函数中获取标题和 url 并在添加到列表时创建对可能会更简单、更易读
def get_articles():
    """Return a list of (title, absolute_url) pairs, one per result."""
    results = []
    for anchor in soup.select('h3.cnn-search__result-headline > a'):
        pair = (anchor.text, 'https:' + anchor.get('href'))
        results.append(pair)
    return results
def store():
    """Save every (title, url) pair produced by get_articles()."""
    saved = load_data(SAVED_DATA)
    for headline, link in get_articles():
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
当你想从文章中获取更多详细信息时，这种写法也更安全：如果某篇文章缺少某些信息，你可以填入 None 或默认值；而使用分开的函数时可能会跳过空元素，导致之后 zip() 组成错误的配对（元组）。
所以我现在写了这段代码来网络抓取 cnn 并获取关于特定主题的文章:
# Scrape the CNN search page for a user-supplied topic and expose the
# parsed HTML as the module-level `soup` used by the helper functions below.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

# input() already returns a str, so the extra str()/f-string round-trips
# of the original were redundant.
search = input('What News are you looking for today? ')
url = f'https://edition.cnn.com/search?q={search}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
time.sleep(4)  # crude fixed wait for the JS-rendered results to appear
soup = BeautifulSoup(driver.page_source, 'html.parser')
#driver.close()

# JSON file where {title: url} pairs are persisted between runs.
SAVED_DATA = "data.json"
def save_data(filepath, data):
    """Write *data* to *filepath* as JSON, replacing any existing content."""
    with open(filepath, "w") as out_file:
        json.dump(data, out_file)
def load_data(filepath):
    """Return the JSON object stored at *filepath*, or {} when the file
    is missing or unreadable/invalid.

    A missing file is the normal first-run case, so it is treated as
    "no data yet" rather than an error.
    """
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    # Narrow except: the original bare `except:` would also silently
    # swallow unrelated bugs (e.g. KeyboardInterrupt, NameError).
    except (OSError, json.JSONDecodeError):
        return {}
def only_get_title():
    """Return the headline text of the FIRST search result only.

    NOTE: the `return` sits inside the loop, so the function exits on
    the first iteration -- this is exactly the bug the question asks
    about. Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        return link.text
def get_href():
    """Return the absolute URL of the FIRST search result only.

    The `return` inside the loop exits after one iteration, so later
    results are never reached (the bug discussed in this question).
    Returns None when there are no results.
    """
    for link in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...")
        return 'https:' + link.get('href')
def store():
    """Persist the first result as a {title: url} entry in SAVED_DATA."""
    saved = load_data(SAVED_DATA)
    key = only_get_title()
    value = get_href()
    saved[key] = value
    save_data(SAVED_DATA, saved)
    print("News saved!")
# Script entry point: scrape once and save the (first) result.
if __name__ == '__main__':
    store()
我的问题是：abs_url 中会依次得到许多链接（CNN 上关于该主题的不同文章）。我想逐一访问这些链接并保存数据，但代码只会处理 abs_url 中的第一个链接，而不会处理其余的。如代码所示，我该怎样遍历每个链接，并把它们都保存到我的 json 文件中？
你在 for 循环内部使用了 return，所以函数在处理第一个链接后就退出了。
你应该把所有链接添加到一个列表中，并在 for 循环结束之后再 return 这个列表。
def get_href():
    """Collect the absolute URLs of ALL search results into a list.

    Appending inside the loop and returning *after* it fixes the
    original early-return bug.
    """
    all_results = []
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        # 'https:' prefix assumes the href is protocol-relative ("//...").
        # The unused `title` local from the original version was dropped.
        all_results.append('https:' + h3.get('href'))
    return all_results
获取标题的函数也有同样的问题：
def only_get_title():
    """Return a list with the headline text of every search result."""
    return [h3.text
            for h3 in soup.select('h3.cnn-search__result-headline > a')]
之后你需要用 for 循环配合 zip() 把标题和链接组成 (title, url) 对：
def store():
    """Pair every title with its URL and persist them all as JSON."""
    saved = load_data(SAVED_DATA)
    titles = only_get_title()
    urls = get_href()
    for headline, link in zip(titles, urls):
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
但在一个函数中获取标题和 url 并在添加到列表时创建对可能会更简单、更易读
def get_articles():
    """Return a list of (title, absolute_url) pairs, one per result."""
    results = []
    for anchor in soup.select('h3.cnn-search__result-headline > a'):
        pair = (anchor.text, 'https:' + anchor.get('href'))
        results.append(pair)
    return results
def store():
    """Save every (title, url) pair produced by get_articles()."""
    saved = load_data(SAVED_DATA)
    for headline, link in get_articles():
        saved[headline] = link
    save_data(SAVED_DATA, saved)
    print("News saved!")
当你想从文章中获取更多详细信息时，这种写法也更安全：如果某篇文章缺少某些信息，你可以填入 None 或默认值；而使用分开的函数时可能会跳过空元素，导致之后 zip() 组成错误的配对（元组）。