如何根据标题抓取新闻内容?
How can I scrape the content of a news, based on its title?
我的个人小工具是为了好玩而构建的。我有一个列表框,其中的标题和新闻时间是从 2 个链接中抓取的,并在单击“查看标题”按钮后打印在列表框中。这工作正常。一切顺利!
现在我想select列表框中的报纸标题,单击“查看内容”按钮,然后在多行文本框中查看新闻内容。所以我想在下面的文本框中查看selected标题的新闻内容。 我指定标题与新闻内容link相同。但是我在构建这个函数时遇到了问题:
# NOTE(review): question's draft, kept verbatim — indentation was lost in
# extraction, and `title`, `driver` and `div` are undefined in this scope.
# The selector "text mbottom" looks like CSS classes and would need leading
# dots (".text.mbottom") — TODO confirm against the page's HTML.
def content():
if title.select:
#click on title-link
driver.find_element_by_tag_name("title").click()
#Download Content to class for every title
content_download =(" ".join([span.text for span in div.select("text mbottom")]))
#Print Content in textobox
textbox_download.insert(tk.END, content_download)
所以我想象要得到这个,我们必须模拟点击新闻标题打开它(在html中是title
),然后select内容的文本(在 html 中是 text mbottom
),然后将其复制到我的文件的文本框中。应该是这样吗?你们觉得呢?显然我的代码写得不好,它不起作用。我不太擅长抓取。谁能帮帮我?谢谢
完整的代码是这样的(可以正确执行并抓取标题和时间;我没有在按钮的命令中调用 content 函数)。除了上述功能外,代码运行良好并获取标题和新闻时间
# Question's original script, re-indented: the scrape had stripped all leading
# whitespace, which made the file a SyntaxError. Structure restored below.
from tkinter import *
from tkinter import ttk
import tkinter as tk
import sqlite3
import random
import tkinter.font as tkFont
from tkinter import ttk

window = Tk()
window.title("x")
window.geometry("800x800")

textbox_title = tk.Listbox(window, width=80, height=16, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white")
textbox_title.place(x=1, y=1)
textbox_download = tk.Listbox(window, width=80, height=15, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white")
textbox_download.place(x=1, y=340)


# Download All Titles and Time
def all_titles():
    """Scrape title + time from each club page and fill the title listbox."""
    allnews = []
    import requests
    from bs4 import BeautifulSoup
    # mock browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    # ATALANTA
    site_atalanta = requests.get('https://www.tuttomercatoweb.com/atalanta/', headers=headers)
    soup = BeautifulSoup(site_atalanta.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for each in news:
        for div in each.find_all("div"):
            time = (div.find('span', attrs={'class': 'hh serif'}).text)
            title = (" ".join([span.text for span in div.select("a > span")]))
            news = (f" {time} {'ATALANTA'}, {title} (TMW)")
            allnews.append(news)
    # BOLOGNA
    site_bologna = requests.get('https://www.tuttomercatoweb.com/bologna/', headers=headers)
    soup = BeautifulSoup(site_bologna.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for each in news:
        for div in each.find_all("div"):
            time = (div.find('span', attrs={'class': 'hh serif'}).text)
            title = (" ".join([span.text for span in div.select("a > span")]))
            news = (f" {time} {'BOLOGNA'}, {title} (TMW)")
            allnews.append(news)
    allnews.sort(reverse=True)
    for news in allnews:
        textbox_title.insert(tk.END, news)


# Download Content of News
def content():
    # NOTE(review): broken as written — `titolo`, `driver` and `div` are
    # undefined in this scope; kept for fidelity with the question.
    if titolo.select:
        # click on title-link
        driver.find_element_by_tag_name("title").click()
        # Download Content to class for every title
        content_download = (" ".join([span.text for span in div.select("text mbottom")]))
        # Print Content in textobox
        textbox_download.insert(tk.END, content_download)


button = tk.Button(window, text="View Titles", command=lambda: [all_titles()])
button.place(x=1, y=680)
button2 = tk.Button(window, text="View Content", command=lambda: [content()])
button2.place(x=150, y=680)
window.mainloop()
当您获得 title
和 time
时,您可以直接获得 link
以包含详细信息的页面 - 并将它们保持成对。
news = f" {time} '{place}', {title} (TMW)"
link = div.find('a')['href']
results.append( [news, link] )
以后你只能显示 news
但是当你 select 标题时你可以获取索引并从 allnews
获取 link
并直接下载它 - 使用 requests
而不是 driver
# NOTE(review): answer excerpt (indentation lost in extraction) — take the
# index of the selected listbox row and look up its [news, link] pair in
# `allnews`; `url = item[1]` is the article link to download with `requests`.
def content():
# tuple with indexes of all selected titles
selection = listbox_title.curselection()
print('selection:', selection)
if selection:
item = allnews[selection[-1]]
print('item:', item)
url = item[1]
print('url:', url)
要select完整新闻,您必须使用带点的select(".text.mbottom")
。
要显示新闻,最好使用 Text()
而不是 Listbox()
因为 运行 ATALANTA
和 BOLOGNA
的代码相同,所以我将此代码移至函数 get_data_for(place)
现在我甚至可以使用 for 循环来处理更多球队。
for place in ['atalanta', 'bologna']:
results = get_data_for(place)
allnews += results
完整的工作代码 (1) - 我试图只保留重要的元素。
我使用 pack()
而不是 place()
因为它允许调整大小 window 并且它也会调整 Listbox()
和 Text()
# Full working code (1): titles and links kept in pairs; article content is
# fetched with `requests` instead of Selenium. (Re-indented: the scrape had
# stripped all leading whitespace, which made the script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return a list of [news_text, link] pairs."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for each in news:
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([news, link])
    return results


def all_titles():
    """Fill the listbox with sorted `[news, link]` items from every club."""
    global allnews  # inform function to use global variable instead of local variable
    allnews = []
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
    allnews.sort(reverse=True)
    listbox_title.delete('0', 'end')
    for news in allnews:
        listbox_title.insert('end', news[0])


# Download Content of News
def content():
    """Download and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = allnews[selection[-1]]
        print('item:', item)
        url = item[1]
        print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
结果:
编辑:
排序问题:今天的标题在列表的末尾,但它们应该在开头 - 所有这些都是因为它们仅使用 time
进行排序,但需要使用 date time
进行排序或 number time
.
你会 enumerate
每 tcc-list-news
然后每一天都会有自己的号码,他们会(几乎)正确排序。因为你想以相反的顺序排序,所以你可能需要 -number
而不是 number
来获得正确的顺序。
for number, each in enumerate(news):
for div in each.find_all("div"):
time = div.find('span', attrs={'class': 'hh serif'}).text
title = " ".join(span.text for span in div.select("a > span"))
news = f" {time} {place.upper()}, {title} (TMW)"
link = div.find('a')['href']
results.append( [-number, news, link] )
排序后
for number, news, url in allnews:
listbox_title.insert('end', news)
完整的工作代码 (2)
# Full working code (2): adds a per-day `-number` sort key so today's news
# sorts first even with time-only timestamps. (Re-indented: the scrape had
# stripped all leading whitespace, which made the script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return [-day_number, news_text, link] triples."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            # negative so reverse sort keeps day blocks in original order
            results.append([-number, news, link])
    return results


def all_titles():
    """Fill the listbox with sorted items from every club page."""
    global allnews  # inform function to use global variable instead of local variable
    allnews = []
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
    allnews.sort(reverse=True)
    listbox_title.delete('0', 'end')
    for number, news, url in allnews:
        listbox_title.insert('end', news)


# Download Content of News
def content():
    """Download and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = allnews[selection[-1]]
        print('item:', item)
        url = item[2]  # third field holds the link
        print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
顺便说一句
因为您以相反的顺序排序,所以您在 00:30 atalanta
之前得到 00:30 bologna
- 要在 00:30 bologna
之前得到 00:30 atalanta
,您必须保留 time
, place
作为分隔值,并在 sort()
中使用 key=
来分配仅反转 time
而不会反转 place
和 number
的函数。也许将所有内容都放在 pandas.DataFrame
中会更简单,因为 pandas.DataFrame
有更好的排序方法。
带有 pandas.DataFrame
和 sort_values()
的版本
df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
如果您更改顺序 'title', 'place'
而不是 'place', 'title'
,那么您会得到相同的标题。
完整的工作代码 (3)
# Full working code (3): keeps number/time/place/title as separate columns in
# a pandas.DataFrame and sorts with sort_values. (Re-indented: the scrape had
# stripped all leading whitespace, which made the script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
import pandas as pd
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return [number, time, place, title, news, link] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([number, time, place, title, news, link])
    return results


def all_titles():
    """Scrape every club page, sort rows in a DataFrame, fill the listbox."""
    global df
    allnews = []  # local variable
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")
    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    # only `time` descending, so day blocks and places keep natural order
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    df = df.reset_index()
    listbox_title.delete('0', 'end')
    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])


# Download Content of News
def content():
    """Download and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = df.iloc[selection[-1]]
        #print('item:', item)
        url = item['link']
        #print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

df = None  # filled by all_titles()

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
编辑:
最新版本
- ScrolledText
- Scrollbar
double click
标题看新闻
- requests_cache to read page with news only once even if you click it many times (it may need to install SQLite)
完整的工作代码 (4)
# Full working code (4): ScrolledText, a Scrollbar on the listbox, double-click
# to open an article, and requests_cache so each article page downloads once.
# (Re-indented: the scrape had stripped all leading whitespace, which made the
# script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText  # https://docs.python.org/3/library/tkinter.scrolledtext.html
import requests
import requests_cache  # https://github.com/reclosedev/requests-cache
from bs4 import BeautifulSoup
import pandas as pd
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return [number, time, place, title, news, link] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([number, time, place, title, news, link])
    return results


def all_titles():
    """Scrape every club page, sort rows in a DataFrame, fill the listbox."""
    global df
    allnews = []  # local variable
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")
    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    df = df.reset_index()
    listbox_title.delete('0', 'end')
    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])


def content(event=None):  # `command=` executes without `event`, but `bind` executes with `event` - so it needs default value
    """Download (with caching) and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = df.iloc[selection[-1]]
        #print('item:', item)
        url = item['link']
        #print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        # keep page in database `SQLite`
        # https://github.com/reclosedev/requests-cache
        # https://sqlite.org/index.html
        session = requests_cache.CachedSession('titles')
        response = session.get(url, headers=headers)
        #response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

df = None  # filled by all_titles()

window = tk.Tk()
window.geometry("800x800")

# ---
# [Tkinter: How to display Listbox with Scrollbar — furas.pl](https://blog.furas.pl/python-tkitner-how-to-display-listbox-with-scrollbar-gb.html)
frame_title = tk.Frame(window)
frame_title.pack(fill='both', expand=True, pady=5, padx=5)

listbox_title = tk.Listbox(frame_title, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(side='left', fill='both', expand=True)

scrollbar_title = tk.Scrollbar(frame_title)
scrollbar_title.pack(side='left', fill='y')

scrollbar_title['command'] = listbox_title.yview
listbox_title.config(yscrollcommand=scrollbar_title.set)

listbox_title.bind('<Double-Button-1>', content)  # it executes `content(event)`

# ----

text_download = ScrolledText(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

# ----

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
我的个人小工具是为了好玩而构建的。我有一个列表框,其中的标题和新闻时间是从 2 link 秒中抓取的,并在单击“查看标题”按钮后打印在列表框中。这工作正常。一切顺利!
现在我想select列表框中的报纸标题,单击“查看内容”按钮,然后在多行文本框中查看新闻内容。所以我想在下面的文本框中查看selected标题的新闻内容。 我指定标题与新闻内容link相同。但是我在构建这个函数时遇到了问题:
# NOTE(review): question's draft, kept verbatim — indentation was lost in
# extraction, and `title`, `driver` and `div` are undefined here. The selector
# "text mbottom" looks like CSS classes needing leading dots (".text.mbottom")
# — TODO confirm against the page's HTML.
def content():
if title.select:
#click on title-link
driver.find_element_by_tag_name("title").click()
#Download Content to class for every title
content_download =(" ".join([span.text for span in div.select("text mbottom")]))
#Print Content in textobox
textbox_download.insert(tk.END, content_download)
所以我想象要得到这个,我们必须模拟点击新闻标题打开它(在html中是title
),然后select内容的文本(在 html 中是 text mbottom
),然后将其复制到我的文件的文本框中。应该是这样吗?你们觉得呢?显然我的代码写得不好,它不起作用。我不太擅长抓取。谁能帮帮我?谢谢
完整的代码是这样的(可以正确执行并抓取标题和时间;我没有在按钮的命令中调用 content 函数)。除了上述功能外,代码运行良好并获取标题和新闻时间
# Question's original script, re-indented: the scrape had stripped all leading
# whitespace, which made the file a SyntaxError. Structure restored below.
from tkinter import *
from tkinter import ttk
import tkinter as tk
import sqlite3
import random
import tkinter.font as tkFont
from tkinter import ttk

window = Tk()
window.title("x")
window.geometry("800x800")

textbox_title = tk.Listbox(window, width=80, height=16, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white")
textbox_title.place(x=1, y=1)
textbox_download = tk.Listbox(window, width=80, height=15, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white")
textbox_download.place(x=1, y=340)


# Download All Titles and Time
def all_titles():
    """Scrape title + time from each club page and fill the title listbox."""
    allnews = []
    import requests
    from bs4 import BeautifulSoup
    # mock browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    # ATALANTA
    site_atalanta = requests.get('https://www.tuttomercatoweb.com/atalanta/', headers=headers)
    soup = BeautifulSoup(site_atalanta.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for each in news:
        for div in each.find_all("div"):
            time = (div.find('span', attrs={'class': 'hh serif'}).text)
            title = (" ".join([span.text for span in div.select("a > span")]))
            news = (f" {time} {'ATALANTA'}, {title} (TMW)")
            allnews.append(news)
    # BOLOGNA
    site_bologna = requests.get('https://www.tuttomercatoweb.com/bologna/', headers=headers)
    soup = BeautifulSoup(site_bologna.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for each in news:
        for div in each.find_all("div"):
            time = (div.find('span', attrs={'class': 'hh serif'}).text)
            title = (" ".join([span.text for span in div.select("a > span")]))
            news = (f" {time} {'BOLOGNA'}, {title} (TMW)")
            allnews.append(news)
    allnews.sort(reverse=True)
    for news in allnews:
        textbox_title.insert(tk.END, news)


# Download Content of News
def content():
    # NOTE(review): broken as written — `titolo`, `driver` and `div` are
    # undefined in this scope; kept for fidelity with the question.
    if titolo.select:
        # click on title-link
        driver.find_element_by_tag_name("title").click()
        # Download Content to class for every title
        content_download = (" ".join([span.text for span in div.select("text mbottom")]))
        # Print Content in textobox
        textbox_download.insert(tk.END, content_download)


button = tk.Button(window, text="View Titles", command=lambda: [all_titles()])
button.place(x=1, y=680)
button2 = tk.Button(window, text="View Content", command=lambda: [content()])
button2.place(x=150, y=680)
window.mainloop()
当您获得 title
和 time
时,您可以直接获得 link
以包含详细信息的页面 - 并将它们保持成对。
news = f" {time} '{place}', {title} (TMW)"
link = div.find('a')['href']
results.append( [news, link] )
以后你只能显示 news
但是当你 select 标题时你可以获取索引并从 allnews
获取 link
并直接下载它 - 使用 requests
而不是 driver
# NOTE(review): answer excerpt (indentation lost in extraction) — take the
# index of the selected listbox row and look up its [news, link] pair in
# `allnews`; `url = item[1]` is the article link to download with `requests`.
def content():
# tuple with indexes of all selected titles
selection = listbox_title.curselection()
print('selection:', selection)
if selection:
item = allnews[selection[-1]]
print('item:', item)
url = item[1]
print('url:', url)
要select完整新闻,您必须使用带点的select(".text.mbottom")
。
要显示新闻,最好使用 Text()
而不是 Listbox()
因为 运行 ATALANTA
和 BOLOGNA
的代码相同,所以我将此代码移至函数 get_data_for(place)
现在我甚至可以使用 for 循环来处理更多球队。
for place in ['atalanta', 'bologna']:
results = get_data_for(place)
allnews += results
完整的工作代码 (1) - 我试图只保留重要的元素。
我使用 pack()
而不是 place()
因为它允许调整大小 window 并且它也会调整 Listbox()
和 Text()
# Full working code (1): titles and links kept in pairs; article content is
# fetched with `requests` instead of Selenium. (Re-indented: the scrape had
# stripped all leading whitespace, which made the script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return a list of [news_text, link] pairs."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for each in news:
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([news, link])
    return results


def all_titles():
    """Fill the listbox with sorted `[news, link]` items from every club."""
    global allnews  # inform function to use global variable instead of local variable
    allnews = []
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
    allnews.sort(reverse=True)
    listbox_title.delete('0', 'end')
    for news in allnews:
        listbox_title.insert('end', news[0])


# Download Content of News
def content():
    """Download and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = allnews[selection[-1]]
        print('item:', item)
        url = item[1]
        print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
结果:
编辑:
排序问题:今天的标题在列表的末尾,但它们应该在开头 - 所有这些都是因为它们仅使用 time
进行排序,但需要使用 date time
进行排序或 number time
.
你会 enumerate
每 tcc-list-news
然后每一天都会有自己的号码,他们会(几乎)正确排序。因为你想以相反的顺序排序,所以你可能需要 -number
而不是 number
来获得正确的顺序。
for number, each in enumerate(news):
for div in each.find_all("div"):
time = div.find('span', attrs={'class': 'hh serif'}).text
title = " ".join(span.text for span in div.select("a > span"))
news = f" {time} {place.upper()}, {title} (TMW)"
link = div.find('a')['href']
results.append( [-number, news, link] )
排序后
for number, news, url in allnews:
listbox_title.insert('end', news)
完整的工作代码 (2)
# Full working code (2): adds a per-day `-number` sort key so today's news
# sorts first even with time-only timestamps. (Re-indented: the scrape had
# stripped all leading whitespace, which made the script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return [-day_number, news_text, link] triples."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            # negative so reverse sort keeps day blocks in original order
            results.append([-number, news, link])
    return results


def all_titles():
    """Fill the listbox with sorted items from every club page."""
    global allnews  # inform function to use global variable instead of local variable
    allnews = []
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
    allnews.sort(reverse=True)
    listbox_title.delete('0', 'end')
    for number, news, url in allnews:
        listbox_title.insert('end', news)


# Download Content of News
def content():
    """Download and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = allnews[selection[-1]]
        print('item:', item)
        url = item[2]  # third field holds the link
        print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
顺便说一句
因为您以相反的顺序排序,所以您在 00:30 atalanta
之前得到 00:30 bologna
- 要在 00:30 bologna
之前得到 00:30 atalanta
,您必须保留 time
, place
作为分隔值,并在 sort()
中使用 key=
来分配仅反转 time
而不会反转 place
和 number
的函数。也许将所有内容都放在 pandas.DataFrame
中会更简单,因为 pandas.DataFrame
有更好的排序方法。
带有 pandas.DataFrame
和 sort_values()
df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
如果您更改顺序 'title', 'place'
而不是 'place', 'title'
,那么您会得到相同的标题。
完整的工作代码 (3)
# Full working code (3): keeps number/time/place/title as separate columns in
# a pandas.DataFrame and sorts with sort_values. (Re-indented: the scrape had
# stripped all leading whitespace, which made the script a SyntaxError.)
import tkinter as tk  # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
import pandas as pd
# PEP8: all imports at the beginning


# --- functions ---  # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one club page; return [number, time, place, title, news, link] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])
    soup = BeautifulSoup(response.content, 'html.parser')
    news = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([number, time, place, title, news, link])
    return results


def all_titles():
    """Scrape every club page, sort rows in a DataFrame, fill the listbox."""
    global df
    allnews = []  # local variable
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")
    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    # only `time` descending, so day blocks and places keep natural order
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    df = df.reset_index()
    listbox_title.delete('0', 'end')
    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])


# Download Content of News
def content():
    """Download and display the article for the selected title."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if selection:
        item = df.iloc[selection[-1]]
        #print('item:', item)
        url = item['link']
        #print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)


# --- main ---

df = None  # filled by all_titles()

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content)  # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0, 5))

window.mainloop()
编辑:
最新版本
- ScrolledText
- Scrollbar
double click
标题看新闻- requests_cache to read page with news only once even if you click it many times (it may need to install SQLite)
完整的工作代码 (4)
import tkinter as tk # PEP8: `import *` is not preferred
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText # https://docs.python.org/3/library/tkinter.scrolledtext.html
import requests
import requests_cache # https://github.com/reclosedev/requests-cache
from bs4 import BeautifulSoup
import pandas as pd
# PEP8: all imports at the beginning
# --- functions --- # PEP8: all functions directly after imports
def get_data_for(place):
    """Scrape headline rows from one team page on tuttomercatoweb.com.

    Parameters
    ----------
    place : str
        Team path segment of the site URL, e.g. ``'atalanta'``.

    Returns
    -------
    list[list]
        One ``[number, time, place, title, news, link]`` row per headline,
        where ``news`` is the display string shown in the listbox.
    """
    # Mock a browser request so the site does not reject the scraper.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    results = []
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])
    soup = BeautifulSoup(response.content, 'html.parser')
    # Renamed from `news`: the original reassigned `news` inside the loop,
    # shadowing the very collection being enumerated.
    sections = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, section in enumerate(sections):
        for div in section.find_all("div"):
            time_tag = div.find('span', attrs={'class': 'hh serif'})
            if time_tag is None:
                # Skip wrapper divs that carry no timestamp (the original
                # crashed with AttributeError on such divs).
                continue
            time = time_tag.text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([number, time, place, title, news, link])
    return results
def all_titles(places=('atalanta', 'bologna')):
    """Scrape headlines for every team in *places* and list them in the GUI.

    Rebuilds the global DataFrame ``df`` (one row per headline, sorted newest
    first within each batch) and refills ``listbox_title`` with the display
    strings. Progress is appended to ``text_download``.

    Parameters
    ----------
    places : iterable of str, optional
        Team path segments to scrape; defaults to the original hard-coded
        pair, so existing ``command=all_titles`` callers are unaffected.
    """
    global df
    allnews = []  # local accumulator for rows from every team page
    for place in places:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")
    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    # Newest time first; number/place/title keep a stable secondary order.
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    df = df.reset_index()
    # Refill the listbox so its row indices match df's row positions.
    listbox_title.delete('0', 'end')
    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])
def content(event=None): # `command=` executes without `event`, but `bind` executes with `event` - so it needs default value
    """Download and display the article body for the selected headline.

    Works both as a button callback and as a ``<Double-Button-1>`` handler.
    Reads the last-selected row from the global DataFrame ``df``, fetches the
    article (through a cached session, so repeated clicks hit the network only
    once), and shows the extracted text in ``text_download``.
    """
    # Guard: "View Content" clicked before "View Titles" - nothing scraped yet.
    if df is None:
        return
    # curselection() returns a tuple of selected indices (possibly empty).
    selection = listbox_title.curselection()
    print('selection:', selection)
    if selection:
        row = df.iloc[selection[-1]]
        #print('row:', row)
        url = row['link']
        #print('url:', url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        # keep page in database `SQLite`
        # https://github.com/reclosedev/requests-cache
        # https://sqlite.org/index.html
        session = requests_cache.CachedSession('titles')
        response = session.get(url, headers=headers)
        #response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Renamed loop variable: the original `item` shadowed the selected row.
        content_download = "\n".join(div.get_text() for div in soup.select("div.text.mbottom"))
        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)
# --- main ---
# Shared state: `df` is rebuilt by all_titles() and read by content().
df = None
window = tk.Tk()
window.geometry("800x800")
# ---
# [Tkinter: How to display Listbox with Scrollbar — furas.pl](https://blog.furas.pl/python-tkitner-how-to-display-listbox-with-scrollbar-gb.html)
# Headline list with a vertical scrollbar; pack order places the scrollbar
# to the right of the listbox inside the shared frame.
frame_title = tk.Frame(window)
frame_title.pack(fill='both', expand=True, pady=5, padx=5)
listbox_title = tk.Listbox(frame_title, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(side='left', fill='both', expand=True)
scrollbar_title = tk.Scrollbar(frame_title)
scrollbar_title.pack(side='left', fill='y')
# Two-way wiring: scrollbar drives the listbox view and vice versa.
scrollbar_title['command'] = listbox_title.yview
listbox_title.config(yscrollcommand=scrollbar_title.set)
listbox_title.bind('<Double-Button-1>', content) # it executes `content(event)`
# ----
# ScrolledText bundles its own scrollbar for the article body.
text_download = ScrolledText(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)
# ----
buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')
button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles) # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content) # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))
window.mainloop()