如何根据标题抓取新闻内容?

How can I scrape the content of a news, based on its title?

我的个人小工具是为了好玩而构建的。我有一个列表框,其中的标题和新闻时间是从 2 个 link 中抓取的,并在单击“查看标题”按钮后打印在列表框中。这工作正常。一切顺利!

现在我想select列表框中的报纸标题,单击“查看内容”按钮,然后在多行文本框中查看新闻内容。所以我想在下面的文本框中查看selected标题的新闻内容。 我指定标题与新闻内容link相同。但是我在构建这个函数时遇到了问题:

def content():
    # NOTE(review): `title` is not defined anywhere in this snippet, and even if
    # it were a widget, `title.select` is a bare method reference (always truthy)
    # — probably `curselection()` was intended. TODO confirm.
    if title.select:

        #click on title-link
        # NOTE(review): `driver` (a Selenium WebDriver?) is never created in this
        # file; also find_element_by_tag_name("title") would match the HTML
        # <title> tag of the page, not the headline link.
        driver.find_element_by_tag_name("title").click()

        #Download Content to class for every title
        # NOTE(review): `div` is undefined here, and select("text mbottom") looks
        # for <text> elements — CSS classes need leading dots: ".text.mbottom".
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))

        #Print Content in textobox
        textbox_download.insert(tk.END, content_download)

所以我的设想是:要实现这个功能,必须模拟点击新闻标题来打开它(在 HTML 中是 title),然后 select 内容的文本(在 HTML 中是 text mbottom),再将其复制到我窗口的 textbox 中。应该是这样吗?你们怎么看?显然我的代码写得不好,它不起作用。我不太擅长抓取。谁能帮帮我?谢谢

完整的代码是这样的(可以正确执行并抓取标题和时间;我没有在按钮中调用 content 函数)。除了上述函数外,代码运行良好,能获取标题和新闻时间

from tkinter import *
from tkinter import ttk
import tkinter as tk
import sqlite3
import random
import tkinter.font as tkFont
from tkinter import ttk

window=Tk()
window.title("x")
window.geometry("800x800")

# Upper listbox: scraped headlines are inserted here by all_titles().
textbox_title = tk.Listbox(window, width=80, height=16, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white") #previously this was self.tutti_pronostici, to display the calls from the other window
textbox_title.place(x=1, y=1)

# Lower listbox: intended to display the downloaded article content.
textbox_download = tk.Listbox(window, width=80, height=15, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white") #previously this was self.tutti_pronostici, to display the calls from the other window
textbox_download.place(x=1, y=340)

#Download All Titles and Time
def all_titles():
    """Scrape headline time/title lines for every team page and list them.

    Fetches the 'tcc-list-news' sections of each team page on
    tuttomercatoweb.com, formats one line per news item and inserts the
    lines (newest first) into the module-level ``textbox_title`` listbox.
    """
    # Imports kept function-local to mirror the original script's layout;
    # PEP 8 would place them at the top of the file.
    import requests
    from bs4 import BeautifulSoup

    # Mock a real browser so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    allnews = []

    # The ATALANTA and BOLOGNA sections were copy-pasted duplicates of the
    # same scraping code; one loop produces byte-identical output lines.
    for place in ('atalanta', 'bologna'):
        response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        for section in soup.find_all('div', attrs={"class": "tcc-list-news"}):
            for div in section.find_all("div"):
                time = div.find('span', attrs={'class': 'hh serif'}).text
                title = " ".join(span.text for span in div.select("a > span"))
                allnews.append(f" {time} {place.upper()}, {title} (TMW)")

    # Reverse lexicographic sort puts the latest time string first.
    allnews.sort(reverse=True)

    for news in allnews:
        textbox_title.insert(tk.END, news)

#Download Content of News
def content():
    # NOTE(review): `titolo` is never defined in this file — pressing the
    # button raises NameError immediately; `titolo.select` would also be a
    # truthy method reference, not a selection test.
    if titolo.select:

        #click on title-link
        # NOTE(review): `driver` (Selenium) is never created; the rest of the
        # script uses requests + BeautifulSoup, not a browser driver.
        driver.find_element_by_tag_name("title").click()

        #Download Content to class for every title
        # NOTE(review): `div` is undefined, and "text mbottom" is not a valid
        # class selector — CSS classes need dots: ".text.mbottom".
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))

        #Print Content in textobox
        textbox_download.insert(tk.END, content_download)



# Wire the buttons straight to the handler functions; wrapping the call in
# `lambda: [f()]` builds a throwaway one-element list and adds nothing.
button = tk.Button(window, text="View Titles", command=all_titles)
button.place(x=1, y=680)

button2 = tk.Button(window, text="View Content", command=content)
button2.place(x=150, y=680)

window.mainloop()

当您获取 title 和 time 时,您可以同时获取指向详情页面的 link——并将它们成对保存。

            news = f" {time} '{place}', {title} (TMW)"
            link = div.find('a')['href']

            results.append( [news, link] )

之后你可以只显示 news,但当你 select 某个标题时,可以获取其索引,从 allnews 中取出对应的 link 并直接下载——使用 requests 而不是 driver。

def content():
    # tuple with indexes of all selected titles
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:

        # `allnews` rows are [news_text, link] pairs built by all_titles(),
        # positionally aligned with the listbox entries.
        item = allnews[selection[-1]]
        print('item:', item)

        url = item[1]
        print('url:', url)

要select完整新闻,您必须使用带点的select(".text.mbottom")

要显示新闻,最好使用 Text() 而不是 Listbox()

因为 ATALANTA 和 BOLOGNA 运行的代码相同,所以我把这段代码移到了函数 get_data_for(place) 中。现在我甚至可以用 for 循环来处理更多球队。

for place in ['atalanta',  'bologna']:   # one pass per team page
    results = get_data_for(place)  # [news_text, link] pairs for this team
    allnews += results

完整的工作代码 (1) - 我试图只保留重要的元素。

我使用 pack() 而不是 place() 因为它允许调整大小 window 并且它也会调整 Listbox()Text()

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one team page and return ``[news_text, link]`` pairs.

    :param place: team path segment on tuttomercatoweb.com, e.g. ``'atalanta'``.
    :return: list of two-element lists ``[formatted_headline, article_url]``.
    """
    # Mock a real browser so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    results = []

    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Fix a naming smell: the original rebound the name `news` inside the very
    # loop that iterated over `news` — harmless at runtime, but fragile.
    sections = soup.find_all('div', attrs={"class": "tcc-list-news"})

    for section in sections:
        for div in section.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            text = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([text, link])

    return results

def all_titles():
    """Re-scrape every team page and refill the headline listbox."""
    global allnews  # rebind the module-level list so content() sees fresh data

    collected = []
    for place in ['atalanta',  'bologna']:
        print('search:', place)
        found = get_data_for(place)
        print('found:', len(found))
        collected.extend(found)

    allnews = collected
    # Reverse lexicographic sort: latest time string first.
    allnews.sort(reverse=True)

    listbox_title.delete('0', 'end')

    for pair in allnews:
        listbox_title.insert('end', pair[0])  # pair == [news_text, link]

#Download Content of News
def content():
    """Download and display the article body for the selected headline."""
    # curselection() yields a tuple of selected row indexes
    selection = listbox_title.curselection()
    print('selection:', selection)

    if not selection:
        return

    item = allnews[selection[-1]]
    print('item:', item)
    url = item[1]  # item == [news_text, link]
    print('url:', url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # CSS classes need the leading dots: <div class="text mbottom">
    paragraphs = [node.get_text() for node in soup.select("div.text.mbottom")]
    content_download = "\n".join(paragraphs)

    text_download.delete('1.0', 'end')  # remove previous content
    text_download.insert('end', content_download)

# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

# Listbox on top: one row per scraped headline.
listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

# Multi-line Text widget below: shows the downloaded article body.
text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()

结果:


编辑:

排序问题:今天的标题出现在列表末尾,但它们应该在开头——这是因为排序只使用了 time,而实际上需要按 date + time 或 number + time 排序。

如果你用 enumerate 给每个 tcc-list-news(每一天对应一个)编号,它们就会(几乎)正确排序。因为你想按相反顺序排序,所以可能需要用 -number 而不是 number 才能得到正确的顺序。

    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time  = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news  = f" {time} {place.upper()}, {title} (TMW)"
            link  = div.find('a')['href']
            results.append( [-number, news, link] )

排序后

    for number, news, url in allnews:
        listbox_title.insert('end', news)

完整的工作代码 (2)

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one team page; return ``[-section_number, news_text, link]`` rows.

    The negated section number lets ``sort(reverse=True)`` keep today's
    section (number 0) first while still sorting time strings descending.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    results = []

    # Each 'tcc-list-news' block is one day-section; enumerate() numbers them.
    for number, section in enumerate(soup.find_all('div', attrs={"class": "tcc-list-news"})):
        for div in section.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            line = f" {time} {place.upper()}, {title} (TMW)"
            results.append([-number, line, div.find('a')['href']])

    return results

def all_titles():
    """Re-scrape every team page and refill the headline listbox."""
    global allnews  # content() reads this module-level list later

    collected = []
    for place in ['atalanta',  'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        collected += results

    allnews = collected
    # [-number, news, link] rows: today's section first, latest time first.
    allnews.sort(reverse=True)

    listbox_title.delete('0', 'end')

    for _number, news, _url in allnews:
        listbox_title.insert('end', news)

#Download Content of News
def content():
    """Fetch and show the article body for the currently selected headline."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)

    if not selection:
        return

    item = allnews[selection[-1]]
    print('item:', item)
    url = item[2]  # item == [-number, news_text, link]
    print('url:', url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # CSS classes need the leading dots: <div class="text mbottom">
    parts = [node.get_text() for node in soup.select("div.text.mbottom")]

    text_download.delete('1.0', 'end')  # drop the previous article
    text_download.insert('end', "\n".join(parts))

# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

# Listbox on top: one row per scraped headline.
listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

# Multi-line Text widget below: shows the downloaded article body.
text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()

顺便说一句

因为您以相反的顺序排序,所以您在 00:30 atalanta 之前得到 00:30 bologna - 要在 00:30 bologna 之前得到 00:30 atalanta,您必须保留 timeplace 作为分隔值,并在 sort() 中使用 key= 来分配仅反转 time 而不会反转 placenumber 的函数。也许将所有内容都放在 pandas.DataFrame 中会更简单,因为 pandas.DataFrame 有更好的排序方法。


带有 pandas.DataFramesort_values()

的版本
df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])

如果您更改顺序 'title', 'place' 而不是 'place', 'title',那么您会得到相同的标题。

完整的工作代码 (3)

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
import pandas as pd

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one team page; return [number, time, place, title, news, link] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    results = []

    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])

    soup = BeautifulSoup(response.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    # `number` identifies the day-section on the page (0 = newest/today); it is
    # kept as a separate column so sorting can use it before the time string.
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time  = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"  # NOTE(review): rebinds `news` mid-iteration — works, but fragile
            link  = div.find('a')['href']
            results.append( [number, time, place, title, news, link] )

    return results

def all_titles():
    """Scrape all teams, build the sorted DataFrame and fill the listbox."""
    global df  # content() looks rows up in this DataFrame by listbox index

    allnews = []  # local variable

    for place in ['atalanta',  'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")

    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    # Today's section (lowest number) first, then latest time, then place/title.
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    # drop=True discards the pre-sort index instead of keeping it as a
    # spurious 'index' column (plain reset_index() left one behind).
    df = df.reset_index(drop=True)

    listbox_title.delete('0', 'end')

    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])

#Download Content of News
def content():
    """Download the selected article and show it in the text widget."""
    # tuple
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:

        # df rows are positionally aligned with the listbox entries
        # (same sort order), so iloc with the listbox index is safe.
        item = df.iloc[selection[-1]]
        #print('item:', item)

        url = item['link']
        #print('url:', url)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # CSS selector with dots matches <div class="text mbottom">
        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))

        text_download.delete('1.0', 'end') # remove previous content)
        text_download.insert('end', content_download)

# --- main ---

df = None  # filled by all_titles(); content() requires it to be set first

window = tk.Tk()
window.geometry("800x800")

# Headline list on top.
listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

# Article text below.
text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()

编辑:

最新版本

完整的工作代码 (4)

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText  # https://docs.python.org/3/library/tkinter.scrolledtext.html
import requests
import requests_cache  # https://github.com/reclosedev/requests-cache
from bs4 import BeautifulSoup
import pandas as pd

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    """Scrape one team page; return [number, time, place, title, news, link] rows."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    results = []

    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])

    soup = BeautifulSoup(response.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    # `number` is the index of the day-section (0 = today) — kept as its own
    # column so sorting can order by day before ordering by time string.
    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time  = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"  # NOTE(review): rebinds `news` mid-iteration — works, but fragile
            link  = div.find('a')['href']
            results.append( [number, time, place, title, news, link] )

    return results

def all_titles():
    """Scrape all teams, rebuild the sorted DataFrame and refill the listbox."""
    global df  # content() reads rows from this DataFrame by listbox index

    allnews = []  # local variable

    for place in ['atalanta',  'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")

    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    # Today's section (lowest number) first, then latest time, then place/title.
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    # drop=True discards the pre-sort index instead of keeping it as a
    # spurious 'index' column (plain reset_index() left one behind).
    df = df.reset_index(drop=True)

    listbox_title.delete('0', 'end')

    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])

def content(event=None):   # `command=` executes without `event`, but `bind` executes with `event` - so it needs default value
    """Download (with on-disk cache) and display the selected article."""
    # tuple
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:

        # df rows are positionally aligned with the listbox entries.
        item = df.iloc[selection[-1]]
        #print('item:', item)

        url = item['link']
        #print('url:', url)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        # keep page in database `SQLite` 
        # https://github.com/reclosedev/requests-cache
        # https://sqlite.org/index.html
        # NOTE(review): a new CachedSession is built on every click; the cache
        # file is shared, but hoisting the session to module level would be cheaper.
        session = requests_cache.CachedSession('titles')
        response = session.get(url, headers=headers)
        #response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))

        text_download.delete('1.0', 'end') # remove previous content)
        text_download.insert('end', content_download)

# --- main ---

df = None  # set by all_titles(); content() needs it filled first

window = tk.Tk()
window.geometry("800x800")

# ---
# [Tkinter: How to display Listbox with Scrollbar — furas.pl](https://blog.furas.pl/python-tkitner-how-to-display-listbox-with-scrollbar-gb.html)

frame_title = tk.Frame(window)
frame_title.pack(fill='both', expand=True, pady=5, padx=5)

listbox_title = tk.Listbox(frame_title, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(side='left', fill='both', expand=True)

scrollbar_title = tk.Scrollbar(frame_title)
scrollbar_title.pack(side='left', fill='y')

# Two-way wiring: the scrollbar drives the listbox view, and the listbox
# updates the scrollbar slider position.
scrollbar_title['command'] = listbox_title.yview
listbox_title.config(yscrollcommand=scrollbar_title.set)

listbox_title.bind('<Double-Button-1>', content)  # it executes `content(event)`

# ----

# ScrolledText = Text widget with a built-in vertical scrollbar.
text_download = ScrolledText(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

# ----

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()