从自定义 <h2> 中提取名称，但名称被重复提取了很多次（BeautifulSoup）

Question

我正在尝试从自定义 <h2> 中提取名称，但我想要的名称被重复提取了很多次。如何解决这个问题，让每个名称只被提取一次？我要从中提取数据的页面见此处（here）。

import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
# Scrape paginated lawyer listings, follow each profile link for contact
# details, and export everything to a CSV file.
#
# Accumulators shared across ALL pages; each list later becomes one CSV column.
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
    try:
        # The listing page is fetched before the page-limit check below runs.
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        # 126 // 25 == 5, so the loop terminates once page_num exceeds 5.
        # NOTE(review): presumably 126 is a total result count and 25 a page
        # size -- both are hard-coded here; confirm against the site.
        page_limit = int("126")
        if(page_num > page_limit // 25):
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        # Collect the heading text and the profile URL of every match on this page.
        for i in range(len(lawy_names)) :
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        # NOTE: `links` is never cleared between pages, so this loop walks every
        # link collected so far (including those from earlier pages) on each
        # iteration of the outer loop, re-fetching them and appending their
        # phone/logo/website values again.
        for link in links:
            # `result`, `src` and `soup` are rebound to the profile page here.
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            # Each find() result is used without a None check; any failure
            # raises and is swallowed by the bare `except` below, which ends
            # the whole scrape.
            phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class":"photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
            website.append(websites.text.strip())

        page_num +=1
        print("page switched")
    except:
        # Bare except: any exception at all prints "error" and stops the loop
        # without reporting what went wrong.
        print("error")
        break
    
# Columns are combined row-wise; zip_longest pads shorter columns with None,
# so rows stay aligned only if every list grew in lockstep.
file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name","phone","website","logo"])
    wr.writerows(exported)

问题：

Answer 1

该网站确实会产生很多重复条目。您大概可以假设所有条目的名称都是唯一的，因此可以用一个字典来保存所有数据：只需跳过名称已经出现过的条目即可。例如：

from bs4 import BeautifulSoup
import requests
import csv

# Scrape the paginated lawyer listings, fetch each lawyer's profile page once
# (de-duplicated by name), and write one CSV row per unique lawyer.

# Maps lawyer name -> [phone, logo, website]. It doubles as the "already seen"
# set, so a name listed several times is only fetched once.
lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False

    # Named `section_id` rather than `id` so the builtin is not shadowed.
    for section_id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=section_id)

        if not div_results:
            continue

        for result in div_results.find_all('div', class_='lawyer'):
            heading = result.h2
            anchor = heading.a if heading else None
            link = anchor.get('href') if anchor else None

            # Skip malformed entries (no <h2>, no <a>, or no href) instead of
            # crashing the whole scrape on them.
            if not link:
                continue

            name = heading.get_text(strip=True)

            if name in lawyers:
                continue

            print(' ', name)
            req_details = requests.get(link)
            soup_details = BeautifulSoup(req_details.content, "lxml")

            a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
            phone = a_phone['href'] if a_phone else None

            # find() returns None when the container is missing, so guard the
            # div itself before reaching for its <img>.
            div_logo = soup_details.find("div", {"class":"photo-container"})
            img_logo = div_logo.img if div_logo else None
            logo = img_logo.get('src') if img_logo else None

            a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
            website = a_website.get_text(strip=True) if a_website else None

            lawyers[name] = [phone, logo, website]
            found = True

    # Keep going until a page yields no new names.
    if found:
        page_num += 1
    else:
        break

# newline='' lets the csv module control line endings itself.
with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])

    for name, details in lawyers.items():
        csv_output.writerow([name, *details])

从自定义 <h2> 中提取名称，但名称被重复提取了很多次（BeautifulSoup）

extract names in custom <h2> but It is extracted many times beautifulsoup

python

beautifulsoup

python-requests