Selenium webdriver 错误和崩溃

Question

我正在构建一个 webscraper 来获取一堆棒球数据，我 99% 确定我编写的代码有效，我已经单独测试了它，它应该能获得我想要的数据。但是，我还没有能够运行它一直通过但没有给我这样的 webdriver 错误：

WebDriverException                        Traceback (most recent call last)
c:\Users\jense\VSCODE\BR-selenium-scrape.py in find_plyr_links_pit(self)
     104             try:
---> 105                 WebDriverWait(self.driver, 5).until(
     106                     EC.presence_of_element_located((By.TAG_NAME, "a")))

它并不总是在同一点停止，有时它会一路抓取玩家数据然后停止，有时它不会通过设置联赛链接。这是我的代码，也许它有问题或效率极低，我是 selenium 的新手。

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
    
class scrape_br():

    def __init__(self):
        ser = Service("/path/to/my/Chromedriver.exe")
        op = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(service=ser, options=op)

    def get_league_hist(self, lg_href):
        self.lg_home = self.driver.get(lg_href)
        table = self.driver.find_element(By.ID, "div_lg_history")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
        tags = []
        for row in tr_list:
            tags.append(row.find_element(By.TAG_NAME, "a"))
        yr_list = []
        for link in tags:
            yr_list.append(link.get_attribute("href"))
        yr_list = [str(i) for i in yr_list]
        tm_list = []
        for yr in yr_list:
            self.driver.get(yr)
            tm_list.append(self.find_tm_links())
        for i,yr in enumerate(tm_list):
            for j, team in enumerate(yr):
                tm_list[i][j] = team.split(',')
        plyr_list = []
        for tm in tm_list:
            for player in tm:
                for player_link in player:
                    self.driver.get(player_link)
                    plyr_list.append(self.find_plyr_links_bat())
                    plyr_list.append(self.find_plyr_links_pit())
        plyr_data = []
        for team in plyr_list:
            for player in team:
                self.driver.get(player)
                try:
                    plyr_data.append(self.find_bat_tables())
                except:
                    plyr_data.append(self.find_pitch_tables())
        return plyr_data

    def find_tm_links(self):
        for i in range(1):
            try:
                table = self.driver.find_element(By.ID, "div_standings_pitching")
            except:
                table = self.driver.find_element(By.ID, "regular_season")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
        tags = []
        for row in tr_list:
            try:
                tags.append(row.find_elements(By.TAG_NAME, "a"))
            except:
                print("find_tm_links error, could not get a tags from the tr's")
        tm_list = []
        for link in tags:
            for i in link:
                try:
                    tm_list.append(i.get_attribute("href"))
                except:
                    print("error getting a-ref attribute from find_tm_links")
        return tm_list

    def find_plyr_links_bat(self):
        table = self.driver.find_element(By.ID, "team_batting")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'tr')
        a_tags = []
        for row in tr_list:
            try:
             a_tags.append(row.find_elements(By.TAG_NAME, "a"))
            except:
                print("Could not get player links in find_plyr_links_bat")
        player_hrefs = []
        for nested_tag in a_tags:
            for tag in nested_tag:
                try:
                    player_hrefs.append(tag.get_attribute("href"))
                except:
                    print("could not successfully implement find_plyr_links_bat")
        return player_hrefs

    def find_plyr_links_pit(self):
        table = self.driver.find_element(By.ID, "team_pitching")
        tbody = table.find_element(By.TAG_NAME, "tbody")
        tr_list = tbody.find_elements(By.TAG_NAME, 'td')
        tags = []
        for row in tr_list:
            try: 
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.TAG_NAME, "a")))
            except:
                tags.append(row.find_element(By.TAG_NAME, "a"))
        tm_list = []
        for link in tags:
                tm_list.append(link.get_attribute("href"))
        return tm_list

    def find_bat_tables(self):
        x = self.bat_txt = self.driver.find_element(By.ID, "div_standard_batting")
        y = x.get_attribute("innerHTML")
        z = pd.read_html(y)
        return z[0]

    def find_pitch_tables(self):
        x = self.bat_txt = self.driver.find_element(By.ID, "div_standard_pitching")
        y = x.get_attribute("innerHTML")
        z = pd.read_html(y)
        return z[0]

#%% test 
lg = scrape_br()
nwds_hist = lg.get_league_hist("https://www.baseball-reference.com/register/league.cgi?code=NWDS&class=Smr")

Answer 1

import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import re

url = 'https://www.baseball-reference.com/register/league.cgi?code=NWDS&class=Smr'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}

# Get links
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
yearLinks = soup.find_all('th', {'data-stat':'year_ID'})

links = {}
for year in yearLinks:
    if year.find('a', href=True):
        links[year.text] = 'https://www.baseball-reference.com' + year.find('a', href=True)['href']

final_df = {'batting':[], 'pitching':[]}
for year, link in links.items():
    print(year)
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    if soup.find_all('th', {'data-stat':'team_ID'}):
        team_links = soup.find_all('th', {'data-stat':'team_ID'})
        
    else:
        team_links = []
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for each in comments:
            if 'th' in str(each):
                try:
                    soupAlpha = BeautifulSoup(str(each), 'html.parser').find_all('th', {'data-stat':'team_ID'})
                    if soupAlpha != []:
                        team_links += soupAlpha
                except:
                    continue
                    
    teamLinks = {}
    for team_link in team_links:
        if team_link.find('a', href=True):
            teamLinks[team_link.text] = 'https://www.baseball-reference.com' + team_link.find('a', href=True)['href']
            
    for team, teamLink in teamLinks.items():
        print(f'\t{team}')
        response = requests.get(teamLink, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        batting_table = pd.read_html(response.text, attrs = {'id': 'team_batting'})[0]
        batting_table['Year'] = year
        batting_table['Team'] = team
        
        print(f'\t\t{team} - batting stats')
        
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for each in comments:
            if 'table' in str(each):
                try:
                    pitching_table = pd.read_html(str(each), attrs = {'id': 'team_pitching'})[0]
                    batting_table['Year'] = year
                    batting_table['Team'] = team
                    
                    print(f'\t\t{team} - pitching stats')
                    break
                except:
                    continue
                
        final_df['batting'].append(batting_table)
        final_df['pitching'].append(pitching_table)
            
batting = pd.concat(final_df['batting'], axis=0)     
pitching = pd.concat(final_df['pitching'], axis=0)

输出：

print(batting)
      Rk               Name   Age   G  ...  IBB  Notes  Year                 Team
0    1.0       Josh Buckley  20.0  35  ...  NaN    NaN  2021   Kokomo Jackrabbits
1    2.0       Justus Burke  22.0  15  ...  NaN    NaN  2021   Kokomo Jackrabbits
2    3.0      Adam Crampton  20.0  33  ...  NaN    NaN  2021   Kokomo Jackrabbits
3    4.0   Dylan Delvecchio  20.0   8  ...  NaN    NaN  2021   Kokomo Jackrabbits
4    5.0       Dylan Dennis  21.0  60  ...  NaN    NaN  2021   Kokomo Jackrabbits
..   ...                ...   ...  ..  ...  ...    ...   ...                  ...
19  20.0    Johnathon Tripp    19  24  ...  0.0    NaN  2013  Green Bay Bullfrogs
20  21.0         Logan West    --  20  ...  0.0    NaN  2013  Green Bay Bullfrogs
21  22.0       Boomer White    19  45  ...  0.0    NaN  2013  Green Bay Bullfrogs
22  23.0  Robert Youngdahl*    20  65  ...  0.0    NaN  2013  Green Bay Bullfrogs
23   NaN         23 Players  19.9  70  ...  1.0    NaN  2013  Green Bay Bullfrogs

[4674 rows x 29 columns]

         
print(pitching)
      Rk               Name   Age   W   L  ...  HR9   BB9   SO9  SO/W  Notes
0    1.0        Parker Bard  19.0   1   1  ...  0.0  10.7   7.9  0.74    NaN
1    2.0    Andrew Beauvais  21.0   1   0  ...  0.0   4.9   5.3  1.08    NaN
2    3.0          Ryan Beck  22.0   2   0  ...  0.0   2.9   6.5  2.20    NaN
3    4.0      Brock Begesha  19.0   2   2  ...  1.3  13.2   7.5  0.57    NaN
4    5.0       Garrett Bell  20.0   0   0  ...  0.0   6.0  15.0  2.50    NaN
..   ...                ...   ...  ..  ..  ...  ...   ...   ...   ...    ...
20  21.0         Logan West    --   3   2  ...  1.1   3.4   4.2  1.23    NaN
21  22.0      Jordan Wright    22   0   1  ...  1.4  10.8   6.8  0.63    NaN
22  23.0  Robert Youngdahl*    20   2   1  ...  0.3   3.8   7.9  2.08    NaN
23  24.0      Marshall Zahn    --   1   0  ...  0.8   5.3   7.5  1.43    NaN
24   NaN         24 Players  19.6  28  42  ...  0.7   3.8   6.8  1.77    NaN

[5225 rows x 32 columns]

Selenium webdriver 错误和崩溃

Selenium webdriver errors and crashing

python

selenium

web-scraping

pandas