如何使用 Selenium 抓取的数据更新 Google 表格

How to update Google Sheets with scraped selenium data

我正在编写 Python 脚本,从 ESPN 获取一些数据并更新 Google 表格。具体来说,我要抓取本周 4 场 NFL 季后赛的让分(spread)和大小分(over/under)。

我能够成功地抓取我需要的数据并将其打印到控制台。但是,尽管我能够使用虚拟文本“Testing”成功更新 Google Sheets 单元格,但当我尝试使用 Web 元素中的文本更新单元格时,它会引发错误。

import pygsheets
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.keys import Keys

# Scrape the ESPN NFL scoreboard page with Selenium and write one scraped
# value into cell A2 of the first worksheet of the 'ESPN Test' spreadsheet.

#URI Data
url = 'https://www.espn.com/nfl/scoreboard/_/year/2020/seasontype/3/week/2'

# NOTE: devChromePath / prodChromePath are defined but never referenced below;
# the driver path passed to webdriver.Chrome is a hard-coded literal.
devChromePath = '/Users/ryanbuckner/PycharmProjects/chromedriver'
prodChromePath = '/Library/Application Support/Perceptive Automation/Indigo 7.4/Scripts/Attachments/chromedriver'
driver = webdriver.Chrome(executable_path='/Users/ryanbuckner/PycharmProjects/chromedriver')

driver.get(url);
# Fixed 2-second pause after navigation -- presumably to let the page finish
# rendering before elements are looked up (TODO confirm this is long enough).
time.sleep(2);

# find the spread for the game
# Each lookup targets a node by its id attribute (these look like ESPN game
# ids -- TODO confirm). gbData/bufData/kcData/noData are only referenced from
# commented-out lines further down.
gbData = driver.find_element_by_xpath('//*[@id="401220398"]/div/div/div[1]/section/div[2]')
bufData = driver.find_element_by_xpath('//*[@id="401220397"]/div/div/div[1]/section/div[2]')
kcData = driver.find_element_by_xpath('//*[@id="401220400"]/div/div/div[1]/section/div[3]')
noData = driver.find_element_by_xpath('//*[@id="401220399"]/div/div/div[1]/section/div[2]')

# All <div> descendants of the same node that gbData points at.
greenBay = driver.find_element_by_xpath('//*[@id="401220398"]/div/div/div[1]/section/div[2]').find_elements_by_tag_name('div')

# NOTE(review): the browser window is closed here, but greenBay[0].text is
# still read further down. The author's follow-up in this thread identifies
# this ordering as the cause of the error raised by update_value; the close
# call needs to come after the last use of any web element.
driver.close()  # closes the current browser window

#google sheets authorization
gc = pygsheets.authorize(service_file='/Users/ryanbuckner/PycharmProjects/NFLPredictions/creds.json')

# Create empty dataframe (only referenced by the commented-out lines below)
df = pd.DataFrame()

# Create a column
# df['lines'] = [gbData.text]

# open the Google spreadsheet named 'ESPN Test'
sh = gc.open('ESPN Test')

# select the first sheet
wks = sh[0]

# update the first sheet with df
# NOTE(review): the original comment said "starting at cell B2", but the
# start argument is (1,1) -- confirm which cell is intended.
# wks.set_dataframe(df,(1,1))

# Reads the element text after the driver window was closed (see note above).
print(greenBay[0].text)

# Update a single cell.
wks.update_value('A2',greenBay[0].text)
#wks.update_value('B2', bufData.text)
#wks.update_value('C2', kcData.text)
#wks.update_value('D2', noData.text)

执行 wks.update_value('A2',greenBay[0].text) 试图更新单元格时抛出错误。

我找到问题了。关闭 webdriver 的那一行需要放在使用完该元素之后。我之前不知道驱动程序关闭后,元素也会随之失效。

嘿,我提个建议供参考。Selenium 很好用,但也有一些缺点:

  1. 可能会很慢,因为它需要打开浏览器,转到页面,让它呈现,获取元素,解析它,...,关闭
  2. 如果以后网站有任何变化,脚本就会崩溃

如果可以的话,更好的选择是通过 API 获取数据(ESPN 确实提供了 API)。优点:

  1. 快多了
  2. 数据将以 JSON 格式保持一致(即 site/html 的结构可能会改变,但数据结构不会,或者我应该说很少会改变)
  3. 通常可以提取更多数据(那里有大量元数据,如天气、上座人数、场地、转播信息等)。

所以看看这段代码,如果您有任何问题,请告诉我:

import pygsheets
import pandas as pd
import requests


# Fetch NFL odds from ESPN's scoreboard JSON endpoint, index them by the home
# team's abbreviation, and write two of the lines into the first worksheet of
# the 'ESPN Test' Google spreadsheet.

#URI Data
url = 'http://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
payload = {
'lang': 'en',
'region': 'us',
'calendartype': 'blacklist',
'limit': '100',
'showAirings': 'true',
'dates': '2020',
'seasontype': '3',
'week': '3'}

# Without a timeout requests can block forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON/KeyError failure.
response = requests.get(url, headers=headers, params=payload, timeout=10)
response.raise_for_status()
data = response.json()

# find the spread for the games
# Result shape: {home_abbreviation: {'Line': <str>, 'O/U': <value from API>}}
odds_dict = {}
games = data['events']
for game in games:
    competition = game['competitions'][0]

    # Reset per game so a home team found in an earlier game can never be
    # reused as the key for a game that has no 'home' competitor.
    homeTeam = None
    for team in competition['competitors']:
        if team['homeAway'] == 'home':
            homeTeam = team['team']['abbreviation']

    # A competition without an 'odds' entry would otherwise raise KeyError
    # and abort the whole loop; skip it and keep the games that do have odds.
    posted_odds = competition.get('odds')
    if homeTeam is None or not posted_odds:
        continue

    # 'details' is split on spaces and the last token is kept as the line.
    odds = posted_odds[0]['details'].split(' ')[-1].strip()
    ou = posted_odds[0]['overUnder']
    odds_dict[homeTeam] = {'Line': odds, 'O/U': ou}



#google sheets authorization
gc = pygsheets.authorize(service_file='/Users/ryanbuckner/PycharmProjects/NFLPredictions/creds.json')

# open the Google spreadsheet named 'ESPN Test'
sh = gc.open('ESPN Test')

# select the first sheet
wks = sh[0]

print(odds_dict['GB'])

# Update a single cell.
# These lookups raise KeyError if 'GB' or 'KC' was not a home team with odds
# in the fetched week.
wks.update_value('A2', odds_dict['GB']['Line'])
wks.update_value('B2', odds_dict['KC']['Line'])

输出:

 print(odds_dict)
{'GB': {'Line': '-3.5', 'O/U': 51.0}, 'KC': {'Line': '-3.0', 'O/U': 54.0}}

更新:

好了,找到了提供赔率的 API。同样,只需要更改 'week' 参数即可。

import pygsheets
import pandas as pd
import requests


# Fetch NFL odds from ESPN's scoreboard "header" JSON endpoint, index them by
# the home team's abbreviation, and write two of the lines into the first
# worksheet of the 'ESPN Test' Google spreadsheet.
url = 'https://site.web.api.espn.com/apis/v2/scoreboard/header'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
payload = {
'sport': 'football',
'league': 'nfl',
'region': 'us',
'lang': 'en',
'contentorigin': 'espn',
'buyWindow': '1m',
'showAirings': 'buy,live,replay',
'showZipLookup': 'true',
'tz': 'America/New_York',
'seasontype': '3',
'weeks': '3',
'dates': '2020'}

# Without a timeout requests can block forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON/KeyError failure.
response = requests.get(url, headers=headers, params=payload, timeout=10)
response.raise_for_status()
data = response.json()

# find the spread for the games
# Result shape: {home_abbreviation: {'Line': <spread>, 'O/U': <overUnder>}}
odds_dict = {}
games = data['sports'][0]['leagues'][0]['events']
for game in games:
    # Reset per game so a home team found in an earlier game can never be
    # reused as the key for a game that has no 'home' competitor.
    homeTeam = None
    for team in game['competitors']:
        if team['homeAway'] == 'home':
            homeTeam = team['abbreviation']

    # An event without an 'odds' entry would otherwise raise KeyError and
    # abort the whole loop; skip it and keep the events that do have odds.
    game_odds = game.get('odds')
    if homeTeam is None or not game_odds:
        continue

    odds = game_odds['spread']
    ou = game_odds['overUnder']
    odds_dict[homeTeam] = {'Line': odds, 'O/U': ou}

print(odds_dict)

#google sheets authorization
gc = pygsheets.authorize(service_file='/Users/ryanbuckner/PycharmProjects/NFLPredictions/creds.json')

# open the Google spreadsheet named 'ESPN Test'
sh = gc.open('ESPN Test')

# select the first sheet
wks = sh[0]

print(odds_dict['GB'])

# Update a single cell.
# These lookups raise KeyError if 'GB' or 'KC' was not a home team with odds
# in the fetched week.
wks.update_value('A2', odds_dict['GB']['Line'])
wks.update_value('B2', odds_dict['KC']['Line'])