Python 网络抓取 ESPN 团队花名册
Python Web Scraping ESPN team Rosters
我有这段代码可以通过粘贴 'any' ESPN 名册页面的 URL 来抓取球员信息(姓名、位置、号码)。我说 'any' 是因为任何包含至少一名没有 number/jersey 值的玩家的页面都会出错。有没有办法修复这样的错误。
作为每个示例,费城老鹰队页面正确转换 (https://www.espn.com/nfl/team/roster/_/name/phi)
但是底特律雄狮队的名单没有 (https://www.espn.com/nfl/team/roster/_/name/det)
# -*- coding: utf-8 -*-
import os, json, re
import requests
team = ''
def SavePlayerData(DATA):
global team
for s in ['\','/',':','*','?','"','<','>','|']:
team = team.replace(s,'')
outfilename = '%s.txt'%(team)
with open(outfilename, 'w') as out_file:
for line in DATA:
out_file.write(line)
def GetTeamData(link):
global opener, headers, team, short
response = opener.get(link,headers=headers).text.encode('utf-8')
content = re.search("window\['__espnfitt__'\]\=(.+?)\;</script>",response).group(1)
jsonobj = json.loads(content)
roster = jsonobj['page']['content']['roster']
team = roster['team']['displayName']
coach = roster['coach']['description']
TEAM = []
for group in roster['groups']:
for player in group['athletes']:
n=player['name']
p=player['position']
j=player['jersey']
DATA = '%s%s\t%s %s %s (%s)\t[%s]\n'%(short,j,team,p,n,j,n)
TEAM.append(DATA)
DATA = '%shc\t%s %s %s\t[%s]\n'%(short,team,'head coach',coach,coach)
TEAM.append(DATA)
SavePlayerData(TEAM)
opener = requests.Session()
headers = {'host': 'www.espn.com',
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
if __name__=="__main__":
teamURL = raw_input(' >> Enter the Team Roster URL :: ').strip()
short = raw_input(' >> Enter the Letter for this Team :: ').strip().lower()
if not short:
short='d'
try:
if not teamURL:
raise Exception
if not '/roster/' in teamURL:
teamURL = teamURL.replace('/team/_/','/team/roster/_/')
print ('\n >> Collecting Data from <%s>\n'%(teamURL))
GetTeamData(teamURL)
print (' >> Link Scraped & Data Saved to File')
except Exception as e:
print ('\n >> Failed to Get Required Data, Re-Check your Input URL.')
您可以使用 try/except,或者只是放入一个条件语句来检查球衣是否在数据中:
import os, json, re
import requests
team = ''
def SavePlayerData(DATA):
global team
for s in ['\','/',':','*','?','"','<','>','|']:
team = team.replace(s,'')
outfilename = '%s.txt'%(team)
with open(outfilename, 'w') as out_file:
for line in DATA:
out_file.write(line)
def GetTeamData(link):
global opener, headers, team, short
response = opener.get(link,headers=headers).text
content = re.search("window\['__espnfitt__'\]\=(.+?)\;</script>",response).group(1)
jsonobj = json.loads(content)
roster = jsonobj['page']['content']['roster']
team = roster['team']['displayName']
coach = roster['coach']['description']
TEAM = []
for group in roster['groups']:
for player in group['athletes']:
n=player['name']
p=player['position']
if 'jersey' in player:
j=player['jersey']
else:
j = ''
DATA = '%s%s\t%s %s %s (%s)\t[%s]\n'%(short,j,team,p,n,j,n)
TEAM.append(DATA)
DATA = '%shc\t%s %s %s\t[%s]\n'%(short,team,'head coach',coach,coach)
TEAM.append(DATA)
SavePlayerData(TEAM)
opener = requests.Session()
headers = {'host': 'www.espn.com',
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
if __name__=="__main__":
teamURL = input(' >> Enter the Team Roster URL :: ').strip()
short = input(' >> Enter the Letter for this Team :: ').strip().lower()
if not short:
short='d'
try:
if not teamURL:
raise Exception
if not '/roster/' in teamURL:
teamURL = teamURL.replace('/team/_/','/team/roster/_/')
print ('\n >> Collecting Data from <%s>\n'%(teamURL))
GetTeamData(teamURL)
print (' >> Link Scraped & Data Saved to File')
except Exception as e:
print ('\n >> Failed to Get Required Data, Re-Check your Input URL.')
我有这段代码可以通过粘贴 'any' ESPN 名册页面的 URL 来抓取球员信息(姓名、位置、号码)。我说 'any' 是因为任何包含至少一名没有 number/jersey 值的玩家的页面都会出错。有没有办法修复这样的错误。
作为每个示例,费城老鹰队页面正确转换 (https://www.espn.com/nfl/team/roster/_/name/phi) 但是底特律雄狮队的名单没有 (https://www.espn.com/nfl/team/roster/_/name/det)
# -*- coding: utf-8 -*-
import os, json, re
import requests
team = ''
def SavePlayerData(DATA):
global team
for s in ['\','/',':','*','?','"','<','>','|']:
team = team.replace(s,'')
outfilename = '%s.txt'%(team)
with open(outfilename, 'w') as out_file:
for line in DATA:
out_file.write(line)
def GetTeamData(link):
global opener, headers, team, short
response = opener.get(link,headers=headers).text.encode('utf-8')
content = re.search("window\['__espnfitt__'\]\=(.+?)\;</script>",response).group(1)
jsonobj = json.loads(content)
roster = jsonobj['page']['content']['roster']
team = roster['team']['displayName']
coach = roster['coach']['description']
TEAM = []
for group in roster['groups']:
for player in group['athletes']:
n=player['name']
p=player['position']
j=player['jersey']
DATA = '%s%s\t%s %s %s (%s)\t[%s]\n'%(short,j,team,p,n,j,n)
TEAM.append(DATA)
DATA = '%shc\t%s %s %s\t[%s]\n'%(short,team,'head coach',coach,coach)
TEAM.append(DATA)
SavePlayerData(TEAM)
opener = requests.Session()
headers = {'host': 'www.espn.com',
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
if __name__=="__main__":
teamURL = raw_input(' >> Enter the Team Roster URL :: ').strip()
short = raw_input(' >> Enter the Letter for this Team :: ').strip().lower()
if not short:
short='d'
try:
if not teamURL:
raise Exception
if not '/roster/' in teamURL:
teamURL = teamURL.replace('/team/_/','/team/roster/_/')
print ('\n >> Collecting Data from <%s>\n'%(teamURL))
GetTeamData(teamURL)
print (' >> Link Scraped & Data Saved to File')
except Exception as e:
print ('\n >> Failed to Get Required Data, Re-Check your Input URL.')
您可以使用 try/except,或者只是放入一个条件语句来检查球衣是否在数据中:
import os, json, re
import requests
team = ''
def SavePlayerData(DATA):
global team
for s in ['\','/',':','*','?','"','<','>','|']:
team = team.replace(s,'')
outfilename = '%s.txt'%(team)
with open(outfilename, 'w') as out_file:
for line in DATA:
out_file.write(line)
def GetTeamData(link):
global opener, headers, team, short
response = opener.get(link,headers=headers).text
content = re.search("window\['__espnfitt__'\]\=(.+?)\;</script>",response).group(1)
jsonobj = json.loads(content)
roster = jsonobj['page']['content']['roster']
team = roster['team']['displayName']
coach = roster['coach']['description']
TEAM = []
for group in roster['groups']:
for player in group['athletes']:
n=player['name']
p=player['position']
if 'jersey' in player:
j=player['jersey']
else:
j = ''
DATA = '%s%s\t%s %s %s (%s)\t[%s]\n'%(short,j,team,p,n,j,n)
TEAM.append(DATA)
DATA = '%shc\t%s %s %s\t[%s]\n'%(short,team,'head coach',coach,coach)
TEAM.append(DATA)
SavePlayerData(TEAM)
opener = requests.Session()
headers = {'host': 'www.espn.com',
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
if __name__=="__main__":
teamURL = input(' >> Enter the Team Roster URL :: ').strip()
short = input(' >> Enter the Letter for this Team :: ').strip().lower()
if not short:
short='d'
try:
if not teamURL:
raise Exception
if not '/roster/' in teamURL:
teamURL = teamURL.replace('/team/_/','/team/roster/_/')
print ('\n >> Collecting Data from <%s>\n'%(teamURL))
GetTeamData(teamURL)
print (' >> Link Scraped & Data Saved to File')
except Exception as e:
print ('\n >> Failed to Get Required Data, Re-Check your Input URL.')