Python: 解析网页中的多个表格并将数据分组为CSV
Python: Parse multiple tables from webpage and group data in CSV
我是 Python 的新手,我遇到了一个我认为非常复杂的问题。我想从一个网站上解析两个 table 以获取大约 80 个 URL,其中一个页面的示例:https://www.sports-reference.com/cfb/players/sam-darnold-1.html
我需要 80 个 URL 中的第一个 table "Passing" 和第二个 table "Rushing and Receiving"(我知道如何获取第一个和第二个table)。但问题是我需要它来处理一个 csv 中的所有 80 个 URL。
这是我到目前为止的代码以及数据的样子:
import requests
import pandas as pd
from bs4 import BeautifulSoup  # used for the HTML parsing below

# Labels applied to the <td> cells kept from every row of the first table.
COLUMNS = ['School', 'Conf', 'Class', 'Pos', 'G', 'Cmp', 'Att', 'Pct', 'Yds','Y/A', 'AY/A', 'TD', 'Int', 'Rate']
urls = ['https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
        'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
        'https://www.sports-reference.com/cfb/players/peyton-manning-1.html']

# Scrape the first <table> of each page into one DataFrame per URL.
dataframes = []
for url in urls:
    # Failures are handled per URL and reported, so one bad page neither
    # stops the remaining downloads nor disappears without a trace.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # treat HTTP error pages as failures
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find_all('table')[0]  # first "table" tag in the page
        cy_data = [
            # Text of the leading "td" cells, one inner list per "tr" row.
            [cell.text for cell in row.find_all("td")[0:len(COLUMNS)]]
            for row in table.find_all("tr")
        ]
        # Drop the row labelled 0 (presumably a header row that has no
        # <td> cells -- verify against the page markup).
        frame = pd.DataFrame(cy_data, columns=COLUMNS).drop(0, axis=0)
    except (requests.RequestException, IndexError, KeyError, ValueError) as err:
        print("Skipping", url, "-", repr(err))
        continue
    dataframes.append(frame)

# pd.concat() raises on an empty list, so only write the CSV when at least
# one page was scraped successfully.
if dataframes:
    data = pd.concat(dataframes)
    data.to_csv('testcsv3.csv', sep=',')
else:
    print("No tables were scraped; testcsv3.csv was not written")
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
| | | School | Conf | Class | Pos | G | Cmp | Att | Pct | Yds | Y/A | AY/A | TD | Int | Rate |
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
| 1 | | | | | | | | | | | | | | | |
| 2 | | North Carolina State | ACC | FR | QB | 11 | 150 | 275 | 54.5 | 1955 | 7.1 | 8.2 | 17 | 1 | 133.9 |
| 3 | | North Carolina State | ACC | SO | QB | 12 | 224 | 378 | 59.3 | 3027 | 8 | 8.3 | 31 | 11 | 147.8 |
| 4 | | North Carolina State | ACC | JR | QB | 13 | 308 | 527 | 58.4 | 3563 | 6.8 | 6.6 | 28 | 14 | 127.5 |
| 5 | | Wisconsin | Big Ten | SR | QB | 14 | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 |
| 6 | | Overall | | | | | 907 | 1489 | 60.9 | 11720 | 7.9 | 8.4 | 109 | 30 | 147.2 |
| 7 | | North Carolina State | | | | | 682 | 1180 | 57.8 | 8545 | 7.2 | 7.5 | 76 | 26 | 135.5 |
| 8 | | Wisconsin | | | | | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 |
| 1 | | | | | | | | | | | | | | | |
| 2 | | Florida | SEC | FR | QB | 5 | 5 | 10 | 50 | 40 | 4 | 4 | 0 | 0 | 83.6 |
| 3 | | Florida | SEC | SO | QB | 1 | 1 | 2 | 50 | 14 | 7 | 7 | 0 | 0 | 108.8 |
| 4 | | Auburn | SEC | JR | QB | 14 | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 |
| 5 | | Overall | | | | | 191 | 292 | 65.4 | 2908 | 10 | 10.9 | 30 | 7 | 178.2 |
| 6 | | Florida | | | | | 6 | 12 | 50 | 54 | 4.5 | 4.5 | 0 | 0 | 87.8 |
| 7 | | Auburn | | | | | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 |
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
And this is how I'd like the data to look. Note that the player name is missing from each grouping in my current output; ideally it can be added from the sample website/URL. I've also added the second table, which I need help appending:
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
| | | School | Conf | Class | Pos | G | Cmp | Att | Pct | Yds | Y/A | AY/A | TD | Int | Rate | School | Conf | Class | Pos | G | Att | Yds | Avg | TD |
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
| 1 | | | | | | | | | | | | | | | | | | | | | | | | |
| 2 | Russell Wilson | North Carolina State | ACC | FR | QB | 11 | 150 | 275 | 54.5 | 1955 | 7.1 | 8.2 | 17 | 1 | 133.9 | North Carolina State | ACC | FR | QB | 11 | 150 | 467 | 6.7 | 3 |
| 3 | Russell Wilson | North Carolina State | ACC | SO | QB | 12 | 224 | 378 | 59.3 | 3027 | 8 | 8.3 | 31 | 11 | 147.8 | North Carolina State | ACC | SO | QB | 12 | 129 | 300 | 6.8 | 2 |
| 4 | Russell Wilson | North Carolina State | ACC | JR | QB | 13 | 308 | 527 | 58.4 | 3563 | 6.8 | 6.6 | 28 | 14 | 127.5 | North Carolina State | ACC | JR | QB | 13 | 190 | 560 | 7.1 | 5 |
| 5 | Russell Wilson | Wisconsin | Big Ten | SR | QB | 14 | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 | Wisconsin | Big Ten | SR | QB | 14 | 210 | 671 | 7.3 | 7 |
| 6 | Russell Wilson | Overall | | | | | 907 | 1489 | 60.9 | 11720 | 7.9 | 8.4 | 109 | 30 | 147.2 | Overall | | | | | | | | |
| 7 | Russell Wilson | North Carolina State | | | | | 682 | 1180 | 57.8 | 8545 | 7.2 | 7.5 | 76 | 26 | 135.5 | North Carolina State | | | | | | | | |
| 8 | Russell Wilson | Wisconsin | | | | | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 | Wisconsin | | | | | | | | |
| 1 | | | | | | | | | | | | | | | | | | | | | | | | |
| 2 | Cam Newton | Florida | SEC | FR | QB | 5 | 5 | 10 | 50 | 40 | 4 | 4 | 0 | 0 | 83.6 | Florida | SEC | FR | QB | 5 | 210 | 456 | 7.1 | 2 |
| 3 | Cam Newton | Florida | SEC | SO | QB | 1 | 1 | 2 | 50 | 14 | 7 | 7 | 0 | 0 | 108.8 | Florida | SEC | SO | QB | 1 | 212 | 478 | 4.5 | 5 |
| 4 | Cam Newton | Auburn | SEC | JR | QB | 14 | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 | Auburn | SEC | JR | QB | 14 | 219 | 481 | 6.7 | 6 |
| 5 | Cam Newton | Overall | | | | | 191 | 292 | 65.4 | 2908 | 10 | 10.9 | 30 | 7 | 178.2 | Overall | | | | | | | 3.4 | 7 |
| 6 | Cam Newton | Florida | | | | | 6 | 12 | 50 | 54 | 4.5 | 4.5 | 0 | 0 | 87.8 | Florida | | | | | | | | |
| 7 | Cam Newton | Auburn | | | | | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 | Auburn | | | | | | | | |
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
So basically, I want to append the second table (only the columns mentioned) to the end of the first table and add the player name (read from the URL) to each row.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Labels for the <td> cells kept from the first table of each page.
COLUMNS = ['School', 'Conf', 'Class', 'Pos', 'G', 'Cmp', 'Att', 'Pct', 'Yds','Y/A', 'AY/A', 'TD', 'Int', 'Rate']
# Labels for the <td> cells kept from the second table of each page.
# NOTE(review): 'Yds', 'Avg' and 'TD' each appear three times here, so the
# merged frame carries duplicate column labels -- consider renaming them.
COLUMNS2 = ['School', 'Conf', 'Class', 'Pos', 'G', 'Att', 'Yds','Avg', 'TD', 'Rec', 'Yds', 'Avg', 'TD', 'Plays', 'Yds', 'Avg', 'TD']
urls = ['https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
        'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
        'https://www.sports-reference.com/cfb/players/peyton-manning-1.html']


def _player_name(url):
    """Return the first two '-'-separated tokens of the URL's file name, title-cased."""
    tokens = url.split('/')[5].split('-')
    return tokens[0].title() + ' ' + tokens[1].title()


def _table_frame(soup, position, columns, player):
    """Build a DataFrame from the <td> cells of the table at ``position`` in ``soup``.

    Each <tr> row contributes the text of its first ``len(columns)`` <td>
    cells, so the row width can never exceed the number of labels. A
    'Player' column holding ``player`` is inserted as the first column.
    """
    table = soup.find_all('table')[position]
    records = [
        [cell.text for cell in row.find_all("td")[0:len(columns)]]
        for row in table.find_all("tr")
    ]
    frame = pd.DataFrame(records, columns=columns)
    frame.insert(0, 'Player', player)
    return frame


# Scrape both tables of every page, one DataFrame per table per URL.
dataframes = []
dataframes2 = []
for url in urls:
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    player = _player_name(url)
    dataframes.append(_table_frame(soup, 0, COLUMNS, player))    # first table
    dataframes2.append(_table_frame(soup, 1, COLUMNS2, player))  # second table

# reset_index() exposes each page's row number as an 'index' column, which
# together with 'Player' pairs up the rows of the two tables.
data = pd.concat(dataframes).reset_index()
data2 = pd.concat(dataframes2).reset_index()  # second table, not the first one again
# Overlapping labels from the second table get a trailing space as suffix.
data3 = data.merge(data2, on=['index', 'Player'], suffixes=('',' '))
# Keep only rows whose first-table 'School' cell is present, then drop the
# helper 'index' column.
data3 = data3.loc[data3['School'].notnull()].drop('index', axis=1)
# display() is not defined by this script; it is available inside
# IPython/Jupyter sessions. Use print() when running as a plain script.
display(data, data2, data3)
我是 Python 的新手,我遇到了一个我认为非常复杂的问题。我想从一个网站上解析两个 table 以获取大约 80 个 URL,其中一个页面的示例:https://www.sports-reference.com/cfb/players/sam-darnold-1.html
我需要 80 个 URL 中的第一个 table "Passing" 和第二个 table "Rushing and Receiving"(我知道如何获取第一个和第二个table)。但问题是我需要它来处理一个 csv 中的所有 80 个 URL。
这是我到目前为止的代码以及数据的样子:
import requests
import pandas as pd
from bs4 import BeautifulSoup  # used for the HTML parsing below

# Labels applied to the <td> cells kept from every row of the first table.
COLUMNS = ['School', 'Conf', 'Class', 'Pos', 'G', 'Cmp', 'Att', 'Pct', 'Yds','Y/A', 'AY/A', 'TD', 'Int', 'Rate']
urls = ['https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
        'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
        'https://www.sports-reference.com/cfb/players/peyton-manning-1.html']

# Scrape the first <table> of each page into one DataFrame per URL.
dataframes = []
for url in urls:
    # Failures are handled per URL and reported, so one bad page neither
    # stops the remaining downloads nor disappears without a trace.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # treat HTTP error pages as failures
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find_all('table')[0]  # first "table" tag in the page
        cy_data = [
            # Text of the leading "td" cells, one inner list per "tr" row.
            [cell.text for cell in row.find_all("td")[0:len(COLUMNS)]]
            for row in table.find_all("tr")
        ]
        # Drop the row labelled 0 (presumably a header row that has no
        # <td> cells -- verify against the page markup).
        frame = pd.DataFrame(cy_data, columns=COLUMNS).drop(0, axis=0)
    except (requests.RequestException, IndexError, KeyError, ValueError) as err:
        print("Skipping", url, "-", repr(err))
        continue
    dataframes.append(frame)

# pd.concat() raises on an empty list, so only write the CSV when at least
# one page was scraped successfully.
if dataframes:
    data = pd.concat(dataframes)
    data.to_csv('testcsv3.csv', sep=',')
else:
    print("No tables were scraped; testcsv3.csv was not written")
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
| | | School | Conf | Class | Pos | G | Cmp | Att | Pct | Yds | Y/A | AY/A | TD | Int | Rate |
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
| 1 | | | | | | | | | | | | | | | |
| 2 | | North Carolina State | ACC | FR | QB | 11 | 150 | 275 | 54.5 | 1955 | 7.1 | 8.2 | 17 | 1 | 133.9 |
| 3 | | North Carolina State | ACC | SO | QB | 12 | 224 | 378 | 59.3 | 3027 | 8 | 8.3 | 31 | 11 | 147.8 |
| 4 | | North Carolina State | ACC | JR | QB | 13 | 308 | 527 | 58.4 | 3563 | 6.8 | 6.6 | 28 | 14 | 127.5 |
| 5 | | Wisconsin | Big Ten | SR | QB | 14 | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 |
| 6 | | Overall | | | | | 907 | 1489 | 60.9 | 11720 | 7.9 | 8.4 | 109 | 30 | 147.2 |
| 7 | | North Carolina State | | | | | 682 | 1180 | 57.8 | 8545 | 7.2 | 7.5 | 76 | 26 | 135.5 |
| 8 | | Wisconsin | | | | | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 |
| 1 | | | | | | | | | | | | | | | |
| 2 | | Florida | SEC | FR | QB | 5 | 5 | 10 | 50 | 40 | 4 | 4 | 0 | 0 | 83.6 |
| 3 | | Florida | SEC | SO | QB | 1 | 1 | 2 | 50 | 14 | 7 | 7 | 0 | 0 | 108.8 |
| 4 | | Auburn | SEC | JR | QB | 14 | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 |
| 5 | | Overall | | | | | 191 | 292 | 65.4 | 2908 | 10 | 10.9 | 30 | 7 | 178.2 |
| 6 | | Florida | | | | | 6 | 12 | 50 | 54 | 4.5 | 4.5 | 0 | 0 | 87.8 |
| 7 | | Auburn | | | | | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 |
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
And this is how I'd like the data to look. Note that the player name is missing from each grouping in my current output; ideally it can be added from the sample website/URL. I've also added the second table, which I need help appending:
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
| | | School | Conf | Class | Pos | G | Cmp | Att | Pct | Yds | Y/A | AY/A | TD | Int | Rate | School | Conf | Class | Pos | G | Att | Yds | Avg | TD |
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
| 1 | | | | | | | | | | | | | | | | | | | | | | | | |
| 2 | Russell Wilson | North Carolina State | ACC | FR | QB | 11 | 150 | 275 | 54.5 | 1955 | 7.1 | 8.2 | 17 | 1 | 133.9 | North Carolina State | ACC | FR | QB | 11 | 150 | 467 | 6.7 | 3 |
| 3 | Russell Wilson | North Carolina State | ACC | SO | QB | 12 | 224 | 378 | 59.3 | 3027 | 8 | 8.3 | 31 | 11 | 147.8 | North Carolina State | ACC | SO | QB | 12 | 129 | 300 | 6.8 | 2 |
| 4 | Russell Wilson | North Carolina State | ACC | JR | QB | 13 | 308 | 527 | 58.4 | 3563 | 6.8 | 6.6 | 28 | 14 | 127.5 | North Carolina State | ACC | JR | QB | 13 | 190 | 560 | 7.1 | 5 |
| 5 | Russell Wilson | Wisconsin | Big Ten | SR | QB | 14 | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 | Wisconsin | Big Ten | SR | QB | 14 | 210 | 671 | 7.3 | 7 |
| 6 | Russell Wilson | Overall | | | | | 907 | 1489 | 60.9 | 11720 | 7.9 | 8.4 | 109 | 30 | 147.2 | Overall | | | | | | | | |
| 7 | Russell Wilson | North Carolina State | | | | | 682 | 1180 | 57.8 | 8545 | 7.2 | 7.5 | 76 | 26 | 135.5 | North Carolina State | | | | | | | | |
| 8 | Russell Wilson | Wisconsin | | | | | 225 | 309 | 72.8 | 3175 | 10.3 | 11.8 | 33 | 4 | 191.8 | Wisconsin | | | | | | | | |
| 1 | | | | | | | | | | | | | | | | | | | | | | | | |
| 2 | Cam Newton | Florida | SEC | FR | QB | 5 | 5 | 10 | 50 | 40 | 4 | 4 | 0 | 0 | 83.6 | Florida | SEC | FR | QB | 5 | 210 | 456 | 7.1 | 2 |
| 3 | Cam Newton | Florida | SEC | SO | QB | 1 | 1 | 2 | 50 | 14 | 7 | 7 | 0 | 0 | 108.8 | Florida | SEC | SO | QB | 1 | 212 | 478 | 4.5 | 5 |
| 4 | Cam Newton | Auburn | SEC | JR | QB | 14 | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 | Auburn | SEC | JR | QB | 14 | 219 | 481 | 6.7 | 6 |
| 5 | Cam Newton | Overall | | | | | 191 | 292 | 65.4 | 2908 | 10 | 10.9 | 30 | 7 | 178.2 | Overall | | | | | | | 3.4 | 7 |
| 6 | Cam Newton | Florida | | | | | 6 | 12 | 50 | 54 | 4.5 | 4.5 | 0 | 0 | 87.8 | Florida | | | | | | | | |
| 7 | Cam Newton | Auburn | | | | | 185 | 280 | 66.1 | 2854 | 10.2 | 11.2 | 30 | 7 | 182 | Auburn | | | | | | | | |
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
So basically, I want to append the second table (only the columns mentioned) to the end of the first table and add the player name (read from the URL) to each row.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Labels for the <td> cells kept from the first table of each page.
COLUMNS = ['School', 'Conf', 'Class', 'Pos', 'G', 'Cmp', 'Att', 'Pct', 'Yds','Y/A', 'AY/A', 'TD', 'Int', 'Rate']
# Labels for the <td> cells kept from the second table of each page.
# NOTE(review): 'Yds', 'Avg' and 'TD' each appear three times here, so the
# merged frame carries duplicate column labels -- consider renaming them.
COLUMNS2 = ['School', 'Conf', 'Class', 'Pos', 'G', 'Att', 'Yds','Avg', 'TD', 'Rec', 'Yds', 'Avg', 'TD', 'Plays', 'Yds', 'Avg', 'TD']
urls = ['https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
        'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
        'https://www.sports-reference.com/cfb/players/peyton-manning-1.html']


def _player_name(url):
    """Return the first two '-'-separated tokens of the URL's file name, title-cased."""
    tokens = url.split('/')[5].split('-')
    return tokens[0].title() + ' ' + tokens[1].title()


def _table_frame(soup, position, columns, player):
    """Build a DataFrame from the <td> cells of the table at ``position`` in ``soup``.

    Each <tr> row contributes the text of its first ``len(columns)`` <td>
    cells, so the row width can never exceed the number of labels. A
    'Player' column holding ``player`` is inserted as the first column.
    """
    table = soup.find_all('table')[position]
    records = [
        [cell.text for cell in row.find_all("td")[0:len(columns)]]
        for row in table.find_all("tr")
    ]
    frame = pd.DataFrame(records, columns=columns)
    frame.insert(0, 'Player', player)
    return frame


# Scrape both tables of every page, one DataFrame per table per URL.
dataframes = []
dataframes2 = []
for url in urls:
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    player = _player_name(url)
    dataframes.append(_table_frame(soup, 0, COLUMNS, player))    # first table
    dataframes2.append(_table_frame(soup, 1, COLUMNS2, player))  # second table

# reset_index() exposes each page's row number as an 'index' column, which
# together with 'Player' pairs up the rows of the two tables.
data = pd.concat(dataframes).reset_index()
data2 = pd.concat(dataframes2).reset_index()  # second table, not the first one again
# Overlapping labels from the second table get a trailing space as suffix.
data3 = data.merge(data2, on=['index', 'Player'], suffixes=('',' '))
# Keep only rows whose first-table 'School' cell is present, then drop the
# helper 'index' column.
data3 = data3.loc[data3['School'].notnull()].drop('index', axis=1)
# display() is not defined by this script; it is available inside
# IPython/Jupyter sessions. Use print() when running as a plain script.
display(data, data2, data3)