使用替换方法后在数据框中选择不同的特定值
Selecting different specific values in a DataFrame after using the replace method
这是我的代码:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
# Seasons whose league-wide passing tables are downloaded below.
years_list = [2001, 2002, 2008, 2012, 2015,2018, 2020 , 2021]
# Players to keep; matched by exact string equality in the filter loop further down.
# NOTE(review): 'Drew Bress' and 'Russel Wilson' look like misspellings of
# 'Drew Brees' and 'Russell Wilson'; if so, the exact-match filter would find
# no rows for them -- confirm against the site's spelling.
player_list = ['Mac Jones', 'Aaron Rodgers', 'Deshaun Watson', 'Patrick Mahomes',
'Josh Allen', 'Ryan Tannehill', 'Drew Bress', 'Russel Wilson',
'Kirk Cousins', 'Tom Brady', 'Derek Carr']
# Columns kept from each season's table.
cols = ['Player', 'Tm','Cmp%', 'Yds', 'TD', 'Int', 'Y/A', 'Rate', 'G']
# One DataFrame per season; concatenated after the loop.
df_list = []
# Fetch one season per iteration.
# NOTE(review): indentation was lost in this listing; the five lines after the
# `for` are presumably the loop body.
for year in years_list:
url_mac = f'https://www.pro-football-reference.com/years/{year}/passing.htm'
# read_html returns a list of tables; [0] takes the first, [cols] narrows it.
temp_df = pd.read_html(url_mac)[0][cols]
# Tag every row with the season it came from.
temp_df['Season'] = year
df_list.append(temp_df)
print(f'Collected: {year}')
data_radar = pd.concat(df_list)
# Rename the column sixth from the end to 'y_sack'. With the nine `cols` plus
# 'Season' (ten columns) that position holds 'TD'.
# NOTE(review): confirm this is the intended column.
new_columns = data_radar.columns.values
new_columns[-6] = 'y_sack'
data_radar.columns = new_columns
# Collect rows for each listed player under four name variants:
# 'Name*', 'Name*+', 'Name' and 'Name+'.
# NOTE(review): DataFrame.append was removed in pandas 2.0; on current pandas
# this loop needs pd.concat instead.
mid_data = pd.DataFrame()
for player in player_list:
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '*'])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '*' + '+'])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '+'])
# Stat columns for the final frame (rebinds `cols`: 'TD' dropped, 'Season' added).
cols = ['Cmp%', 'Yds', 'Int', 'Y/A','Rate', 'G', 'Season']
# Placeholder; rebound a few lines below.
final_data = pd.DataFrame()
# Map suffixed names back to plain ones. Only the variants listed in this dict
# are normalised; any other suffixed name is left unchanged.
mid_data = mid_data.replace({'Tom Brady*':'Tom Brady', 'Aaron Rodgers*':'Aaron Rodgers','Aaron Rodgers*+':'Aaron Rodgers',
'Deshaun Watson*':'Deshaun Watson', 'Josh Allen*':'Josh Allen',
'Derek Carr*':'Derek Carr','Patrick Mahomes*':'Patrick Mahomes', 'Patrick Mahomes*+':'Patrick Mahomes' })
# Keep the identifying columns plus the stat columns.
final_data = mid_data[['Player', 'Tm'] + cols]
# NOTE: sort_values and drop_duplicates return new DataFrames by default.
# Neither result is assigned here, so final_data stays unsorted and still
# contains duplicate players. The sort is also by 'Player' only, not 'Season'.
final_data.sort_values(by = 'Player', ascending=True)
final_data.drop_duplicates(subset = 'Player')
我希望我的数据框 final_data 返回每个球员的第一个赛季,但对于那些需要用 replace 方法修正名字的球员,这样做行不通。
在调用 drop_duplicates() 之前,也就是我调用 sort_values 的地方,得到的就是我的结果。
我的想法是先对这些值排序,然后用 drop_duplicates() 选出每个球员的第一条记录。
所有需要使用 replace 方法的球员都会出现这种情况。如何解决这个问题?
您的代码中有不少令人困惑的地方。首先,如果您只是想去掉球员姓名中的 '*' 和/或 '+',为什么不直接去掉,而要为每个球员逐一硬编码呢?其次,您的注释实际上并没有描述代码在做什么。我不明白下面这段的意义:
#Converting colums from object to floats
cols = ['Cmp%', 'Yds', 'Int', 'Y/A','Rate', 'G', 'Season']
final_data = pd.DataFrame()
因为这里并没有转换为浮点数;而且 `# picking top 10 qb in rating stats in last season + Mac Jones` 这条注释下面的代码也没有做注释所说的事。您的注释很难理解。
第三,如果您想要每个球员的第一个赛季,就需要按 'Season' 排序;这样在删除重复的球员姓名时,可以明确指定保留该球员的第一条记录(keep='first')。排序之后,这一条就是他们在数据框中的第一个赛季。
试试这个:
import pandas as pd
# Build a table holding, for every player found in the downloaded passing
# tables, the row from that player's earliest listed season.

# Seasons whose league-wide passing tables are downloaded.
years_list = [2001, 2002, 2008, 2012, 2015, 2018, 2020, 2021]
# Players of interest. Not used for filtering below: the result covers every
# player present in the downloaded tables.
player_list = ['Mac Jones', 'Aaron Rodgers', 'Deshaun Watson', 'Patrick Mahomes',
               'Josh Allen', 'Ryan Tannehill', 'Drew Bress', 'Russel Wilson',
               'Kirk Cousins', 'Tom Brady', 'Derek Carr']
# Columns kept from each season's table.
cols = ['Player', 'Tm', 'Cmp%', 'Yds', 'TD', 'Int', 'Y/A', 'Rate', 'G']
df_list = []
# Download one table per season.
for year in years_list:
    url_mac = f'https://www.pro-football-reference.com/years/{year}/passing.htm'
    temp_df = pd.read_html(url_mac)[0][cols]
    # Drop rows whose 'Player' cell is the literal text 'Player' (header rows
    # repeated inside the table body). Copy so the column assignment below
    # writes to an independent frame rather than a slice.
    temp_df = temp_df[temp_df['Player'] != 'Player'].copy()
    temp_df['Season'] = year
    df_list.append(temp_df)
    print(f'Collected: {year}')
data_radar = pd.concat(df_list)

# Rename the column sixth from the end to 'y_sack'. Work on a copy of the
# label array: mutating Index.values in place is unsupported.
# NOTE(review): with the nine `cols` plus 'Season' this position holds 'TD';
# confirm that is the intended column.
new_columns = data_radar.columns.values.copy()
new_columns[-6] = 'y_sack'
data_radar.columns = new_columns

# Strip the '*' and '+' markers from player names. regex=True must be explicit:
# since pandas 2.0 the default is a literal match, which would leave every
# name unchanged and break the de-duplication below.
data_radar['Player'] = data_radar['Player'].str.replace(r'\*|\+', '', regex=True)

# Stat columns for the final frame.
cols = ['Cmp%', 'Yds', 'Int', 'Y/A', 'Rate', 'G', 'Season']
final_data = data_radar[['Player', 'Tm'] + cols]
# Sort by player, then season, so each player's earliest season comes first...
final_data = final_data.sort_values(by=['Player', 'Season'], ascending=[True, True])
# ...and keep only that first row per player.
final_data = final_data.drop_duplicates(subset='Player', keep='first')
输出:
print(final_data)
Player Tm Cmp% Yds Int Y/A Rate G Season
53 A.J. Feeley PHI 71.4 143 1 10.2 114.0 1 2001
41 A.J. McCarron CIN 66.4 854 2 7.2 97.1 7 2015
3 Aaron Brooks NOR 55.9 3832 22 6.9 76.4 16 2001
3 Aaron Rodgers GNB 63.6 4038 13 7.5 93.8 16 2008
71 Akili Smith CIN 62.5 37 0 4.6 73.4 2 2001
.. ... ... ... ... .. ... ... .. ...
89 Wayne Chrebet NYJ 0.0 0 0 0.0 39.6 15 2002
39 Zach Mettenberger TEN 60.8 935 7 5.6 66.7 7 2015
112 Zach Pascal IND 0.0 0 0 0.0 39.6 16 2020
27 Zach Wilson NYJ 55.2 628 7 6.0 51.6 3 2021
105 Zay Jones BUF 0.0 0 0 0.0 39.6 16 2018
[427 rows x 9 columns]
这是我的代码:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
# Seasons whose league-wide passing tables are downloaded below.
years_list = [2001, 2002, 2008, 2012, 2015,2018, 2020 , 2021]
# Players to keep; matched by exact string equality in the filter loop further down.
# NOTE(review): 'Drew Bress' and 'Russel Wilson' look like misspellings of
# 'Drew Brees' and 'Russell Wilson'; if so, the exact-match filter would find
# no rows for them -- confirm against the site's spelling.
player_list = ['Mac Jones', 'Aaron Rodgers', 'Deshaun Watson', 'Patrick Mahomes',
'Josh Allen', 'Ryan Tannehill', 'Drew Bress', 'Russel Wilson',
'Kirk Cousins', 'Tom Brady', 'Derek Carr']
# Columns kept from each season's table.
cols = ['Player', 'Tm','Cmp%', 'Yds', 'TD', 'Int', 'Y/A', 'Rate', 'G']
# One DataFrame per season; concatenated after the loop.
df_list = []
# Fetch one season per iteration.
# NOTE(review): indentation was lost in this listing; the five lines after the
# `for` are presumably the loop body.
for year in years_list:
url_mac = f'https://www.pro-football-reference.com/years/{year}/passing.htm'
# read_html returns a list of tables; [0] takes the first, [cols] narrows it.
temp_df = pd.read_html(url_mac)[0][cols]
# Tag every row with the season it came from.
temp_df['Season'] = year
df_list.append(temp_df)
print(f'Collected: {year}')
data_radar = pd.concat(df_list)
# Rename the column sixth from the end to 'y_sack'. With the nine `cols` plus
# 'Season' (ten columns) that position holds 'TD'.
# NOTE(review): confirm this is the intended column.
new_columns = data_radar.columns.values
new_columns[-6] = 'y_sack'
data_radar.columns = new_columns
# Collect rows for each listed player under four name variants:
# 'Name*', 'Name*+', 'Name' and 'Name+'.
# NOTE(review): DataFrame.append was removed in pandas 2.0; on current pandas
# this loop needs pd.concat instead.
mid_data = pd.DataFrame()
for player in player_list:
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '*'])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '*' + '+'])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '+'])
# Stat columns for the final frame (rebinds `cols`: 'TD' dropped, 'Season' added).
cols = ['Cmp%', 'Yds', 'Int', 'Y/A','Rate', 'G', 'Season']
# Placeholder; rebound a few lines below.
final_data = pd.DataFrame()
# Map suffixed names back to plain ones. Only the variants listed in this dict
# are normalised; any other suffixed name is left unchanged.
mid_data = mid_data.replace({'Tom Brady*':'Tom Brady', 'Aaron Rodgers*':'Aaron Rodgers','Aaron Rodgers*+':'Aaron Rodgers',
'Deshaun Watson*':'Deshaun Watson', 'Josh Allen*':'Josh Allen',
'Derek Carr*':'Derek Carr','Patrick Mahomes*':'Patrick Mahomes', 'Patrick Mahomes*+':'Patrick Mahomes' })
# Keep the identifying columns plus the stat columns.
final_data = mid_data[['Player', 'Tm'] + cols]
# NOTE: sort_values and drop_duplicates return new DataFrames by default.
# Neither result is assigned here, so final_data stays unsorted and still
# contains duplicate players. The sort is also by 'Player' only, not 'Season'.
final_data.sort_values(by = 'Player', ascending=True)
final_data.drop_duplicates(subset = 'Player')
我希望我的数据框 final_data 返回每个球员的第一个赛季,但对于那些需要用 replace 方法修正名字的球员,这样做行不通。
在调用 drop_duplicates() 之前,也就是我调用 sort_values 的地方,得到的就是我的结果。我的想法是先对这些值排序,然后用 drop_duplicates() 选出每个球员的第一条记录。
所有需要使用 replace 方法的球员都会出现这种情况。如何解决这个问题?
您的代码中有不少令人困惑的地方。首先,如果您只是想去掉球员姓名中的 '*' 和/或 '+',为什么不直接去掉,而要为每个球员逐一硬编码呢?其次,您的注释实际上并没有描述代码在做什么。我不明白下面这段的意义:
#Converting colums from object to floats
cols = ['Cmp%', 'Yds', 'Int', 'Y/A','Rate', 'G', 'Season']
final_data = pd.DataFrame()
因为这里并没有转换为浮点数;而且 `# picking top 10 qb in rating stats in last season + Mac Jones` 这条注释下面的代码也没有做注释所说的事。您的注释很难理解。
第三,如果您想要每个球员的第一个赛季,就需要按 'Season' 排序;这样在删除重复的球员姓名时,可以明确指定保留该球员的第一条记录(keep='first')。排序之后,这一条就是他们在数据框中的第一个赛季。
试试这个:
import pandas as pd
# Build a table holding, for every player found in the downloaded passing
# tables, the row from that player's earliest listed season.

# Seasons whose league-wide passing tables are downloaded.
years_list = [2001, 2002, 2008, 2012, 2015, 2018, 2020, 2021]
# Players of interest. Not used for filtering below: the result covers every
# player present in the downloaded tables.
player_list = ['Mac Jones', 'Aaron Rodgers', 'Deshaun Watson', 'Patrick Mahomes',
               'Josh Allen', 'Ryan Tannehill', 'Drew Bress', 'Russel Wilson',
               'Kirk Cousins', 'Tom Brady', 'Derek Carr']
# Columns kept from each season's table.
cols = ['Player', 'Tm', 'Cmp%', 'Yds', 'TD', 'Int', 'Y/A', 'Rate', 'G']
df_list = []
# Download one table per season.
for year in years_list:
    url_mac = f'https://www.pro-football-reference.com/years/{year}/passing.htm'
    temp_df = pd.read_html(url_mac)[0][cols]
    # Drop rows whose 'Player' cell is the literal text 'Player' (header rows
    # repeated inside the table body). Copy so the column assignment below
    # writes to an independent frame rather than a slice.
    temp_df = temp_df[temp_df['Player'] != 'Player'].copy()
    temp_df['Season'] = year
    df_list.append(temp_df)
    print(f'Collected: {year}')
data_radar = pd.concat(df_list)

# Rename the column sixth from the end to 'y_sack'. Work on a copy of the
# label array: mutating Index.values in place is unsupported.
# NOTE(review): with the nine `cols` plus 'Season' this position holds 'TD';
# confirm that is the intended column.
new_columns = data_radar.columns.values.copy()
new_columns[-6] = 'y_sack'
data_radar.columns = new_columns

# Strip the '*' and '+' markers from player names. regex=True must be explicit:
# since pandas 2.0 the default is a literal match, which would leave every
# name unchanged and break the de-duplication below.
data_radar['Player'] = data_radar['Player'].str.replace(r'\*|\+', '', regex=True)

# Stat columns for the final frame.
cols = ['Cmp%', 'Yds', 'Int', 'Y/A', 'Rate', 'G', 'Season']
final_data = data_radar[['Player', 'Tm'] + cols]
# Sort by player, then season, so each player's earliest season comes first...
final_data = final_data.sort_values(by=['Player', 'Season'], ascending=[True, True])
# ...and keep only that first row per player.
final_data = final_data.drop_duplicates(subset='Player', keep='first')
输出:
print(final_data)
Player Tm Cmp% Yds Int Y/A Rate G Season
53 A.J. Feeley PHI 71.4 143 1 10.2 114.0 1 2001
41 A.J. McCarron CIN 66.4 854 2 7.2 97.1 7 2015
3 Aaron Brooks NOR 55.9 3832 22 6.9 76.4 16 2001
3 Aaron Rodgers GNB 63.6 4038 13 7.5 93.8 16 2008
71 Akili Smith CIN 62.5 37 0 4.6 73.4 2 2001
.. ... ... ... ... .. ... ... .. ...
89 Wayne Chrebet NYJ 0.0 0 0 0.0 39.6 15 2002
39 Zach Mettenberger TEN 60.8 935 7 5.6 66.7 7 2015
112 Zach Pascal IND 0.0 0 0 0.0 39.6 16 2020
27 Zach Wilson NYJ 55.2 628 7 6.0 51.6 3 2021
105 Zay Jones BUF 0.0 0 0 0.0 39.6 16 2018
[427 rows x 9 columns]