如何抓取两个具有相同类名的表?
How to scrape two tables with same classname?
这是我第一次抓取网站。问题是页面上有两个不同的表格(table)使用了相同的类名。据我目前的了解,要定位数据,需要通过 HTML 标签的类名来查找。
下面的代码可以抓取第一个表格的数据,但我还想抓取第二个表格的数据。
# Question script: downloads the Worldometers Albania population page, loads
# the historical population table into a DataFrame row by row, and then tries
# to read the forecast table.
# NOTE(review): block indentation appears to have been lost in this listing;
# the statements following each `for` / `if` are presumably their bodies.
import bs4 as bs
from urllib.request import Request, urlopen
import pandas as pd
# NOTE(review): `col` is not referenced anywhere in this script.
from pyparsing import col
# The request carries a browser-like User-Agent header.
req = Request('https://www.worldometers.info/world-population/albania-population/',
headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = bs.BeautifulSoup(webpage, 'html5lib')
# Albania population summary block
pupulation = soup.find(class_='col-md-8 country-pop-description')
# Iterates over the second <strong> match inside that block; each pass only
# prints an empty line because the real print below is commented out.
for i in pupulation.find_all('strong')[1]:
print()
# print(i.text, end=" ")
# City population table (only used by the commented-out print below)
city_population = soup.find(
class_='table table-hover table-condensed table-list')
# print(city_population.text, end=" ")
# The first table: historical population of Albania
# NOTE(review): there is no comma between 'Population' and 'Yearly Change %',
# so Python joins the two adjacent literals into the single column name
# 'PopulationYearly Change %'.
df = pd.DataFrame(columns=['Year', 'Population' 'Yearly Change %', 'Yearly Change', 'Migrants (net)', 'Median Age', 'Fertility Rate',
'Density(P/Km2)', 'Urban Pop %', 'Urban Population', "Countrys Share of Population", 'World Population', 'Albania Global Rank'])
# find() returns only the first element that matches the class string.
hisoric_population = soup.find('table',
class_='table table-striped table-bordered table-hover table-condensed table-list')
for row in hisoric_population.tbody.find_all('tr'):
columns = row.find_all('td')
# Rows without any <td> cell are skipped.
if (columns != []):
# Cells are read by position; indices 0-12 are accessed below.
# NOTE(review): strip('&0') removes leading/trailing '&' and '0' characters
# (not whitespace), so a value such as '1.10' becomes '1.1' - confirm intent.
Year = columns[0].text.strip()
Population = columns[1].text.strip()
YearlyChange_percent = columns[2].text.strip('&0')
YearlyChange = columns[3].text.strip()
Migrants_net = columns[4].text.strip()
MedianAge = columns[5].text.strip('&0')
FertilityRate = columns[6].text.strip('&0')
Density_P_Km2 = columns[7].text.strip()
UrbanPop_percent = columns[8].text.strip('&0')
Urban_Population = columns[9].text.strip()
Countrys_Share_of_Population = columns[10].text.strip('&0')
World_Population = columns[11].text.strip()
Albania_Global_Rank = columns[12].text.strip()
# NOTE(review): the dict below has no 'Urban Population' key, so the
# Urban_Population value read above is never stored. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df = df.append({'Year': Year, 'Population': Population, 'Yearly Change %': YearlyChange_percent, 'Yearly Change': YearlyChange, 'Migrants (net)': Migrants_net, 'Median Age': MedianAge, 'Fertility Rate': FertilityRate,
'Density(P/Km2)': Density_P_Km2, 'Urban Pop %': UrbanPop_percent, 'Countrys Share of Population': Countrys_Share_of_Population, 'World Population': World_Population, 'Albania Global Rank': Albania_Global_Rank}, ignore_index=True)
# The result of head() is not assigned or printed here.
df.head()
# print(df)
# The second table: Albania population forecast
# NOTE(review): this is the same find() call with the same class string as
# for hisoric_population, so it returns the same first table again.
forecast_population = soup.find(
'table', class_='table table-striped table-bordered table-hover table-condensed table-list')
# NOTE(review): the loop iterates hisoric_population, not forecast_population.
for row in hisoric_population.tbody.find_all('tr'):
columns = row.find_all('td')
print(columns)
我认为您可以使用 find_all 一次获取这两个表格。它返回一个 'bs4.element.ResultSet'(即下面代码中的 tables),您可以用 for 循环遍历它,也可以按索引访问其中的元素:
# Both tables carry the identical class string, so collect every match
# and pick each table by its position in the result set.
table_class = 'table table-striped table-bordered table-hover table-condensed table-list'
tables = soup.find_all('table', class_=table_class)
hisoric_population, forecast_population = tables[0], tables[1]
如上所述,请使用 .find_all()。.find() 只会返回它找到的第一个匹配项,而 find_all() 会把找到的所有匹配项放入一个列表中返回;之后您需要通过索引取出想要的那一个。
另外,为什么不直接用 pandas 来解析表格呢?pandas 的 read_html 默认先尝试用 lxml 解析,失败时会回退到 BeautifulSoup(bs4 + html5lib)。
# Parse both population tables straight into DataFrames with pandas.
from io import StringIO

import requests
import pandas as pd

url = 'https://www.worldometers.info/world-population/albania-population/'
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of letting read_html
# fail later on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# read_html returns one DataFrame per <table> whose attributes match `attrs`.
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated since pandas 2.1.
dfs = pd.read_html(
    StringIO(response.text),
    attrs={'class': 'table table-striped table-bordered table-hover table-condensed table-list'},
)
# Both tables share the same class, so they are selected by position.
historic_population = dfs[0]
forecast_population = dfs[1]
输出:
print(historic_population)
Year Population ... World Population AlbaniaGlobal Rank
0 2020 2877797 ... 7794798739 140
1 2019 2880917 ... 7713468100 140
2 2018 2882740 ... 7631091040 140
3 2017 2884169 ... 7547858925 140
4 2016 2886438 ... 7464022049 141
5 2015 2890513 ... 7379797139 141
6 2010 2948023 ... 6956823603 138
7 2005 3086810 ... 6541907027 134
8 2000 3129243 ... 6143493823 131
9 1995 3112936 ... 5744212979 130
10 1990 3286073 ... 5327231061 125
11 1985 2969672 ... 4870921740 125
12 1980 2682690 ... 4458003514 125
13 1975 2411732 ... 4079480606 126
14 1970 2150707 ... 3700437046 125
15 1965 1896171 ... 3339583597 127
16 1960 1636090 ... 3034949748 124
17 1955 1419994 ... 2773019936 127
[18 rows x 13 columns]
print(forecast_population)
Year Population ... World Population AlbaniaGlobal Rank
0 NaN NaN ... NaN NaN
1 2020.0 2877797.0 ... 7.794799e+09 140.0
2 2025.0 2840464.0 ... 8.184437e+09 141.0
3 2030.0 2786974.0 ... 8.548487e+09 143.0
4 2035.0 2721082.0 ... 8.887524e+09 145.0
5 2040.0 2634384.0 ... 9.198847e+09 146.0
6 2045.0 2533645.0 ... 9.481803e+09 147.0
7 2050.0 2424061.0 ... 9.735034e+09 148.0
[8 rows x 13 columns]
这是我第一次抓取网站。问题是页面上有两个不同的表格(table)使用了相同的类名。据我目前的了解,要定位数据,需要通过 HTML 标签的类名来查找。下面的代码可以抓取第一个表格的数据,但我还想抓取第二个表格的数据。
# Question script: downloads the Worldometers Albania population page, loads
# the historical population table into a DataFrame row by row, and then tries
# to read the forecast table.
# NOTE(review): block indentation appears to have been lost in this listing;
# the statements following each `for` / `if` are presumably their bodies.
import bs4 as bs
from urllib.request import Request, urlopen
import pandas as pd
# NOTE(review): `col` is not referenced anywhere in this script.
from pyparsing import col
# The request carries a browser-like User-Agent header.
req = Request('https://www.worldometers.info/world-population/albania-population/',
headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = bs.BeautifulSoup(webpage, 'html5lib')
# Albania population summary block
pupulation = soup.find(class_='col-md-8 country-pop-description')
# Iterates over the second <strong> match inside that block; each pass only
# prints an empty line because the real print below is commented out.
for i in pupulation.find_all('strong')[1]:
print()
# print(i.text, end=" ")
# City population table (only used by the commented-out print below)
city_population = soup.find(
class_='table table-hover table-condensed table-list')
# print(city_population.text, end=" ")
# The first table: historical population of Albania
# NOTE(review): there is no comma between 'Population' and 'Yearly Change %',
# so Python joins the two adjacent literals into the single column name
# 'PopulationYearly Change %'.
df = pd.DataFrame(columns=['Year', 'Population' 'Yearly Change %', 'Yearly Change', 'Migrants (net)', 'Median Age', 'Fertility Rate',
'Density(P/Km2)', 'Urban Pop %', 'Urban Population', "Countrys Share of Population", 'World Population', 'Albania Global Rank'])
# find() returns only the first element that matches the class string.
hisoric_population = soup.find('table',
class_='table table-striped table-bordered table-hover table-condensed table-list')
for row in hisoric_population.tbody.find_all('tr'):
columns = row.find_all('td')
# Rows without any <td> cell are skipped.
if (columns != []):
# Cells are read by position; indices 0-12 are accessed below.
# NOTE(review): strip('&0') removes leading/trailing '&' and '0' characters
# (not whitespace), so a value such as '1.10' becomes '1.1' - confirm intent.
Year = columns[0].text.strip()
Population = columns[1].text.strip()
YearlyChange_percent = columns[2].text.strip('&0')
YearlyChange = columns[3].text.strip()
Migrants_net = columns[4].text.strip()
MedianAge = columns[5].text.strip('&0')
FertilityRate = columns[6].text.strip('&0')
Density_P_Km2 = columns[7].text.strip()
UrbanPop_percent = columns[8].text.strip('&0')
Urban_Population = columns[9].text.strip()
Countrys_Share_of_Population = columns[10].text.strip('&0')
World_Population = columns[11].text.strip()
Albania_Global_Rank = columns[12].text.strip()
# NOTE(review): the dict below has no 'Urban Population' key, so the
# Urban_Population value read above is never stored. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df = df.append({'Year': Year, 'Population': Population, 'Yearly Change %': YearlyChange_percent, 'Yearly Change': YearlyChange, 'Migrants (net)': Migrants_net, 'Median Age': MedianAge, 'Fertility Rate': FertilityRate,
'Density(P/Km2)': Density_P_Km2, 'Urban Pop %': UrbanPop_percent, 'Countrys Share of Population': Countrys_Share_of_Population, 'World Population': World_Population, 'Albania Global Rank': Albania_Global_Rank}, ignore_index=True)
# The result of head() is not assigned or printed here.
df.head()
# print(df)
# The second table: Albania population forecast
# NOTE(review): this is the same find() call with the same class string as
# for hisoric_population, so it returns the same first table again.
forecast_population = soup.find(
'table', class_='table table-striped table-bordered table-hover table-condensed table-list')
# NOTE(review): the loop iterates hisoric_population, not forecast_population.
for row in hisoric_population.tbody.find_all('tr'):
columns = row.find_all('td')
print(columns)
我认为您可以使用 find_all 一次获取这两个表格。它返回一个 'bs4.element.ResultSet'(即下面代码中的 tables),您可以用 for 循环遍历它,也可以按索引访问其中的元素:
# Both tables carry the identical class string, so collect every match
# and pick each table by its position in the result set.
table_class = 'table table-striped table-bordered table-hover table-condensed table-list'
tables = soup.find_all('table', class_=table_class)
hisoric_population, forecast_population = tables[0], tables[1]
如上所述,请使用 .find_all()。.find() 只会返回它找到的第一个匹配项,而 find_all() 会把找到的所有匹配项放入一个列表中返回;之后您需要通过索引取出想要的那一个。
另外,为什么不直接用 pandas 来解析表格呢?pandas 的 read_html 默认先尝试用 lxml 解析,失败时会回退到 BeautifulSoup(bs4 + html5lib)。
# Parse both population tables straight into DataFrames with pandas.
from io import StringIO

import requests
import pandas as pd

url = 'https://www.worldometers.info/world-population/albania-population/'
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of letting read_html
# fail later on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# read_html returns one DataFrame per <table> whose attributes match `attrs`.
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated since pandas 2.1.
dfs = pd.read_html(
    StringIO(response.text),
    attrs={'class': 'table table-striped table-bordered table-hover table-condensed table-list'},
)
# Both tables share the same class, so they are selected by position.
historic_population = dfs[0]
forecast_population = dfs[1]
输出:
print(historic_population)
Year Population ... World Population AlbaniaGlobal Rank
0 2020 2877797 ... 7794798739 140
1 2019 2880917 ... 7713468100 140
2 2018 2882740 ... 7631091040 140
3 2017 2884169 ... 7547858925 140
4 2016 2886438 ... 7464022049 141
5 2015 2890513 ... 7379797139 141
6 2010 2948023 ... 6956823603 138
7 2005 3086810 ... 6541907027 134
8 2000 3129243 ... 6143493823 131
9 1995 3112936 ... 5744212979 130
10 1990 3286073 ... 5327231061 125
11 1985 2969672 ... 4870921740 125
12 1980 2682690 ... 4458003514 125
13 1975 2411732 ... 4079480606 126
14 1970 2150707 ... 3700437046 125
15 1965 1896171 ... 3339583597 127
16 1960 1636090 ... 3034949748 124
17 1955 1419994 ... 2773019936 127
[18 rows x 13 columns]
print(forecast_population)
Year Population ... World Population AlbaniaGlobal Rank
0 NaN NaN ... NaN NaN
1 2020.0 2877797.0 ... 7.794799e+09 140.0
2 2025.0 2840464.0 ... 8.184437e+09 141.0
3 2030.0 2786974.0 ... 8.548487e+09 143.0
4 2035.0 2721082.0 ... 8.887524e+09 145.0
5 2040.0 2634384.0 ... 9.198847e+09 146.0
6 2045.0 2533645.0 ... 9.481803e+09 147.0
7 2050.0 2424061.0 ... 9.735034e+09 148.0
[8 rows x 13 columns]