How can I get the first class tag using Beautiful Soup?
I want to scrape the fund price and date from the following URL: https://www.blackrock.com/sg/en/products/241427/blackrock-cont-european-flexible-d4rf-gbp-fund and put the values into a table:
Date Price
21-Oct-2021 36.68
However, there are many of these spans in the HTML source:
<span class="header-nav-label navAmount">
NAV as of 21-Oct-2021
</span>
<span class="header-nav-data">
GBP 36.68
</span>
<span class="header-nav-data">
0.10
(0.27%)
</span>
But I only want to select the first class, which contains the daily price. I have tried the following code:
from bs4 import BeautifulSoup
import requests

# Create url list
urls = ['https://www.blackrock.com/sg/en/products/241427/blackrock-cont-european-flexible-d4rf-gbp-fund']
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}

# Build the scraping loop
for url in urls:
    # Extract HTML element (daily price and date) from url
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    spans = soup.findAll('span', {'class': 'header-nav-data'})
    for span in spans:
        print(span.text)
    spans1 = soup.findAll('span', {'class': 'header-nav-label navAmount'})
    print(spans1)
which returns:
GBP 36.8
0.1
(0.27%)
[<span class="header-nav-label navAmount">
NAV as of 21-Oct-2021
</span>]
Since I am only interested in the price, I only need to select the first one.
Try this (note that class is a reserved word in Python, so Beautiful Soup uses the class_ keyword argument):
soup.find_all('span', class_='header-nav-label navAmount')
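Also, find_all() returns every match; if only the first tag is wanted, find() returns it directly (or None when nothing matches). A minimal sketch along those lines, reusing the URL and headers from the question:

from bs4 import BeautifulSoup
import requests

url = 'https://www.blackrock.com/sg/en/products/241427/blackrock-cont-european-flexible-d4rf-gbp-fund'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

# find() stops at the first matching tag, unlike find_all()
date_span = soup.find('span', class_='header-nav-label navAmount')
price_span = soup.find('span', class_='header-nav-data')
if date_span and price_span:
    print(date_span.get_text(strip=True))   # e.g. NAV as of 21-Oct-2021
    print(price_span.get_text(strip=True))  # e.g. GBP 36.68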
You can use limit=1 (see the find_all documentation).
from bs4 import BeautifulSoup
import requests

# Create url list
urls = ['https://www.blackrock.com/sg/en/products/241427/blackrock-cont-european-flexible-d4rf-gbp-fund']
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}

# Build the scraping loop
for url in urls:
    # Extract HTML element (daily price and date) from url
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    spans = soup.findAll('span', {'class': 'header-nav-data'})
    print(spans)
    print('----------------------------')

    spans = soup.findAll('span', {'class': 'header-nav-data'}, limit=1)
    print(spans)
    print('---------------------')
    print(spans[0].text)
    # or
    for span in spans:
        print(span.text)

    spans1 = soup.findAll('span', {'class': 'header-nav-label navAmount'})
    print(spans1)
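If only the numeric price is needed, the single-element list returned by the limit=1 call above can be unpacked and cleaned inside the same loop. A small sketch, assuming the 'GBP 36.68' text format shown in the question:

# spans is the list from the limit=1 call above (at most one Tag)
if spans:
    raw = spans[0].get_text(strip=True)     # e.g. 'GBP 36.68'
    price = raw.replace('GBP', '').strip()  # '36.68'
    print(float(price))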
Here is a working solution using CSS selectors.
Code:
from bs4 import BeautifulSoup
import requests

# Create url list
urls = ['https://www.blackrock.com/sg/en/products/241427/blackrock-cont-european-flexible-d4rf-gbp-fund']
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}

# Build the scraping loop
for url in urls:
    # Extract HTML element (daily price and date) from url
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    spans1 = soup.select_one('ul.values-list li span:nth-child(1)').get_text(strip=True).replace('NAV as of', ' ')
    spans2 = soup.select_one('ul.values-list li span:nth-child(2)').get_text(strip=True).replace('GBP', ' ')
    print('Date:' + spans1)
    print('Price:' + spans2)
Output:
Date: 21-Oct-2021
Price: 36.68
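Since the question asks for the values in a table, the scraped strings can also be dropped into a pandas DataFrame. A minimal sketch, assuming the spans1 (date) and spans2 (price) strings produced in the loop above:

import pandas as pd

# Collect one (date, price) row from the values scraped above
rows = [(spans1.strip(), spans2.strip())]
df = pd.DataFrame(rows, columns=['Date', 'Price'])
print(df)
#           Date  Price
# 0  21-Oct-2021  36.68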
You can also pull the JSON out of the HTML:
import requests
import re
import json
import pandas as pd

url = 'https://www.blackrock.com/sg/en/products/241427/blackrock-cont-european-flexible-d4rf-gbp-fund'
response = requests.get(url)

# Pull the JavaScript array assigned to navData out of the page source
regex = r"(var navData = )(\[.*)(;)"
jsonStr = re.search(regex, response.text).groups()[1]

# Strip the Date.UTC(...)/Number(...).toFixed(...) wrappers, keeping only x:<price>
jsonStr = re.sub(r"((x:)(Date.UTC\(\d{4},\d{1,2},\d{1,2}\)),y:Number\({1,2})([\d.]*)([).\s\w\(]*)", r"\2\4", jsonStr)

# Quote the keys so the string parses as valid JSON
jsonStr = jsonStr.replace('x:', '"y":')
jsonStr = jsonStr.replace('formattedX:', '"Date":')

jsonData = json.loads(jsonStr)
df = pd.DataFrame(jsonData)
df = df[['Date', 'y']]
Output of print(df) (for just the most recent row, use print(df.tail(1))):
Date y
0 Thu, 13 Sep 2012 9.81
1 Fri, 14 Sep 2012 10.07
2 Mon, 17 Sep 2012 10.02
3 Tue, 18 Sep 2012 9.94
4 Wed, 19 Sep 2012 9.96
... ...
2275 Fri, 15 Oct 2021 36.30
2276 Mon, 18 Oct 2021 36.43
2277 Tue, 19 Oct 2021 36.48
2278 Wed, 20 Oct 2021 36.58
2279 Thu, 21 Oct 2021 36.68
[2280 rows x 2 columns]
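If the exact Date/Price table from the question is wanted, the last row of df can be reshaped. A small sketch, assuming the df built above:

# Take the most recent row and match the asker's Date/Price layout
last = df.tail(1).copy()
last['Date'] = pd.to_datetime(last['Date']).dt.strftime('%d-%b-%Y')
last = last.rename(columns={'y': 'Price'})
print(last.to_string(index=False))
#        Date  Price
# 21-Oct-2021  36.68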