双 try/except - bs4 最佳实践
Double try/except - bs4 best practices
您可以在下面找到我用来从 SO 的职位列表中获取一些详细信息的方法之一。我事先不知道该页面是否包含我需要的所有字段(因此顶部的空字典声明)。
现在,我希望该方法抛出错误的唯一情况是发生 HTTPError;
如果它在页面上找不到任何内容,我希望它返回一个空字典。使用 2 个不同的 try/except
块(每种错误类型一个)可以得到正确的行为,但我想知道是否有更优雅/简洁的方法来实现这一点。
get_so_extras.py:
import re
import time

import requests
from bs4 import BeautifulSoup
# Headers sent with every request made by get_so_extras (passed as
# `headers=ua` to requests.get); only a User-Agent string is set.
ua = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
def get_so_extras(job_url):
    """
    Get additional information from a Stack Overflow job listing page.

    Each field is extracted independently, so a listing that lacks one
    field (for example the logo) still yields the others. Fields that
    cannot be found keep their default value of ``None``.

    Args:
        job_url (str): url pointing at job listing's page

    Returns:
        dict: a dict with the keys ``company_logo``, ``salary_lower``,
            ``salary_upper`` and ``salary_currency``. The salary bounds
            are the digit strings exactly as they appear on the page.

    Raises:
        Any exception raised by ``requests.get`` other than
        ``requests.HTTPError`` propagates to the caller; an HTTP error
        status is printed and the defaults are returned.
    """
    extra_info = {
        "company_logo": None,
        "salary_lower": None,
        "salary_upper": None,
        "salary_currency": None,
    }
    # Lookup failures that mean "this element is not on the page":
    # find() returned None (AttributeError), a child tag was None
    # (TypeError), or the attribute is absent (KeyError).
    missing = (AttributeError, TypeError, KeyError)

    try:
        page = requests.get(job_url, headers=ua)
        # requests does not raise on 4xx/5xx by itself; without this call
        # the HTTPError handler below could never run.
        page.raise_for_status()
    except requests.HTTPError as e:
        print(e)
    else:
        soup = BeautifulSoup(page.text, "html.parser")

        try:
            logo_div = soup.find(
                "div", attrs={"class": "grid--cell bg-white fl-shrink0"})
            extra_info["company_logo"] = logo_div.img["src"]
        except missing:
            pass  # listing has no logo; keep None

        # Salary information
        try:
            salary = soup.find("div", attrs={"class": "mt12"}).span["title"]
        except missing:
            salary = None  # listing has no salary; keep None for all three

        if salary:
            # Currency marker: leading run of characters that are not
            # digits, separators or whitespace.
            currency = re.match(r"[^\d\.\,\s]+", salary)
            if currency:
                extra_info["salary_currency"] = currency[0]
            # The first two digit runs are taken as lower and upper bound.
            amounts = re.findall(r"\d+", salary)
            if amounts:
                extra_info["salary_lower"] = amounts[0]
            if len(amounts) > 1:
                extra_info["salary_upper"] = amounts[1]

    time.sleep(3)  # be kind
    return extra_info
感谢任何反馈
您只需要一个 try,再配合多个 except 子句,即可捕获您想要处理的各种错误。
对于您的代码片段,您可以这样做:
# One try block with one except clause per kind of failure.
try:
    page = requests.get(job_url, headers=ua)
    # requests only raises HTTPError when asked to; without this call the
    # HTTPError handler below would never run.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")
    logo = soup.find(
        "div", attrs={"class": "grid--cell bg-white fl-shrink0"}).img["src"]
    extra_info["company_logo"] = logo
    # Salary information
    salary = soup.find("div", attrs={'class': 'mt12'}).span["title"]
    extra_info["salary_currency"] = re.match(r"[^\d\.\,\s]+", salary)[0]
    extra_info["salary_lower"] = re.findall(r"(\d+)(|\s-\s)", salary)[0][0]
    extra_info["salary_upper"] = re.findall(r"(\d+)(|\s-\s)", salary)[1][0]
except requests.HTTPError as e:
    print(e)
except (AttributeError, TypeError, KeyError, IndexError):
    # An expected element or attribute is missing from the page: keep
    # whatever was collected so far. Other exceptions are not swallowed.
    pass
您可以在下面找到我用来从 SO 的职位列表中获取一些详细信息的方法之一。我事先不知道该页面是否包含我需要的所有字段(因此顶部的空字典声明)。
现在,我希望该方法抛出错误的唯一情况是发生 HTTPError;
如果它在页面上找不到任何内容,我希望它返回一个空字典。使用 2 个不同的 try/except
块(每种错误类型一个)可以得到正确的行为,但我想知道是否有更优雅/简洁的方法来实现这一点。
get_so_extras.py:
from bs4 import BeautifulSoup
import re
imoprt requests
# Headers sent with every request made by get_so_extras (passed as
# `headers=ua` to requests.get); only a User-Agent string is set.
ua = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
def get_so_extras(job_url):
    """
    Get additional information from a Stack Overflow job listing page.

    Each field is extracted independently, so a listing that lacks one
    field (for example the logo) still yields the others. Fields that
    cannot be found keep their default value of ``None``.

    Args:
        job_url (str): url pointing at job listing's page

    Returns:
        dict: a dict with the keys ``company_logo``, ``salary_lower``,
            ``salary_upper`` and ``salary_currency``. The salary bounds
            are the digit strings exactly as they appear on the page.

    Raises:
        Any exception raised by ``requests.get`` other than
        ``requests.HTTPError`` propagates to the caller; an HTTP error
        status is printed and the defaults are returned.
    """
    extra_info = {
        "company_logo": None,
        "salary_lower": None,
        "salary_upper": None,
        "salary_currency": None,
    }
    # Lookup failures that mean "this element is not on the page":
    # find() returned None (AttributeError), a child tag was None
    # (TypeError), or the attribute is absent (KeyError).
    missing = (AttributeError, TypeError, KeyError)

    try:
        page = requests.get(job_url, headers=ua)
        # requests does not raise on 4xx/5xx by itself; without this call
        # the HTTPError handler below could never run.
        page.raise_for_status()
    except requests.HTTPError as e:
        print(e)
    else:
        soup = BeautifulSoup(page.text, "html.parser")

        try:
            logo_div = soup.find(
                "div", attrs={"class": "grid--cell bg-white fl-shrink0"})
            extra_info["company_logo"] = logo_div.img["src"]
        except missing:
            pass  # listing has no logo; keep None

        # Salary information
        try:
            salary = soup.find("div", attrs={"class": "mt12"}).span["title"]
        except missing:
            salary = None  # listing has no salary; keep None for all three

        if salary:
            # Currency marker: leading run of characters that are not
            # digits, separators or whitespace.
            currency = re.match(r"[^\d\.\,\s]+", salary)
            if currency:
                extra_info["salary_currency"] = currency[0]
            # The first two digit runs are taken as lower and upper bound.
            amounts = re.findall(r"\d+", salary)
            if amounts:
                extra_info["salary_lower"] = amounts[0]
            if len(amounts) > 1:
                extra_info["salary_upper"] = amounts[1]

    time.sleep(3)  # be kind
    return extra_info
感谢任何反馈
您只需要一个 try,再配合多个 except 子句,即可捕获您想要处理的各种错误。
对于您的代码片段,您可以这样做:
# One try block with one except clause per kind of failure.
try:
    page = requests.get(job_url, headers=ua)
    # requests only raises HTTPError when asked to; without this call the
    # HTTPError handler below would never run.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")
    logo = soup.find(
        "div", attrs={"class": "grid--cell bg-white fl-shrink0"}).img["src"]
    extra_info["company_logo"] = logo
    # Salary information
    salary = soup.find("div", attrs={'class': 'mt12'}).span["title"]
    extra_info["salary_currency"] = re.match(r"[^\d\.\,\s]+", salary)[0]
    extra_info["salary_lower"] = re.findall(r"(\d+)(|\s-\s)", salary)[0][0]
    extra_info["salary_upper"] = re.findall(r"(\d+)(|\s-\s)", salary)[1][0]
except requests.HTTPError as e:
    print(e)
except (AttributeError, TypeError, KeyError, IndexError):
    # An expected element or attribute is missing from the page: keep
    # whatever was collected so far. Other exceptions are not swallowed.
    pass