How to extract a job description (web scraping) from a site using Python
I am trying to extract the job description from a job site. I get all the other details, but not the job description. I have attached my code and details below. With this code I get the company details, the location, and some other data separately; in the same way, I need the full job description for each job. When I run it with the Job_Description part included, I do not get any data back.
import requests
from bs4 import BeautifulSoup

url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

# Collect the pagination links from the first results page.
Links = soup.find("div", {"class": "pagination"}).find_all('a')
Page = [tag['href'] for tag in Links]

# Walk the first two result pages (start=0 and start=10).
for pageid in range(0, 2):
    pageid = 10 * pageid
    website = f'https://in.indeed.com/jobs?q=software+engineer&l=Kerala&sort=date&start={pageid}'
    soup = BeautifulSoup(requests.get(website, headers=headers).content, "html.parser")
    SubLinks = soup.find("div", {"class": "pagination"}).find_all('a')
    Page = list(set(Page + [tag['href'] for tag in SubLinks]))

# Collect the job ids from the last page fetched.
for job in soup.select('a[id^="job_"]'):
    job_id = job["id"].split("_")[-1]
    #s = BeautifulSoup(requests.get(api_url.format(job_id=job_id), headers=headers).content, "html.parser")

data = []
Company_Name = []
Location = []
Job_Description = []

# Company names and locations come back fine; the description loop returns nothing.
for div_block in soup.find_all('span', class_=['companyName'], style=None):
    Company_Name.append([line.strip() for line in div_block.stripped_strings])
for div_block in soup.find_all('div', class_=['companyLocation'], style=None):
    Location.append([line.strip() for line in div_block.stripped_strings])
for div_block in soup.find_all('div', class_=['jobsearch-JobComponent-description icl-u-xs-mt--md'], style=None):
    Job_Description.append([line.strip() for line in div_block.stripped_strings])
Since you are working from the paginated search section of indeed.com, you will not get the full job description unless you select a job and open it.
That being said, I believe what you are looking for is the job snippet, which should give you the result you want given the search criteria in your code.
for div_block in soup.find_all('div', class_=['job-snippet'], style=None):
    Job_Description.append([line.strip() for line in div_block.stripped_strings])
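Note that each entry in Job_Description will then be a list of text lines, one per string inside the snippet div. If you would rather have one plain string per job, a small follow-up step (a sketch that assumes the Job_Description list filled by the loop above) could join them:

# Join each snippet's lines into a single readable string.
# Assumes Job_Description was populated by the job-snippet loop above.
snippets = [" ".join(lines) for lines in Job_Description]
for text in snippets:
    print(text)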
Based on what you are looking for, I think you actually want all of the data and not just the snippet, so I would approach it like this.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests, json
from bs4 import BeautifulSoup


def main():
    url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

    # The search page embeds all job-card data as JSON in an inline script;
    # grab that line and strip the assignment so it can be parsed as JSON.
    job_card = None
    for row in str(soup).split('\n'):
        if 'window.mosaic.providerData["mosaic-provider-jobcards"]=' in row:
            job_card = row.replace('window.mosaic.providerData["mosaic-provider-jobcards"]=', '').replace(';', '')
    job_card_data = json.loads(job_card)

    job_list = list()
    for job in job_card_data['metaData']['mosaicProviderJobCardsModel']['results']:
        job_dict = job
        # Open each job's own page and pull the full description text.
        job_full_soup_url = 'https://in.indeed.com{}'.format(job['viewJobLink'])
        job_full_soup = BeautifulSoup(requests.get(job_full_soup_url, headers=headers).content, "html.parser")
        for div_block in job_full_soup.find_all('div', class_=['jobsearch-jobDescriptionText'], style=None):
            job_dict['full_description'] = [line.strip() for line in div_block.stripped_strings]
        job_list.append(job_dict)

    print(json.dumps(job_list, indent=4))


if __name__ == '__main__':
    main()
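The script above only parses the first results page. If you also want the pagination from your original code, a minimal sketch could reuse the same embedded-JSON trick across several start= offsets (the helper name extract_job_cards is mine, not part of the answer above):

import json
import time

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}


def extract_job_cards(soup):
    # Pull the embedded mosaic-provider-jobcards JSON out of the page,
    # exactly as the answer's main() does.
    marker = 'window.mosaic.providerData["mosaic-provider-jobcards"]='
    for row in str(soup).split('\n'):
        if marker in row:
            payload = row.replace(marker, '').replace(';', '')
            return json.loads(payload)['metaData']['mosaicProviderJobCardsModel']['results']
    return []


all_jobs = []
for pageid in range(0, 2):  # first two result pages, as in the original loop
    start = 10 * pageid
    website = f'https://in.indeed.com/jobs?q=software+engineer&l=Kerala&sort=date&start={start}'
    soup = BeautifulSoup(requests.get(website, headers=HEADERS, timeout=30).content, "html.parser")
    all_jobs.extend(extract_job_cards(soup))
    time.sleep(1)  # small pause between page requests

print(len(all_jobs), "job cards collected")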