如何使用 python 从站点中提取职位描述(网络抓取)

How to extract job descriptions from a site using Python (web scraping)

我试图从求职网站提取职位描述。除了职位描述之外,其他细节我都能获取到。我在下面附上了代码和详细信息。通过这段代码,我可以分别获得公司名称、所在地点以及其他一些数据。同样地,我还需要每个职位的完整职位描述。但在运行向 Job_Description 追加数据的那部分代码时,我没有得到任何数据。

import requests
from bs4 import BeautifulSoup
# Scrape Indeed search results ("software engineer" in Kerala, newest first)
# and collect company names, locations and job descriptions from the result
# pages.

# NOTE(review): this first User-Agent is never sent; `headers` is reassigned
# below before any request is made.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0"
}


url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
# NOTE(review): `find` returns None when no element matches, so this line
# raises AttributeError if the page has no <div class="pagination">.
Links = soup.find("div", {"class":"pagination"}).find_all('a')


# Seed the set of pagination hrefs from the first page.
Page= [tag['href'] for tag in Links]
# Visit two result pages: `start=0` and `start=10`.
for pageid in range(0,2):
    pageid=10*pageid
    website=f'https://in.indeed.com//jobs?q=software+engineer&l=Kerala&sort=date&start={pageid}'
    soup = BeautifulSoup(requests.get(website, headers=headers).content, "html.parser")
    SubLinks = soup.find("div", {"class":"pagination"}).find_all('a')
    # Merge this page's pagination hrefs into `Page`, dropping duplicates
    # (going through a set does not keep the original order).
    Page=list(set(Page+ [tag['href'] for tag in SubLinks]))

    # NOTE(review): `job_id` is computed for every job anchor but never used;
    # its only consumer is the commented-out request below.
    for job in soup.select('a[id^="job_"]'):
        job_id = job["id"].split("_")[-1]
        #s = BeautifulSoup(requests.get(api_url.format(job_id=job_id), headers=headers).content,"html.parser",)
    # NOTE(review): these lists are created inside the page loop, so each
    # iteration starts from empty lists and only the last page's data is
    # still available once the loop finishes.
    data=[]
    Company_Name=[]
    Location=[]
    Job_Description=[]
    # Each entry appended below is a list of the stripped text fragments found
    # inside one matching element.
    for div_block in soup.find_all('span', class_=['companyName',],style=None):
        Company_Name.append([line.strip() for line in div_block.stripped_strings])
    for div_block in soup.find_all('div', class_=['companyLocation'],style=None):
        Location.append([line.strip() for line in div_block.stripped_strings])
    # This is the lookup that yields no data for the asker.
    # NOTE(review): presumably the description element only exists on the
    # individual job page, not on the search-results page -- confirm.
    for div_block in soup.find_all('div',class_=['jobsearch-JobComponent-description icl-u-xs-mt--md'],style=None):
        Job_Description.append([line.strip() for line in div_block.stripped_strings])

由于您抓取的是 indeed.com 的分页搜索结果页,除非您点击(选中)某个职位并进入它的详情页,否则无法获得完整的职位描述。

话虽如此,我认为您要找的可能是职位摘要(job snippet),它可以在您代码当前的搜索条件下给出所需的结果。

# Gather the short snippet shown on each search-result card: one list of
# stripped text fragments per matching <div>.
snippet_blocks = soup.find_all('div', class_=['job-snippet'], style=None)
for snippet in snippet_blocks:
    snippet_lines = [text.strip() for text in snippet.stripped_strings]
    Job_Description.append(snippet_lines)

不过从您的描述来看,我认为您实际上想要获取完整的数据,而不仅仅是摘要,因此我建议采用下面的做法:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests, json
from bs4 import BeautifulSoup

# JavaScript assignment that precedes the embedded job-card JSON in the page.
_JOB_CARDS_MARKER = 'window.mosaic.providerData["mosaic-provider-jobcards"]='


def _extract_job_cards(html):
    """Return the decoded job-card payload embedded in a search-results page.

    Scans ``html`` line by line for the ``mosaic-provider-jobcards``
    assignment and decodes the JSON that follows it on the same line.

    Args:
        html: Full text of the search-results page.

    Returns:
        The object produced by ``json.loads`` for the embedded payload.

    Raises:
        ValueError: If no line contains the assignment, or the text after
            it is not valid JSON.
    """
    for row in html.split('\n'):
        _, marker, payload = row.partition(_JOB_CARDS_MARKER)
        if marker:
            # Remove only the trailing statement terminator. Semicolons
            # inside the JSON (e.g. in "&amp;" or free text) are data and
            # must survive.
            return json.loads(payload.strip().rstrip(';'))
    raise ValueError('job-card data not found in the search-results page')


def main():
    """Print the jobs of one Indeed search page, with full descriptions, as JSON.

    Downloads the search-results page, decodes the job-card data embedded in
    it, then requests each job's own page (via its ``viewJobLink``) and
    stores the text of its ``jobsearch-jobDescriptionText`` element under
    the ``full_description`` key. A job whose page has no such element is
    still printed, just without that key.

    Raises:
        ValueError: If the job-card data cannot be found or decoded.
        KeyError: If the decoded data lacks the expected keys.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    # Without a timeout a stalled connection would block the script forever.
    timeout = 30
    soup = BeautifulSoup(requests.get(url, headers=headers, timeout=timeout).content, "html.parser")
    job_card_data = _extract_job_cards(str(soup))

    job_list = []
    for job in job_card_data['metaData']['mosaicProviderJobCardsModel']['results']:
        job_page_url = 'https://in.indeed.com{}'.format(job['viewJobLink'])
        job_page = BeautifulSoup(requests.get(job_page_url, headers=headers, timeout=timeout).content, "html.parser")
        # If several elements match, the last one wins.
        for div_block in job_page.find_all('div', class_=['jobsearch-jobDescriptionText'], style=None):
            job['full_description'] = [line.strip() for line in div_block.stripped_strings]
        job_list.append(job)
    print(json.dumps(job_list, indent=4))

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()