How can I get 'page links' from 'every page'?
I want to get 'every page links' from 'every page' with Python 3.
In my code, 'every page' is built from BaseUrl, and each page link is located by the CSS selector below,
where:

BaseUrl = 'https://www.jobplanet.co.kr/companies?sort_by=review_compensation_cache&industry_id=700&page='
select body = '#listCompanies > div > div.section_group > section:nth-child(1) > div > div > dl.content_col2_3.cominfo > dt > a'

Please review my code. I want to collect every link on every page into the list linkUrl. What is wrong with it?
from bs4 import BeautifulSoup
import csv
import os
import re
import requests
import json

# jobplanet
BaseUrl = 'https://www.jobplanet.co.kr/companies?sort_by=review_compensation_cache&industry_id=700&page='

for i in range(1, 5, 1):
    url = BaseUrl + str(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    body = soup.select('#listCompanies > div > div.section_group > section:nth-child(1) > div > div > dl.content_col2_3.cominfo > dt > a')
    #print(body)
    linkUrl = []
    for item in body:
        link = item.get('href')
        linkUrl.append(link)
print(linkUrl)
The CSS selector you chose returns only one record. I have provided a simpler CSS selector that returns all 10 records on each page.
You also need to define the list outside the loop.
from bs4 import BeautifulSoup
import requests

linkUrl = []
BaseUrl = 'https://www.jobplanet.co.kr/companies?sort_by=review_compensation_cache&industry_id=700&page={}'
for i in range(1, 6):
    url = BaseUrl.format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.select(".us_titb_l3 > a")
    for item in links:
        link = item.get('href')
        linkUrl.append(link)
print(linkUrl)
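As a side note, the question imports csv without ever using it. If the goal is to save the collected links afterwards, a minimal sketch along those lines could look like this (the filename company_links.csv is a made-up example, not from the original post):

import csv

# Assumes linkUrl has been filled by the loop above.
# 'company_links.csv' is a hypothetical filename chosen for illustration.
with open('company_links.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['link'])      # header row
    for link in linkUrl:
        writer.writerow([link])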
Your CSS selector was wrong, and I have also added pagination handling.
from bs4 import BeautifulSoup
import requests
from urllib import parse

# jobplanet
BaseUrl = 'https://www.jobplanet.co.kr/companies?sort_by=review_compensation_cache&industry_id=700&page={}'

source = requests.get(BaseUrl.format(1))
soup = BeautifulSoup(source.text, 'lxml')
last_page_index = soup.select('a[class="btn_pglast"]')  # getting the last page index
last_page_index = int(last_page_index[0].get('href').split('page=')[1]) if last_page_index else 1

linkUrl = []  # defined outside the loop so links from all pages accumulate
for i in range(1, last_page_index + 1):  # + 1 so the last page is included
    print('## Getting Page {} out of {}'.format(i, last_page_index))
    if i > 1:  # to avoid getting the same page again (page 1 was fetched above)
        url = BaseUrl.format(i)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
    body = soup.select('dt[class="us_titb_l3"] a')
    for item in body:
        link = item.get('href')
        link = parse.urljoin(BaseUrl, link)  # make relative hrefs absolute
        linkUrl.append(link)
print(linkUrl)
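For what it's worth, parse.urljoin is what turns the relative hrefs from the listing page into absolute URLs; the unfilled {} placeholder in BaseUrl is harmless here because a root-relative link replaces everything after the scheme and host. A quick illustration (the path /companies/12345 is a made-up example):

from urllib import parse

BaseUrl = 'https://www.jobplanet.co.kr/companies?sort_by=review_compensation_cache&industry_id=700&page={}'
# A root-relative href replaces the path and query of the base URL,
# so the {} placeholder never leaks into the result.
print(parse.urljoin(BaseUrl, '/companies/12345'))
# https://www.jobplanet.co.kr/companies/12345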