使用 python 将数据写入 excel 时发生索引超出范围异常
Index out of range exception while writing data into excel using python
从 URL 抓取数据时出现错误:在访问嵌套列表(如 data1[y][5]、data2[y][0])时抛出 IndexError(索引超出范围)异常。
import requests
from bs4 import BeautifulSoup
import openpyxl
import uuid


def _safe_get(rows, i, j, default=''):
    """Return rows[i][j], or *default* when either index is out of range.

    The scraped pages do not always contain every expected block, so the
    nested lists can be shorter than assumed -- this guard prevents the
    original "index out of range" crash.
    """
    try:
        return rows[i][j]
    except IndexError:
        return default


# Create the workbook and write the header row ONCE, before the page loop.
# The original rebuilt the workbook inside the loop, discarding every
# previous page's rows on each iteration (and it never saved the file).
wb = openpyxl.Workbook()
ws = wb.active
columns = [
    'job_listing_id',    # 1
    'unique_hash',       # 2
    'status',            # 3
    'primary_skills',    # 4
    'secondary_skills',  # 5
    'title',             # 6
    'description',       # 7
    'job_type',          # 8
    'source',            # 9
    'experience',        # 10
    'location',          # 11
    'company',           # 12
    'posted_date',       # 13
    'expiryDate',        # 14
    'vacancies',         # 15
    'company_website',   # 16
    'posted_by',         # 17
]
for col_number, value in enumerate(columns, start=1):
    ws.cell(column=col_number, row=1, value=value)

row_number = 2
for page in range(1, 3):
    website_url = "https://www.example.com/job_search?page=1&txtKeyword=IT&keyword=&txtLocation=Thiruvananthapuram%2C&page=" + str(page)
    res = requests.get(website_url, verify=False)
    soup = BeautifulSoup(res.text, 'lxml')
    links = soup.find_all('div', {'class': 'col-md-6 col-sm-12'})

    data1 = []
    for block in soup.find_all('dl', class_=['description-list'], style=None):
        data1.append([line.strip() for line in block.stripped_strings])
    data2 = []
    for block in soup.find_all('div', class_=['details full-width'], style=None):
        data2.append([line.strip() for line in block.stripped_strings])

    # Skip cards without an <a> tag instead of crashing on None['href'].
    urls = [a['href'] for a in (tag.find('a') for tag in links) if a is not None]

    # IMPORTANT: enumerate restarts y at 0 for every page.  The original
    # code kept incrementing y across pages while data1/data2 were reset
    # per page -- the root cause of the reported IndexError.
    for y, job_url in enumerate(urls):
        res = requests.get(job_url, verify=False)
        detail_soup = BeautifulSoup(res.text, 'lxml')
        data = []
        for block in detail_soup.find_all('div', class_=['content-block-normal', 'details full-width'], style=None):
            data.append([line.strip() for line in block.stripped_strings])

        description = '\n'.join(data[2][1:]) if len(data) > 2 else ''

        row = [
            row_number - 1,          # job_listing_id
            str(uuid.uuid4()),       # unique_hash
            'NEW',                   # status
            _safe_get(data1, y, 5),  # primary_skills
            '',                      # secondary_skills
            _safe_get(data2, y, 0),  # title
            description,             # description
            'NA',                    # job_type
            'NA',                    # source
            _safe_get(data1, y, 1),  # experience
            _safe_get(data, 1, 4),   # location
            _safe_get(data2, y, 1),  # company
            _safe_get(data, 0, 3),   # posted_date
            '',                      # expiryDate
            '',                      # vacancies
            '',                      # company_website
            'example',               # posted_by
        ]
        for col_number, value in enumerate(row, start=1):
            ws.cell(column=col_number, row=row_number, value=value)
        row_number += 1

# Persist the collected rows -- the original script never saved the workbook.
wb.save('jobs.xlsx')
对于你在评论中给出的URL,数据返回为JSON,可以直接访问如下:
import requests

# The endpoint returns JSON directly, so no HTML parsing is needed.
api_url = 'https://booking.snav.it/api/v1/rates/1040/2019-02-25/1042/2019-02-25?lang=1'
response = requests.get(api_url, verify=False)
payload = response.json()
print(payload['data']['itineraryOutward']['description'])
这会将 description
显示为:
NAPOLI BEVERELLO - ISCHIA CASAMICCIOLA
对于您的其他网站,您需要调查尝试使用他们的 API。以下内容应该可以帮助您入门:
import requests
from bs4 import BeautifulSoup
import openpyxl

# Request headers mimicking the browser session used by hirist.com.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'Host' : 'jobseeker-api.hirist.com',
    'Referer' : 'https://www.hirist.com/',
    'Authorization' : 'Bearer undefined',
    'Origin' : 'https://www.hirist.com',
}

# Query-string parameters for the job-feed search endpoint.
params = {
    'pageNo' : '0',
    'query' : 'software Engineer',
    'loc' : '17',
    'minexp' : '0',
    'maxexp' : '0',
    'range' : '0',
    'boost' : '0',
    'searchRange' : '4',
    'searchOp' : 'AND',
    'jobType' : '1',
}

with requests.Session() as session:
    search_url = "https://jobseeker-api.hirist.com/jobfeed/-1/search"
    response = session.get(search_url, params=params, headers=headers)
    payload = response.json()

    # Dump one spreadsheet row per job returned by the API.
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.append(['title', 'min_years', 'max_years'])
    for job in payload['jobs']:
        sheet.append([job['title'], job['min'], job['max']])
    wb.save('output.xlsx')
这段代码可以作为你获取数据的起点。
希望对您有所帮助
从 URL 抓取数据时出现错误:在访问嵌套列表(如 data1[y][5]、data2[y][0])时抛出 IndexError(索引超出范围)异常。
import requests
from bs4 import BeautifulSoup
import openpyxl
import uuid


def _safe_get(rows, i, j, default=''):
    """Return rows[i][j], or *default* when either index is out of range.

    The scraped pages do not always contain every expected block, so the
    nested lists can be shorter than assumed -- this guard prevents the
    original "index out of range" crash.
    """
    try:
        return rows[i][j]
    except IndexError:
        return default


# Create the workbook and write the header row ONCE, before the page loop.
# The original rebuilt the workbook inside the loop, discarding every
# previous page's rows on each iteration (and it never saved the file).
wb = openpyxl.Workbook()
ws = wb.active
columns = [
    'job_listing_id',    # 1
    'unique_hash',       # 2
    'status',            # 3
    'primary_skills',    # 4
    'secondary_skills',  # 5
    'title',             # 6
    'description',       # 7
    'job_type',          # 8
    'source',            # 9
    'experience',        # 10
    'location',          # 11
    'company',           # 12
    'posted_date',       # 13
    'expiryDate',        # 14
    'vacancies',         # 15
    'company_website',   # 16
    'posted_by',         # 17
]
for col_number, value in enumerate(columns, start=1):
    ws.cell(column=col_number, row=1, value=value)

row_number = 2
for page in range(1, 3):
    website_url = "https://www.example.com/job_search?page=1&txtKeyword=IT&keyword=&txtLocation=Thiruvananthapuram%2C&page=" + str(page)
    res = requests.get(website_url, verify=False)
    soup = BeautifulSoup(res.text, 'lxml')
    links = soup.find_all('div', {'class': 'col-md-6 col-sm-12'})

    data1 = []
    for block in soup.find_all('dl', class_=['description-list'], style=None):
        data1.append([line.strip() for line in block.stripped_strings])
    data2 = []
    for block in soup.find_all('div', class_=['details full-width'], style=None):
        data2.append([line.strip() for line in block.stripped_strings])

    # Skip cards without an <a> tag instead of crashing on None['href'].
    urls = [a['href'] for a in (tag.find('a') for tag in links) if a is not None]

    # IMPORTANT: enumerate restarts y at 0 for every page.  The original
    # code kept incrementing y across pages while data1/data2 were reset
    # per page -- the root cause of the reported IndexError.
    for y, job_url in enumerate(urls):
        res = requests.get(job_url, verify=False)
        detail_soup = BeautifulSoup(res.text, 'lxml')
        data = []
        for block in detail_soup.find_all('div', class_=['content-block-normal', 'details full-width'], style=None):
            data.append([line.strip() for line in block.stripped_strings])

        description = '\n'.join(data[2][1:]) if len(data) > 2 else ''

        row = [
            row_number - 1,          # job_listing_id
            str(uuid.uuid4()),       # unique_hash
            'NEW',                   # status
            _safe_get(data1, y, 5),  # primary_skills
            '',                      # secondary_skills
            _safe_get(data2, y, 0),  # title
            description,             # description
            'NA',                    # job_type
            'NA',                    # source
            _safe_get(data1, y, 1),  # experience
            _safe_get(data, 1, 4),   # location
            _safe_get(data2, y, 1),  # company
            _safe_get(data, 0, 3),   # posted_date
            '',                      # expiryDate
            '',                      # vacancies
            '',                      # company_website
            'example',               # posted_by
        ]
        for col_number, value in enumerate(row, start=1):
            ws.cell(column=col_number, row=row_number, value=value)
        row_number += 1

# Persist the collected rows -- the original script never saved the workbook.
wb.save('jobs.xlsx')
对于你在评论中给出的URL,数据返回为JSON,可以直接访问如下:
import requests

# The endpoint returns JSON directly, so no HTML parsing is needed.
api_url = 'https://booking.snav.it/api/v1/rates/1040/2019-02-25/1042/2019-02-25?lang=1'
response = requests.get(api_url, verify=False)
payload = response.json()
print(payload['data']['itineraryOutward']['description'])
这会将 description
显示为:
NAPOLI BEVERELLO - ISCHIA CASAMICCIOLA
对于您的其他网站,您需要调查尝试使用他们的 API。以下内容应该可以帮助您入门:
import requests
from bs4 import BeautifulSoup
import openpyxl

# Request headers mimicking the browser session used by hirist.com.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'Host' : 'jobseeker-api.hirist.com',
    'Referer' : 'https://www.hirist.com/',
    'Authorization' : 'Bearer undefined',
    'Origin' : 'https://www.hirist.com',
}

# Query-string parameters for the job-feed search endpoint.
params = {
    'pageNo' : '0',
    'query' : 'software Engineer',
    'loc' : '17',
    'minexp' : '0',
    'maxexp' : '0',
    'range' : '0',
    'boost' : '0',
    'searchRange' : '4',
    'searchOp' : 'AND',
    'jobType' : '1',
}

with requests.Session() as session:
    search_url = "https://jobseeker-api.hirist.com/jobfeed/-1/search"
    response = session.get(search_url, params=params, headers=headers)
    payload = response.json()

    # Dump one spreadsheet row per job returned by the API.
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.append(['title', 'min_years', 'max_years'])
    for job in payload['jobs']:
        sheet.append([job['title'], job['min'], job['max']])
    wb.save('output.xlsx')
这段代码可以作为你获取数据的起点。
希望对您有所帮助