Python urllib.error.HTTPError: HTTP Error 404: Not Found
I'm trying to run a script that scrapes news text from a website.
Since the site's homepage was reorganized, there seem to be restrictions on scraping.
I keep getting the error below, yet when I test a single article URL on its own, it works.
Any suggestions?
Traceback (most recent call last):
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\bigkinds_headers_v1.3_20170522.py", line 157, in <module>
    result = fetch_news_detail(news['href'])
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\bigkinds_headers_v1.3_20170522.py", line 107, in fetch_news_detail
    res = urlopen(url).read().decode('utf-8', errors='ignore')
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Here is my code:
import urllib.request
from bs4 import BeautifulSoup
import time
import urllib.parse
import json
from urllib.request import urlopen
from urllib.error import URLError  # needed for the except clause below
import random


def fetch_news_list(page, keyword, start, end):
    result = []
    url = "https://www.bigkinds.or.kr/news/newsResult.do"
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13F69 Safari/601.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    param = {
        'pageInfo': 'newsResult',
        'login_chk': 'null',
        'LOGIN_SN': 'null',
        'LOGIN_NAME': 'null',
        'indexName': 'news',
        'keyword': keyword,
        'byLine': '',
        'searchScope': '1',
        'searchFtr': '1',
        'startDate': start,
        'endDate': end,
        'sortMethod': 'date',
        'contentLength': '100',
        'providerCode': '',
        'categoryCode': '',
        'incidentCode': '',
        'dateCode': '',
        'highlighting': '',
        'sessionUSID': '',
        'sessionUUID': 'test',
        'listMode': '',
        'categoryTab': '',
        'newsId': '',
        'filterProviderCode': '',
        'filterCategoryCode': '',
        'filterIncidentCode': '',
        'filterDateCode': '',
        'startNo': page,
        'resultNumber': '100',
        'topmenuoff': '',
        'resultState': '',
        # Note: this value is a plain string literal, so keyword/start/end
        # are NOT substituted into it; it is sent to the server verbatim.
        'keywordJson': '{"searchDetailTxt1":keyword,"agreeDetailTxt1":"","needDetailTxt1":"","exceptDetailTxt1":"","o_id":"option1","startDate":start,"endDate":end,"providerNm":"","categoryNm":"","incidentCategoryNm":"","providerCode":"","categoryCode":"","incidentCategoryCode":"","searchFtr":"1","searchScope":"1","searchKeyword":"keyword"}',
        'keywordFilterJson': '',
        'totalCount': '',
        'interval': '',
        'quotationKeyword1': '',
        'quotationKeyword2': '',
        'quotationKeyword3': '',
        'searchFromUseYN': 'N',
        'mainTodayPersonYn': '',
        'period': '1year'
    }
    # POST the search form to fetch one page of results.
    param = urllib.parse.urlencode(param).encode()
    req = urllib.request.Request(url, param, headers)
    sleepTime = random.randint(4, 10)
    time.sleep(sleepTime)
    print(str(sleepTime) + ' seconds wait.')
    try:
        res = urllib.request.urlopen(req)
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        return result  # bail out instead of reading an undefined response
    html = res.read()
    soup = BeautifulSoup(html, "html.parser")
    div_tags = soup.find_all('div', class_='resTxt')
    for cts in div_tags:
        # The document id is embedded in the h3 element's id attribute.
        ids = cts.find('h3')['id'][5:31]
        title = cts.find('h3', class_='list_newsId').get_text(strip=True)
        href = 'https://www.bigkinds.or.kr/news/detailView.do?docId=' + ids + '&returnCnt=1'
        sets = {
            'title': title,
            'href': href
        }
        result.append(sets)
    return result


def fetch_news_detail(url):
    # The detail endpoint returns JSON; note that no headers are sent here.
    res = urlopen(url).read().decode('utf-8', errors='ignore')
    responseJson = json.loads(res)
    category = responseJson.get("detail").get("CATEGORY_MAIN")
    date = responseJson.get("detail").get("DATE")
    provider = responseJson.get("detail").get("PROVIDER")
    content = responseJson.get("detail").get("CONTENT")
    result = {
        'category': category,
        'date': date,
        'provider': provider,
        'content': content
    }
    return result


keyword = input('(eg., 외국인 NOT(증시 OR 순매수 OR 증권 OR 코스피 OR 코스닥 OR 주식 OR 주가 OR 투타 OR KBO OR 야구 OR KBL OR 농구 OR 축구 OR 올림픽 OR K리그))\n input word: ')
start = input('(eg., 2017-01-01)\n input startday: ')
end = input('(eg., 2017-02-01)\n input endday: ')

page = 1
count = 1
flag = True
f = open('bigkinds.txt', 'w', encoding='utf-8')
while True:
    if not flag:
        break
    news_list = fetch_news_list(page, keyword, start, end)
    sleepTime = random.randint(3, 8)
    time.sleep(sleepTime)
    print(str(sleepTime) + ' seconds wait.')
    for news in news_list:
        result = fetch_news_detail(news['href'])
        result['title'] = news['title']
        f.write('==' * 40 + '\n')
        f.write('category: ' + result['category'] + '\n')
        f.write('title: ' + result['title'] + '\n')
        f.write('date: ' + result['date'] + '\n')
        f.write('provider: ' + result['provider'] + '\n')
        f.write('content: ' + result['content'] + '\n')
        f.write('==' * 40 + '\n')
        count += 1
        if count >= 5002:  # stop after roughly 5000 articles
            flag = False
            break
        sleepTime = random.randint(2, 10)
        time.sleep(sleepTime)
        print(str(sleepTime) + ' seconds wait.')
    page += 1
f.close()
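For reference, when I say a single URL works, I mean a quick manual test like this (the docId is a placeholder; I paste one taken from an actual result page):

from urllib.request import urlopen

# Placeholder docId, copied by hand from a result page.
test_url = ('https://www.bigkinds.or.kr/news/detailView.do'
            '?docId=<paste-a-docId-here>&returnCnt=1')
print(urlopen(test_url).read()[:300])  # succeeds when run on its own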
I ran into this kind of problem when scraping data from Yelp. These restrictions are hard to get around. I'd suggest trying the following:
- Change your User-Agent. The current one appears to be for an iPhone; use a valid desktop one (see the sketch after this list).
- If that works but you still get blocked after a certain number of page requests, look into StarCluster. You'll find a draft of the code under "4/13/2017 (UTEP CoBA 310 – CALC LAB #2)" at https://yangwangteaching.wordpress.com/data-science-meetup/
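For the first point, here is a minimal sketch of what that could look like, applied to fetch_news_detail, which currently sends no headers at all (a bare urlopen(url) advertises urllib's default "Python-urllib/3.x" agent, which sites often reject). The User-Agent string below is just one example of a desktop browser string, not anything specific to this site:

import json
import urllib.request

# Example desktop User-Agent (Chrome on Windows); any current desktop UA should do.
DESKTOP_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/58.0.3029.110 Safari/537.36'),
}

def fetch_news_detail(url):
    # Build a Request object so the headers are actually sent with the call.
    req = urllib.request.Request(url, headers=DESKTOP_HEADERS)
    res = urllib.request.urlopen(req).read().decode('utf-8', errors='ignore')
    detail = json.loads(res).get('detail', {})
    return {
        'category': detail.get('CATEGORY_MAIN'),
        'date': detail.get('DATE'),
        'provider': detail.get('PROVIDER'),
        'content': detail.get('CONTENT'),
    }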