Python:从列表中的 url 下载 pdf 时出现 unicode 错误
Python: unicode error when downloading a pdf from an url in a list
我最近开始了我的 Python 旅程,Stack Overflow 帮助我解决了我遇到的大部分问题。然而,尽管尝试了此处建议的不同解决方案,我似乎还是无法解决这个问题。
我正在从列表中的网站收集 url。我的下一步是浏览 url 并下载它们(如果文件夹中尚不存在)。但是,某些 URL 包含非 ASCII 字符,例如 ú、é、ç。这导致下面的 unicode 错误。
UnicodeEncodeError: 'ascii' 编解码器无法对位置 64 中的字符 '\xfa' 进行编码:序号不在范围内 (128)
我现在用 try/except 逃脱了它,但需要手动下载它们。
当我使用 .encode('utf-8') 时,它也会导致错误:“TypeError: cannot use a string pattern on a bytes-like object”。
这是我的代码:
import os
import urllib
dict = (this includes a large dictionary scraped from a website)
links = []
for d in dict :
links.append(d["EncodedAbsUrl"])
# For every line in the file
for url in links:
# Split on the rightmost / and take everything on the right side of that
name = url.rsplit('/', 1)[-1]
# Combine the name and the downloads directory to get the local filename
filename = os.path.join(r'C:\PATH', name)
# Download the file if it does not exist
if not os.path.isfile(filename):
try:
urllib.request.urlretrieve(url, filename)
except UnicodeEncodeError:
print(filename + " could not be saved.")
pass
else:
print(filename + " already exists.")
编辑
根据 Ardiya 在评论中的建议(非常感谢),我已改用 urllib.parse.quote_plus 方法。这似乎有效,但同时返回了一个 HTTP 400 错误。修改后的代码如下:
for url in links:
# Split on the rightmost / and take everything on the right side of that
name = url.rsplit('/', 1)[-1]
# Combine the name and the downloads directory to get the local filename
filename = os.path.join(r'C:\PATH', name)
# Download the file if it does not exist
if not os.path.isfile(filename):
try:
urllib.request.urlretrieve(url, filename)
except UnicodeEncodeError:
new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
urllib.request.urlretrieve(new_url, filename)
else:
print(filename + " already exists.")
例如,下面的 link 在源字典中: https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Perú%20castellano.pdf
被翻译成 https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%2520Per%C3%BA%2520castellano.pdf
,这不能正常工作。
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
25 try:
---> 26 urllib.request.urlretrieve(url, filename)
27 except UnicodeEncodeError:
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525 response = self._open(req, data)
526
~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
541 protocol = req.type
--> 542 result = self._call_chain(self.handle_open, protocol, protocol +
543 '_open', req)
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
~\Anaconda3\lib\urllib\request.py in https_open(self, req)
1392 def https_open(self, req):
-> 1393 return self.do_open(http.client.HTTPSConnection, req,
1394 context=self._context, check_hostname=self._check_hostname)
~\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1349 try:
-> 1350 h.request(req.get_method(), req.selector, req.data, headers,
1351 encode_chunked=req.has_header('Transfer-encoding'))
~\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
1254 """Send a complete request to the server."""
-> 1255 self._send_request(method, url, body, headers, encode_chunked)
1256
~\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
1265
-> 1266 self.putrequest(method, url, **skips)
1267
~\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
1103
-> 1104 self._output(self._encode_request(request))
1105
~\Anaconda3\lib\http\client.py in _encode_request(self, request)
1183 # ASCII also helps prevent CVE-2019-9740.
-> 1184 return request.encode('ascii')
1185
UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
27 except UnicodeEncodeError:
28 new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
---> 29 urllib.request.urlretrieve(new_url, filename)
30 else:
31 print(filename + " already exists.")
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 400: Bad Request
您的 URL 只进行了部分编码。请尝试先把每个 %20
替换为真正的空格字符,然后再对整个 URL 重新进行编码。
# Answer snippet (fragment: indentation was lost in the page paste and the
# closing `except` clause is not shown; `url` and `filename` come from the
# asker's surrounding loop).
if not os.path.isfile(filename):
try:
# Split off the last path segment — the part holding the file name.
head, tail = url.rsplit('/', 1)
# Turn the partially-encoded name back into literal spaces, then URL-encode
# the whole segment once (space -> %20, ú -> %C3%BA) to avoid double-encoding.
url = '%s/%s' % (head, urllib.parse.quote(tail.replace('%20', ' ')))
urllib.request.urlretrieve(url, filename)
urllib.parse.quote
和 urllib.parse.quote_plus
的区别是:后者会把空格替换成 +
,而前者会把空格编码为 %20
。
我最近开始了我的 Python 旅程,Stack Overflow 帮助我解决了我遇到的大部分问题。然而,尽管尝试了此处建议的不同解决方案,我似乎还是无法解决这个问题。
我正在从列表中的网站收集 url。我的下一步是浏览 url 并下载它们(如果文件夹中尚不存在)。但是,某些 URL 包含非 ASCII 字符,例如 ú、é、ç。这导致下面的 unicode 错误。
UnicodeEncodeError: 'ascii' 编解码器无法对位置 64 中的字符 '\xfa' 进行编码:序号不在范围内 (128)
我现在用 try/except 逃脱了它,但需要手动下载它们。
当我使用 .encode('utf-8') 时,它也会导致错误:“TypeError: cannot use a string pattern on a bytes-like object”。
这是我的代码:
import os
import urllib
dict = (this includes a large dictionary scraped from a website)
links = []
for d in dict :
links.append(d["EncodedAbsUrl"])
# For every line in the file
for url in links:
# Split on the rightmost / and take everything on the right side of that
name = url.rsplit('/', 1)[-1]
# Combine the name and the downloads directory to get the local filename
filename = os.path.join(r'C:\PATH', name)
# Download the file if it does not exist
if not os.path.isfile(filename):
try:
urllib.request.urlretrieve(url, filename)
except UnicodeEncodeError:
print(filename + " could not be saved.")
pass
else:
print(filename + " already exists.")
编辑
根据 Ardiya 在评论中的建议(非常感谢),我已改用 urllib.parse.quote_plus 方法。这似乎有效,但同时返回了一个 HTTP 400 错误。修改后的代码如下:
for url in links:
# Split on the rightmost / and take everything on the right side of that
name = url.rsplit('/', 1)[-1]
# Combine the name and the downloads directory to get the local filename
filename = os.path.join(r'C:\PATH', name)
# Download the file if it does not exist
if not os.path.isfile(filename):
try:
urllib.request.urlretrieve(url, filename)
except UnicodeEncodeError:
new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
urllib.request.urlretrieve(new_url, filename)
else:
print(filename + " already exists.")
例如,下面的 link 在源字典中: https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Perú%20castellano.pdf
被翻译成 https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%2520Per%C3%BA%2520castellano.pdf
,这不能正常工作。
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
25 try:
---> 26 urllib.request.urlretrieve(url, filename)
27 except UnicodeEncodeError:
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525 response = self._open(req, data)
526
~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
541 protocol = req.type
--> 542 result = self._call_chain(self.handle_open, protocol, protocol +
543 '_open', req)
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
~\Anaconda3\lib\urllib\request.py in https_open(self, req)
1392 def https_open(self, req):
-> 1393 return self.do_open(http.client.HTTPSConnection, req,
1394 context=self._context, check_hostname=self._check_hostname)
~\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1349 try:
-> 1350 h.request(req.get_method(), req.selector, req.data, headers,
1351 encode_chunked=req.has_header('Transfer-encoding'))
~\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
1254 """Send a complete request to the server."""
-> 1255 self._send_request(method, url, body, headers, encode_chunked)
1256
~\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
1265
-> 1266 self.putrequest(method, url, **skips)
1267
~\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
1103
-> 1104 self._output(self._encode_request(request))
1105
~\Anaconda3\lib\http\client.py in _encode_request(self, request)
1183 # ASCII also helps prevent CVE-2019-9740.
-> 1184 return request.encode('ascii')
1185
UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
27 except UnicodeEncodeError:
28 new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
---> 29 urllib.request.urlretrieve(new_url, filename)
30 else:
31 print(filename + " already exists.")
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 400: Bad Request
您的 URL 只进行了部分编码。请尝试先把每个 %20
替换为真正的空格字符,然后再对整个 URL 重新进行编码。
# Answer snippet (fragment: indentation was lost in the page paste and the
# closing `except` clause is not shown; `url` and `filename` come from the
# asker's surrounding loop).
if not os.path.isfile(filename):
try:
# Split off the last path segment — the part holding the file name.
head, tail = url.rsplit('/', 1)
# Turn the partially-encoded name back into literal spaces, then URL-encode
# the whole segment once (space -> %20, ú -> %C3%BA) to avoid double-encoding.
url = '%s/%s' % (head, urllib.parse.quote(tail.replace('%20', ' ')))
urllib.request.urlretrieve(url, filename)
urllib.parse.quote
和 urllib.parse.quote_plus
的区别是:后者会把空格替换成 +
,而前者会把空格编码为 %20
。