Python: unicode error when downloading a PDF from a URL in a list

I recently started my Python journey, and Stack Overflow has helped me solve most of the problems I've run into. However, despite trying the different solutions suggested here, I can't seem to crack this one.

I'm collecting URLs from a website into a list. My next step is to loop through the URLs and download each file (if it doesn't already exist in the folder). However, some URLs contain non-ASCII characters such as ú, é, ç. This results in the Unicode error below.

UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)

For now I work around it with try/except, but then I have to download those files manually.

When I use .encode('utf-8'), it also results in an error: "TypeError: cannot use a string pattern on a bytes-like object".
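As far as I understand, urllib.request only works with str URLs, so encoding the URL to bytes makes its internal regular expressions fail with exactly that TypeError. A minimal sketch (with a made-up URL) of what I mean:

import urllib.request

url = 'https://example.com/files/iNDC Perú castellano.pdf'  # made-up URL for illustration
urllib.request.urlretrieve(url.encode('utf-8'), 'test.pdf')
# TypeError: cannot use a string pattern on a bytes-like object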

Here is my code:

import os
import urllib.request

dict = ...  # this includes a large dictionary scraped from a website

links = []

for d in dict:
    links.append(d["EncodedAbsUrl"])

# For every URL in the list
for url in links:
    # Split on the rightmost / and take everything on the right side of that
    name = url.rsplit('/', 1)[-1]

    # Combine the name and the downloads directory to get the local filename
    filename = os.path.join(r'C:\PATH', name)

    # Download the file if it does not exist
    if not os.path.isfile(filename):
        try:
            urllib.request.urlretrieve(url, filename)
        except UnicodeEncodeError:
            print(filename + " could not be saved.")
    else:
        print(filename + " already exists.")

Edit

Following Ardiya's suggestion in the comments (thanks a million), I have switched to the urllib.parse.quote_plus method. This seems to work, but it also returns an HTTP Error 400. The modified code now reads:

for url in links:
    # Split on the rightmost / and take everything on the right side of that
    name = url.rsplit('/', 1)[-1]

    # Combine the name and the downloads directory to get the local filename
    filename = os.path.join(r'C:\PATH', name)

    # Download the file if it does not exist
    if not os.path.isfile(filename):
        try:
            urllib.request.urlretrieve(url, filename)
        except UnicodeEncodeError:
            # rebuild the URL with the file name quoted (root is defined elsewhere)
            new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
            urllib.request.urlretrieve(new_url, filename)
    else:
        print(filename + " already exists.")

For example, the following link is in the source dictionary: https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Perú%20castellano.pdf. It gets translated into https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%2520Per%C3%BA%2520castellano.pdf, which does not work.
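The double encoding is easy to reproduce on its own: quote_plus sees the % in the already-encoded file name and escapes it again as %25.

import urllib.parse

name = 'iNDC%20Perú%20castellano.pdf'  # file name part of the URL, already percent-encoded
print(urllib.parse.quote_plus(name))
# iNDC%2520Per%C3%BA%2520castellano.pdf  <- the existing %20 turns into %2520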

---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
     25         try:
---> 26             urllib.request.urlretrieve(url, filename)
     27         except UnicodeEncodeError:

~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    246 
--> 247     with contextlib.closing(urlopen(url, data)) as fp:
    248         headers = fp.info()

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    524         sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525         response = self._open(req, data)
    526 

~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
    541         protocol = req.type
--> 542         result = self._call_chain(self.handle_open, protocol, protocol +
    543                                   '_open', req)

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:

~\Anaconda3\lib\urllib\request.py in https_open(self, req)
   1392         def https_open(self, req):
-> 1393             return self.do_open(http.client.HTTPSConnection, req,
   1394                 context=self._context, check_hostname=self._check_hostname)

~\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
   1349             try:
-> 1350                 h.request(req.get_method(), req.selector, req.data, headers,
   1351                           encode_chunked=req.has_header('Transfer-encoding'))

~\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
   1254         """Send a complete request to the server."""
-> 1255         self._send_request(method, url, body, headers, encode_chunked)
   1256 

~\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1265 
-> 1266         self.putrequest(method, url, **skips)
   1267 

~\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
   1103 
-> 1104         self._output(self._encode_request(request))
   1105 

~\Anaconda3\lib\http\client.py in _encode_request(self, request)
   1183         # ASCII also helps prevent CVE-2019-9740.
-> 1184         return request.encode('ascii')
   1185 

UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)

During handling of the above exception, another exception occurred:

HTTPError                                 Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
     27         except UnicodeEncodeError:
     28             new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
---> 29             urllib.request.urlretrieve(new_url, filename)
     30     else:
     31             print(filename + " already exists.")

~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    245     url_type, path = _splittype(url)
    246 
--> 247     with contextlib.closing(urlopen(url, data)) as fp:
    248         headers = fp.info()
    249 

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 400: Bad Request

Your URL is only partially encoded. Try replacing every %20 with a literal space and then URL-encoding the whole thing.

    if not os.path.isfile(filename):
        try:
            head, tail = url.rsplit('/', 1)
            # decode the %20s to real spaces, then URL-encode the whole file name
            url = '%s/%s' % (head, urllib.parse.quote(tail.replace('%20', ' ')))
            urllib.request.urlretrieve(url, filename)
        except UnicodeEncodeError:
            print(filename + " could not be saved.")
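As a rough check, applied to the example URL from the question this gives a fully ASCII URL (output shown as a comment). Only the file name contains non-ASCII characters, so re-encoding just the tail is enough here:

import urllib.parse

url = 'https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Perú%20castellano.pdf'
head, tail = url.rsplit('/', 1)
print('%s/%s' % (head, urllib.parse.quote(tail.replace('%20', ' '))))
# https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Per%C3%BA%20castellano.pdf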

The difference between urllib.parse.quote and urllib.parse.quote_plus is that the latter replaces spaces with +, whereas the former URL-encodes them as %20.
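Quick comparison, using the file name from the example:

import urllib.parse

name = 'iNDC Perú castellano.pdf'
print(urllib.parse.quote(name))       # iNDC%20Per%C3%BA%20castellano.pdf
print(urllib.parse.quote_plus(name))  # iNDC+Per%C3%BA+castellano.pdf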