在 Python 中使用 mnist 库
Using mnist library in Python
我需要使用 mnist
Python 库来下载和读取 MNIST 数据 (PyPi, Github):
import mnist
mnist_dataset = mnist.test_images().astype(np.float32)
在我的大学集群上,我加载数据没有问题。然而,在我的本地电脑上,我得到:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-24-78f59b728818> in <module>
8 DATASET_SIZE = 512
9 DIGIT_SIZE = 28
---> 10 mnist_dataset = mnist.test_images().astype(np.float32)
11 np.random.shuffle(mnist_dataset)
12 mnist_dataset = np.reshape(mnist_dataset[:DATASET_SIZE] / 255.0, newshape=(DATASET_SIZE, DIGIT_SIZE*DIGIT_SIZE))
~\anaconda3\lib\site-packages\mnist\__init__.py in test_images()
174 columns of the image
175 """
--> 176 return download_and_parse_mnist_file('t10k-images-idx3-ubyte.gz')
177
178
~\anaconda3\lib\site-packages\mnist\__init__.py in download_and_parse_mnist_file(fname, target_dir, force)
141 Numpy array with the dimensions and the data in the IDX file
142 """
--> 143 fname = download_file(fname, target_dir=target_dir, force=force)
144 fopen = gzip.open if os.path.splitext(fname)[1] == '.gz' else open
145 with fopen(fname, 'rb') as fd:
~\anaconda3\lib\site-packages\mnist\__init__.py in download_file(fname, target_dir, force)
57 if force or not os.path.isfile(target_fname):
58 url = urljoin(datasets_url, fname)
---> 59 urlretrieve(url, target_fname)
60
61 return target_fname
~\anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
我已经通过 inspect
模块检查了功能。本地版和集群版调用的HTTP地址相同,即http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz。我可以毫无问题地从我的网络浏览器下载它。
我可以在 Python 中对此做些什么?
这个模块很旧并且存档了。
我预计它可能会使用带有新安全系统的新服务器,并且代码可能需要一些设置 - 例如 header User-Agent
- 才能正确访问数据。
根据@wwii 评论的建议,我下载了源代码并添加了 User-Agent
现在我可以下载图片了
mnist/__init__.py
try:
#from urllib.request import urlretrieve # before
from urllib.request import urlretrieve, URLopener # after
except ImportError:
#from urllib import urlretrieve # py2 # before
from urllib import urlretrieve, URLopener # py2 # after
# ... code ...
def download_file(fname, target_dir=None, force=False):
# ... code ...
if force or not os.path.isfile(target_fname):
url = urljoin(datasets_url, fname)
# before
#urlretrieve(url, target_fname)
# after
opener = URLopener()
opener.addheader('User-Agent', "Mozilla/5.0")
opener.retrieve(url, target_fname)
测试代码:
import mnist
import numpy as np
print(mnist.__file__) # to see if I uses local version with changes
print(mnist.datasets_url)
print(mnist.temporary_dir()) # to see where it is downloaded
mnist_dataset = mnist.test_images().astype(np.float32)
print(mnist_dataset)
仅在 Python 3.8 上测试过;Python 2.x 可能需要另行调整。
另一个解决方案是自己动手下载文件。使用我在这里找到的函数,以及 mnist 网站上的文件 URL……
import requests, gzip
# (source URL, local file name) pairs for the four MNIST archives.
urls = [
    (r'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'training_images.gz'),
    (r'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 'training_labels.gz'),
    (r'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'test_images.gz'),
    (r'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 'test_labels.gz'),
]


def download_url(url, save_path, chunk_size=128):
    """Stream the resource at ``url`` into the file ``save_path``.

    Args:
        url: Address of the file to download.
        save_path: Local path the downloaded bytes are written to.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The ``with`` block releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        # Fail loudly on e.g. 403 Forbidden instead of saving the server's
        # error page under a ``.gz`` file name.
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)


for url, path in urls:
    download_url(url, path)
效果很好,它只是将 gzip 压缩文件下载到当前工作目录。您仍然需要自行解压它们。
使用其他答案,我已经能够构建一个允许直接使用包的解决方案。
以下代码必须执行一次并在全局范围内工作:
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
有了这个,所有 mnist 功能都可以使用,文件将根据需要下载。
在调用 mnist_dataset = mnist.test_images().astype(np.float32) 之前,我直接在 Jupyter Notebook 的一个单元格中执行了上面的代码。
我需要使用 mnist
Python 库来下载和读取 MNIST 数据 (PyPi, Github):
import mnist
mnist_dataset = mnist.test_images().astype(np.float32)
在我的大学集群上,我加载数据没有问题。然而,在我的本地电脑上,我得到:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-24-78f59b728818> in <module>
8 DATASET_SIZE = 512
9 DIGIT_SIZE = 28
---> 10 mnist_dataset = mnist.test_images().astype(np.float32)
11 np.random.shuffle(mnist_dataset)
12 mnist_dataset = np.reshape(mnist_dataset[:DATASET_SIZE] / 255.0, newshape=(DATASET_SIZE, DIGIT_SIZE*DIGIT_SIZE))
~\anaconda3\lib\site-packages\mnist\__init__.py in test_images()
174 columns of the image
175 """
--> 176 return download_and_parse_mnist_file('t10k-images-idx3-ubyte.gz')
177
178
~\anaconda3\lib\site-packages\mnist\__init__.py in download_and_parse_mnist_file(fname, target_dir, force)
141 Numpy array with the dimensions and the data in the IDX file
142 """
--> 143 fname = download_file(fname, target_dir=target_dir, force=force)
144 fopen = gzip.open if os.path.splitext(fname)[1] == '.gz' else open
145 with fopen(fname, 'rb') as fd:
~\anaconda3\lib\site-packages\mnist\__init__.py in download_file(fname, target_dir, force)
57 if force or not os.path.isfile(target_fname):
58 url = urljoin(datasets_url, fname)
---> 59 urlretrieve(url, target_fname)
60
61 return target_fname
~\anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
我已经通过 inspect
模块检查了功能。本地版和集群版调用的HTTP地址相同,即http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz。我可以毫无问题地从我的网络浏览器下载它。
我可以在 Python 中对此做些什么?
这个模块很旧并且存档了。
我预计它可能会使用带有新安全系统的新服务器,并且代码可能需要一些设置 - 例如 header User-Agent
- 才能正确访问数据。
根据@wwii 评论的建议,我下载了源代码并添加了 User-Agent
现在我可以下载图片了
mnist/__init__.py
try:
#from urllib.request import urlretrieve # before
from urllib.request import urlretrieve, URLopener # after
except ImportError:
#from urllib import urlretrieve # py2 # before
from urllib import urlretrieve, URLopener # py2 # after
# ... code ...
def download_file(fname, target_dir=None, force=False):
# ... code ...
if force or not os.path.isfile(target_fname):
url = urljoin(datasets_url, fname)
# before
#urlretrieve(url, target_fname)
# after
opener = URLopener()
opener.addheader('User-Agent', "Mozilla/5.0")
opener.retrieve(url, target_fname)
测试代码:
import mnist
import numpy as np
print(mnist.__file__) # to see if I uses local version with changes
print(mnist.datasets_url)
print(mnist.temporary_dir()) # to see where it is downloaded
mnist_dataset = mnist.test_images().astype(np.float32)
print(mnist_dataset)
仅在 Python 3.8 上测试过;Python 2.x 可能需要另行调整。
另一个解决方案是自己动手下载文件。使用我在这里找到的函数,以及 mnist 网站上的文件 URL……
import requests, gzip
# (source URL, local file name) pairs for the four MNIST archives.
urls = [
    (r'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'training_images.gz'),
    (r'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 'training_labels.gz'),
    (r'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'test_images.gz'),
    (r'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 'test_labels.gz'),
]


def download_url(url, save_path, chunk_size=128):
    """Stream the resource at ``url`` into the file ``save_path``.

    Args:
        url: Address of the file to download.
        save_path: Local path the downloaded bytes are written to.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The ``with`` block releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        # Fail loudly on e.g. 403 Forbidden instead of saving the server's
        # error page under a ``.gz`` file name.
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)


for url, path in urls:
    download_url(url, path)
效果很好,它只是将 gzip 压缩文件下载到当前工作目录。您仍然需要自行解压它们。
使用其他答案,我已经能够构建一个允许直接使用包的解决方案。
以下代码必须执行一次并在全局范围内工作:
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
有了这个,所有 mnist
功能都可以使用,文件将根据需要下载。在调用 mnist_dataset = mnist.test_images().astype(np.float32)
.