为什么 python tarfile gz 没有减少文件大小
why python tarfile gz is not reducing filesize
因此,我试图将 3 个文本文件每个 10MB 压缩为一个文件 tar.gz,但它似乎并没有减少最终的 tar.gz。最终 tar.gz 文件大小仍为 30MB。
谁能告诉我为什么会这样?我有最高级别的压缩
>>> import os
>>> import sys
>>> import tarfile
>>> import tempfile
>>> size_in_mb = 10
>>>
>>> def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):
... ''' compress string contents in files and tar. finally creates a tar file in tmppath
... @param tmppath: (str) pathdirectory where temp files to be compressed will be created
... @param files_str: (dict) {filename: filecontent_in_str} these will be compressed
... @param tarfileprefix: (str) output filename (without suffix) of tar
... @param tarmode: (str) w:gz or w:bz2
... '''
... tar = tarfile.open(os.path.join(tmppath, tarfileprefix+'.tar.'+tarmode.split(':')[1]), tarmode, compresslevel=9)
... for filename in files_str:
... with open(os.path.join(tmppath, filename), 'wb') as tmpf:
... tmpf.write(files_str[filename])
... tar.add(os.path.join(tmppath, filename), arcname=filename)
... tar.close()
...
...
>>> mail_size = 0
>>> files_str = {}
>>> for i in range(3):
... d = os.urandom(1*size_in_mb*(10**6))
... files_str['attachment'+str(i)+'.txt'] = d
... mail_size += sys.getsizeof(d)
...
...
>>>
>>> print('mail_size', float(mail_size)/10**6)
('mail_size', 30.000111)
>>>
>>> tmppath = tempfile.mkdtemp()
>>> print('tar-tmppath', tmppath)
('tar-tmppath', '/tmp/tmpndifyt')
>>> tarfileprefix = 'tmpfoobar'
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6)
('mail_size', 30.009782)
>>>
>>>
>>>
您正在尝试压缩 os.urandom
生成的一些数据,这些数据是 随机 。
如果随机函数很好,随机数据压缩得非常糟糕。
压缩的原理是识别重复模式。随机算法越好,您发现的重复模式就越少。
我建议您尝试使用 真实 文件,或从给定的单词列表(不是随机字母)生成的随机文本,这样压缩效果会好得多。
因此,正如@Jean 所说,我能够将 3 个具有相同重复字符的 10MB 文件压缩到 0.02MB => d = ('1'*size_in_mb*10**6)
import os
import sys
import tarfile
import tempfile
size_in_mb = 10
def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):
    ''' compress string contents in files and tar. finally creates a tar file in tmppath
    @param tmppath: (str) path of directory where temp files to be compressed will be created
    @param files_str: (dict) {filename: filecontent as str or bytes} these will be compressed
    @param tarfileprefix: (str) output filename (without suffix) of tar
    @param tarmode: (str) w:gz or w:bz2
    '''
    suffix = tarmode.split(':')[1]
    tarpath = os.path.join(tmppath, tarfileprefix + '.tar.' + suffix)
    # Context manager guarantees the archive is finalized/closed even if
    # writing a member file raises (original leaked the tar handle on error).
    with tarfile.open(tarpath, tarmode, compresslevel=9) as tar:
        for filename, content in files_str.items():
            # The temp file is opened in binary mode, so str content must be
            # encoded first (writing str to a 'wb' file fails on Python 3).
            if isinstance(content, str):
                content = content.encode('utf-8')
            member_path = os.path.join(tmppath, filename)
            with open(member_path, 'wb') as tmpf:
                tmpf.write(content)
            tar.add(member_path, arcname=filename)
# Build three 10 MB attachments of a single repeated character -- highly
# compressible, unlike the os.urandom data in the question.
mail_size = 0
files_str = {}
for i in range(3):
    d = '1' * size_in_mb * 10**6
    files_str['attachment' + str(i) + '.txt'] = d
    mail_size += sys.getsizeof(d)
print('mail_size', float(mail_size) / 10**6)
tmppath = tempfile.mkdtemp()
print('tar-tmppath', tmppath)
tarfileprefix = 'tmpfoobar'
compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
# os.path.getsize avoids reading the whole archive into memory and avoids the
# original text-mode open() of binary gzip data, which both fails to decode on
# Python 3 and leaked the file handle.
print('mail_size', float(os.path.getsize(os.path.join(tmppath, tarfileprefix + '.tar.gz'))) / 10**6)
>>> print('mail_size', float(mail_size)/10**6)
('mail_size', 30.000111)
>>>
>>> tmppath = tempfile.mkdtemp()
>>> print('tar-tmppath', tmppath)
('tar-tmppath', '/tmp/tmpA3r51N')
>>> tarfileprefix = 'tmpfoobar'
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
>>>
>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6)
('mail_size', 0.02958)
>>>
>>>
>>>
因此,我试图将 3 个文本文件每个 10MB 压缩为一个文件 tar.gz,但它似乎并没有减少最终的 tar.gz。最终 tar.gz 文件大小仍为 30MB。
谁能告诉我为什么会这样?我有最高级别的压缩
>>> import os
>>> import sys
>>> import tarfile
>>> import tempfile
>>> size_in_mb = 10
>>>
>>> def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):
... ''' compress string contents in files and tar. finally creates a tar file in tmppath
... @param tmppath: (str) pathdirectory where temp files to be compressed will be created
... @param files_str: (dict) {filename: filecontent_in_str} these will be compressed
... @param tarfileprefix: (str) output filename (without suffix) of tar
... @param tarmode: (str) w:gz or w:bz2
... '''
... tar = tarfile.open(os.path.join(tmppath, tarfileprefix+'.tar.'+tarmode.split(':')[1]), tarmode, compresslevel=9)
... for filename in files_str:
... with open(os.path.join(tmppath, filename), 'wb') as tmpf:
... tmpf.write(files_str[filename])
... tar.add(os.path.join(tmppath, filename), arcname=filename)
... tar.close()
...
...
>>> mail_size = 0
>>> files_str = {}
>>> for i in range(3):
... d = os.urandom(1*size_in_mb*(10**6))
... files_str['attachment'+str(i)+'.txt'] = d
... mail_size += sys.getsizeof(d)
...
...
>>>
>>> print('mail_size', float(mail_size)/10**6)
('mail_size', 30.000111)
>>>
>>> tmppath = tempfile.mkdtemp()
>>> print('tar-tmppath', tmppath)
('tar-tmppath', '/tmp/tmpndifyt')
>>> tarfileprefix = 'tmpfoobar'
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6)
('mail_size', 30.009782)
>>>
>>>
>>>
您正在尝试压缩 os.urandom
生成的一些数据,这些数据是 随机 。
如果随机函数很好,随机数据压缩得非常糟糕。
压缩的原理是识别重复模式。随机算法越好,您发现的重复模式就越少。
我建议您尝试使用 真实 文件,或从给定的单词列表(不是随机字母)生成的随机文本,这样压缩效果会好得多。
因此,正如@Jean 所说,我能够将 3 个具有相同重复字符的 10MB 文件压缩到 0.02MB => d = ('1'*size_in_mb*10**6)
import os
import sys
import tarfile
import tempfile
size_in_mb = 10
def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):
    ''' compress string contents in files and tar. finally creates a tar file in tmppath
    @param tmppath: (str) path of directory where temp files to be compressed will be created
    @param files_str: (dict) {filename: filecontent as str or bytes} these will be compressed
    @param tarfileprefix: (str) output filename (without suffix) of tar
    @param tarmode: (str) w:gz or w:bz2
    '''
    suffix = tarmode.split(':')[1]
    tarpath = os.path.join(tmppath, tarfileprefix + '.tar.' + suffix)
    # Context manager guarantees the archive is finalized/closed even if
    # writing a member file raises (original leaked the tar handle on error).
    with tarfile.open(tarpath, tarmode, compresslevel=9) as tar:
        for filename, content in files_str.items():
            # The temp file is opened in binary mode, so str content must be
            # encoded first (writing str to a 'wb' file fails on Python 3).
            if isinstance(content, str):
                content = content.encode('utf-8')
            member_path = os.path.join(tmppath, filename)
            with open(member_path, 'wb') as tmpf:
                tmpf.write(content)
            tar.add(member_path, arcname=filename)
# Build three 10 MB attachments of a single repeated character -- highly
# compressible, unlike the os.urandom data in the question.
mail_size = 0
files_str = {}
for i in range(3):
    d = '1' * size_in_mb * 10**6
    files_str['attachment' + str(i) + '.txt'] = d
    mail_size += sys.getsizeof(d)
print('mail_size', float(mail_size) / 10**6)
tmppath = tempfile.mkdtemp()
print('tar-tmppath', tmppath)
tarfileprefix = 'tmpfoobar'
compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
# os.path.getsize avoids reading the whole archive into memory and avoids the
# original text-mode open() of binary gzip data, which both fails to decode on
# Python 3 and leaked the file handle.
print('mail_size', float(os.path.getsize(os.path.join(tmppath, tarfileprefix + '.tar.gz'))) / 10**6)
>>> print('mail_size', float(mail_size)/10**6)
('mail_size', 30.000111)
>>>
>>> tmppath = tempfile.mkdtemp()
>>> print('tar-tmppath', tmppath)
('tar-tmppath', '/tmp/tmpA3r51N')
>>> tarfileprefix = 'tmpfoobar'
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
>>>
>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6)
('mail_size', 0.02958)
>>>
>>>
>>>