在 Python 中是否有更有效的转换 tif 文件的方法?
Is there a more efficient way of converting tif files in Python?
在这里,我使用 gdal
将 tif 文件转换为 Stata 数据集。为简单起见，我将链接最低分辨率的文件（如果这一点重要的话，我使用的是 2.5m 分辨率的文件）。
这是我的代码:
### Be sure to copy the file I've
### linked to your current directory
from osgeo import gdal
import pandas as pd, os, glob
# Convert every GeoTIFF in the current directory to a Stata dataset:
# TIF -> XYZ text dump (via GDAL) -> pandas DataFrame -> .dta,
# then delete the intermediate .xyz and the source .tif.
for file in glob.glob("*.tif"):
    ds = gdal.Open(file)  # open the TIF as a GDAL dataset
    xyz_path = file + ".xyz"
    # The XYZ driver writes one "x y value" row per pixel, which
    # pandas can then read as a plain space-separated table.
    xyz = gdal.Translate(xyz_path, ds)
    xyz = None  # close the XYZ file so pandas sees complete data
    ds = None   # release the GDAL dataset handle
    df = pd.read_csv(xyz_path, sep=" ", header=None)
    df.columns = ["_CX", "_CY", "tmin"]
    df.to_stata(file + ".dta", write_index=False)
    # Remove only this iteration's intermediate. The original globbed
    # every *.xyz in the directory, which also deletes unrelated files.
    os.remove(xyz_path)
    os.remove(file)  # drop the source TIF once converted
代码完成了我需要它做的所有事情——但在更高的分辨率下，它需要大约一个半小时，所以我有兴趣对其进行优化。有什么方法可以让它运行得更快？
您可以使用 multiprocessing
模块将其分布在多个内核上。这并不总能加快速度,如果您最终过度提交内存,实际上会使事情变得更糟。但值得一试。
from osgeo import gdal
import pandas as pd, os, glob
import multiprocessing as mp
def gdal_izer(file):
    """Convert one GeoTIFF to a Stata .dta file.

    Pipeline: TIF -> XYZ text dump (via GDAL) -> pandas DataFrame
    -> .dta. The intermediate .xyz file and the source .tif are
    deleted afterwards. Safe to run from multiple worker processes:
    only this call's own files are removed (the previous version
    globbed *.xyz and could delete intermediates still being read
    by sibling workers).

    Parameters
    ----------
    file : str
        Path to the input .tif file.
    """
    ds = gdal.Open(file)  # open the TIF as a GDAL dataset
    xyz_path = file + ".xyz"
    xyz = gdal.Translate(xyz_path, ds)  # dump "x y value" rows per pixel
    xyz = None  # close the XYZ file so pandas sees complete data
    df = pd.read_csv(xyz_path, sep=" ", header=None)
    df.columns = ["_CX", "_CY", "tmin"]
    df.to_stata(file + ".dta", write_index=False)
    del ds  # release the GDAL dataset handle
    # Fixes the TODO: remove only our own intermediate -- globbing
    # *.xyz here deletes files other processes are still using.
    os.remove(xyz_path)
    # NOTE(review): deleting the source TIF is destructive; kept to
    # preserve the original behaviour, but consider archiving instead.
    os.remove(file)
def main():
    """Convert every *.tif in the current directory in parallel."""
    files = glob.glob("*.tif")
    if not files:
        return  # Pool(0) would raise ValueError
    # Use ~60% of the cores (all cores tends to overcommit memory),
    # capped at the number of files and floored at one worker.
    # int() is required: Pool rejects a float process count.
    workers = max(1, min(len(files), int(mp.cpu_count() * 0.6)))
    with mp.Pool(workers) as pool:
        # Bug fix: map lives on the Pool instance, not on the
        # multiprocessing module (mp.map raised AttributeError).
        pool.map(gdal_izer, files)


if __name__ == "__main__":
    main()
在这里,我使用 gdal
将 tif 文件转换为 Stata 数据集。为简单起见，我将链接最低分辨率的文件（如果这一点重要的话，我使用的是 2.5m 分辨率的文件）。
这是我的代码:
### Be sure to copy the file I've
### linked to your current directory
from osgeo import gdal
import pandas as pd, os, glob
# Convert every GeoTIFF in the current directory to a Stata dataset:
# TIF -> XYZ text dump (via GDAL) -> pandas DataFrame -> .dta,
# then delete the intermediate .xyz and the source .tif.
for file in glob.glob("*.tif"):
    ds = gdal.Open(file)  # open the TIF as a GDAL dataset
    xyz_path = file + ".xyz"
    # The XYZ driver writes one "x y value" row per pixel, which
    # pandas can then read as a plain space-separated table.
    xyz = gdal.Translate(xyz_path, ds)
    xyz = None  # close the XYZ file so pandas sees complete data
    ds = None   # release the GDAL dataset handle
    df = pd.read_csv(xyz_path, sep=" ", header=None)
    df.columns = ["_CX", "_CY", "tmin"]
    df.to_stata(file + ".dta", write_index=False)
    # Remove only this iteration's intermediate. The original globbed
    # every *.xyz in the directory, which also deletes unrelated files.
    os.remove(xyz_path)
    os.remove(file)  # drop the source TIF once converted
代码完成了我需要它做的所有事情——但在更高的分辨率下，它需要大约一个半小时，所以我有兴趣对其进行优化。有什么方法可以让它运行得更快？
您可以使用 multiprocessing
模块将其分布在多个内核上。这并不总能加快速度,如果您最终过度提交内存,实际上会使事情变得更糟。但值得一试。
from osgeo import gdal
import pandas as pd, os, glob
import multiprocessing as mp
def gdal_izer(file):
    """Convert one GeoTIFF to a Stata .dta file.

    Pipeline: TIF -> XYZ text dump (via GDAL) -> pandas DataFrame
    -> .dta. The intermediate .xyz file and the source .tif are
    deleted afterwards. Safe to run from multiple worker processes:
    only this call's own files are removed (the previous version
    globbed *.xyz and could delete intermediates still being read
    by sibling workers).

    Parameters
    ----------
    file : str
        Path to the input .tif file.
    """
    ds = gdal.Open(file)  # open the TIF as a GDAL dataset
    xyz_path = file + ".xyz"
    xyz = gdal.Translate(xyz_path, ds)  # dump "x y value" rows per pixel
    xyz = None  # close the XYZ file so pandas sees complete data
    df = pd.read_csv(xyz_path, sep=" ", header=None)
    df.columns = ["_CX", "_CY", "tmin"]
    df.to_stata(file + ".dta", write_index=False)
    del ds  # release the GDAL dataset handle
    # Fixes the TODO: remove only our own intermediate -- globbing
    # *.xyz here deletes files other processes are still using.
    os.remove(xyz_path)
    # NOTE(review): deleting the source TIF is destructive; kept to
    # preserve the original behaviour, but consider archiving instead.
    os.remove(file)
def main():
    """Convert every *.tif in the current directory in parallel."""
    files = glob.glob("*.tif")
    if not files:
        return  # Pool(0) would raise ValueError
    # Use ~60% of the cores (all cores tends to overcommit memory),
    # capped at the number of files and floored at one worker.
    # int() is required: Pool rejects a float process count.
    workers = max(1, min(len(files), int(mp.cpu_count() * 0.6)))
    with mp.Pool(workers) as pool:
        # Bug fix: map lives on the Pool instance, not on the
        # multiprocessing module (mp.map raised AttributeError).
        pool.map(gdal_izer, files)


if __name__ == "__main__":
    main()