Python: 读取并处理远程服务器中的多个 gzip 文件

Python: Reading and processing Multiple gzip files in remote server

问题陈述:

我在远程服务器上有多个(1000 个以上)*.gz 文件。我需要读取这些文件并检查其中是否包含某些字符串;如果匹配,就返回该文件名。我尝试了下面的代码。该程序可以运行,但由于涉及大量 IO,效率似乎不高。能否请您给出一个更高效的做法?

我的代码:

import gzip
import os
import paramiko
import multiprocessing
from bisect import insort
# Manager that supplies the shared list used by miniAnalyze().
synchObj=multiprocessing.Manager()
# Connection settings for the remote machine that holds the *.gz logs.
hostname = '192.168.1.2' 
# NOTE(review): `port` is not passed to ssh.connect() in the code shown here.
port = 22
username='may'
# NOTE(review): credential is hard-coded in source; consider loading it from
# the environment or using key-based authentication instead.
password='Apa$sW0rd'

def miniAnalyze():
    """Set up the shared hit list and define the two scan helpers.

    NOTE(review): as shown, this function only creates ``ifile_list`` and
    defines ``analyze_the_file`` / ``assign_to_proc``; neither helper is
    called here, so nothing is scanned unless a call was left out of the
    excerpt.
    """
    # Manager-backed list proxy that collects the names of files containing
    # one of the searched strings.
    ifile_list=synchObj.list([]) # A synchronized list to Store the File names containing the matched String.

    def analyze_the_file(file_single):
       """Fetch one remote gzip file, search it, and record it on a hit.

       file_single: one line of the remote ``find`` output, i.e. a remote
       file path (passed in by assign_to_proc).
       """
       # Substrings to look for; hard-coded.
       strings = ("error 72","error 81",) # Hard Coded the Strings that needs to be searched.
       try:
          ssh=paramiko.SSHClient()
          # The transfer code is elided in this excerpt. Presumably it
          # connects, opens `ftp`, downloads `file_single` and sets
          # `filename` -- none of those names are defined in the visible code.
          #Code to FTP the file to local system from the remote machine.
          .....
          ........
          path_f='/home/user/may/'+filename

          # Read the downloaded gzip file from the local disk.
          #Read the Gzip file in local system after FTP is done

          with gzip.open(path_f, 'rb') as f:
            # Whole decompressed file is read into memory at once.
            contents = f.read()
            if any(s in contents for s in strings):
                print "File " + str(path_f) + " is  a hit."
                # NOTE(review): insort on a list proxy is presumably several
                # separate proxy calls, so concurrent workers may interleave
                # -- confirm whether sorted order matters.
                insort(ifile_list, filename) # Push the file into the list if there is a match.
                os.remove(path_f)
            else:
                # The local copy is deleted on both branches.
                os.remove(path_f)
       except Exception, ae:
          # Any failure is only printed; the worker does not re-raise.
          print "Error while Analyzing file "+ str(ae)

       finally:
           if ifile_list:
             # NOTE(review): this concatenates a str with the list proxy,
             # which looks like it raises TypeError and would skip the two
             # close() calls below -- probably needs str(ifile_list); confirm.
             print "The Error is at "+ ifile_list
           # NOTE(review): `ftp` is not defined in the visible code; it is
           # presumably created in the elided transfer section.
           ftp.close()
           ssh.close()


    def assign_to_proc():
        """List remote *.gz files and start one worker process per file."""
        # Code to glob files matching a pattern and pass to another function via multiprocess .
        apath = '/home/remotemachine/log/'
        # The inner double quotes become part of the remote command line,
        # presumably so the remote shell hands the pattern to find unexpanded.
        apattern = '"*.gz"'
        first_command = 'find {path} -name {pattern}'
        command = first_command.format(path=apath, pattern=apattern)

        try:
            ssh=paramiko.SSHClient()
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(hostname,username=username,password=password)
            stdin, stdout, stderr = ssh.exec_command(command)
            # Poll until the remote command has finished.
            # NOTE(review): `time` is not among the imports shown for this
            # script, so this sleep would raise NameError (caught below).
            # Waiting for the exit status before reading stdout may also stall
            # on large output -- check the paramiko documentation.
            while not stdout.channel.exit_status_ready():
                time.sleep(2)
            filelist = stdout.read().splitlines()

            jobs = []

            # One process is started per file with no upper bound, so 1000+
            # files means 1000+ simultaneous processes.
            for ifle in filelist:
                p = multiprocessing.Process(target=analyze_the_file,args=(ifle,))
                jobs.append(p)
                p.start()

            # Wait for every worker to finish.
            for job in jobs:
                job.join()


        except Exception, fe:
            print "Error while getting file names "+ str(fe)

        finally:
            ssh.close()


# Script entry point.
# NOTE(review): miniAnalyze() as shown never calls assign_to_proc(), so this
# call does not start the scan -- confirm whether a call was omitted.
if __name__ == '__main__':
    miniAnalyze()

上面的代码很慢:把 GZ 文件下载到本地系统会产生大量 IO。请帮我找到更好的方法。

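在远程服务器上执行操作系统命令(例如 zgrep),然后在本地处理该命令的输出。这样就不必把整个文件的内容传输到本地计算机。例如,通过 ssh.exec_command 执行 `zgrep -l -e "error 72" -e "error 81" /home/remotemachine/log/*.gz`,它只会输出包含匹配字符串的文件名。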