仅获取 5 个文件，然后在下一个会话中获取 5 个文件

Question

我在文件服务器的一个文件夹中有大约 2000 个文件。我正在使用 paramiko 通过 SFTP 传输这些文件，目前一切正常。

现在，我计划先选取前 100 个文件进行处理，然后再处理下一批。对于如何通过 paramiko 实现这一点，有什么建议吗？

下面补充了我目前的代码。它已经可以成功地在我的服务器和桌面电脑之间传输文件，我唯一还需要做的就是加入文件的分批处理。

import os
import shlex
from stat import S_ISDIR

import paramiko
import tika
from tika import parser
tika.initVM()

# Send paramiko's own log output to a file.
paramiko.util.log_to_file('D:/Paramiko/logs/paramiko.log')

ssh = paramiko.SSHClient()

# NOTE(review): AutoAddPolicy accepts any unknown host key without
# verification; acceptable for a quick experiment, but consider loading known
# host keys before using this against a real server.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

ssh.connect('xxxx', username='ubuntu', password='', key_filename='D:/Paramiko/test.pem')
sftp = ssh.open_sftp()

# List the remote directories that sit exactly `depth` levels below the
# directory the remote command starts in.
depth = 2
stdin, stdout, stderr = ssh.exec_command(
    "find . -mindepth " + str(depth) + " -maxdepth " + str(depth) + " -type d "
)
# One entry per output line; each entry still ends with its newline.
dlist = stdout.readlines()
# Stored under its own name so the builtin len() is not overwritten.
dir_count = len(dlist)


def getfiles_from_directory(subfolder,filename,sftp,local_p):
    """Download one remote file and print the metadata tika reports for it.

    Args:
        subfolder: Remote directory that holds the file.
        filename: Name of the file inside ``subfolder``.
        sftp: Object whose ``get(remote, local)`` method performs the download.
        local_p: Local directory the file is written into.

    A ``KeyError`` raised while parsing or reading the ``'metadata'`` entry is
    printed rather than propagated.
    """
    source = '/'.join((subfolder, filename))
    destination = os.path.join(local_p, filename)
    sftp.get(source, destination)
    try:
        result = parser.from_file(destination)
        metadata_text = str(result['metadata'])
        print('MetaData:' + metadata_text)
    except KeyError as missing_key:
        print(missing_key)



# Copy the files of every remote directory in `dlist` into a matching local
# directory, recording which folders and files were handled.
folders_copied = []
files_copied = []
# Base directory for the downloads. It must exist before the script runs;
# the per-folder directories below it are created on demand.
local_path = 'D:/Paramiko/Test'
for direc in dlist:
    # Each entry looks like "./a/b\n": drop the newline and the leading "./"
    # so the same name can be used for the remote and the local path.
    direcname = str(direc).strip()
    if direcname.startswith('./'):
        direcname = direcname[2:]
    if not direcname or direcname.startswith('.'):
        # Names that still begin with a dot are skipped.
        continue

    # Ask the remote shell for the absolute path of the directory. The name is
    # quoted so spaces or shell metacharacters cannot break the command, and
    # "&&" makes pwd run only when cd succeeded.
    stdin, stdout, stderr = ssh.exec_command(
        "cd " + shlex.quote(direcname) + " && pwd"
    )
    resolved = stdout.readlines()
    if not resolved:
        # cd failed, so there is no path to work with.
        print('Could not resolve remote folder:' + direcname)
        continue
    subfolder = str(resolved[0]).strip()
    if subfolder in folders_copied:
        # Already handled (e.g. a duplicate entry in dlist).
        continue

    # makedirs creates missing parent directories too, which a nested name
    # such as "a/b" needs.
    local_p = os.path.join(local_path, direcname)
    os.makedirs(local_p, exist_ok=True)

    sftp.chdir(subfolder)
    for entry in sftp.listdir_attr():
        content_sub = str(entry.filename)
        # Decide by the entry's mode bits instead of guessing from the name.
        if entry.st_mode is not None and S_ISDIR(entry.st_mode):
            print('Found a folder:' + content_sub)
            continue
        print('Found a file:' + content_sub)
        getfiles_from_directory(subfolder, content_sub, sftp, local_p)
        files_copied.append(subfolder + '/' + content_sub)
    folders_copied.append(subfolder)
print('Folders Copied:' + str(folders_copied))
print('Files Copied:' + str(files_copied))

Answer 1

解决了。在根文件夹上使用 os.walk。计算第一个文件夹的深度并迭代它。

简单的深度逻辑。

仅获取 5 个文件，然后在下一个会话中获取 5 个文件

get only 5 files and then 5 files in next session

python

sftp

paramiko