在 Java 中拆分和合并大文件(大小以 GB 为单位)

Splitting and Merging large files (size in GB) in Java

假设我把一个大文件拆分成若干部分,然后再把所有部分重新合并,合并后的结果与原文件不一致。

谁能帮我解释一下为什么会出现这种差异?我使用以下代码来拆分和合并文件。

SplitFile.java

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;

/**
 * @author vishal.zanzrukia
 * 
 */
public class SplitFile {

    public static final String INPUT_FILE = "D:\me\projects\input\file\path.txt";
    public static final int NUMBER_OF_OUTPUT_FILES = 30;
    public static final String FILE_SUFFIX = ".txt";

    /**
     * split file
     * 
     * @throws Exception
     */
    static void splitFile() throws Exception{

        File inputFile = new File(INPUT_FILE + "_Splits");
        inputFile.mkdir();

        RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");

        long sourceSize = raf.length();
        long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
        long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;

        int maxReadBufferSize = 8 * 1024; // 8KB
        for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\split." + destIx + FILE_SUFFIX));
            if (bytesPerSplit > maxReadBufferSize) {
                long numReads = bytesPerSplit / maxReadBufferSize;
                long numRemainingRead = bytesPerSplit % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
            } else {
                readWrite(raf, bw, bytesPerSplit);
            }
            bw.close();
        }
        if (remainingBytes > 0) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
            readWrite(raf, bw, remainingBytes);
            bw.close();
        }
        raf.close();
    }

    /**
     * join file
     * 
     * @throws Exception
     */
    static void joinFiles() throws Exception{
        int maxReadBufferSize = 8 * 1024; 

        BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\fullJoin" + FILE_SUFFIX));
        File inputFileDir = new File(INPUT_FILE + "_Splits");
        RandomAccessFile raf = null;
        if(inputFileDir.isDirectory()){
            for(File file : inputFileDir.listFiles()){
                raf = new RandomAccessFile(file, "r");
                long numReads = raf.length() / maxReadBufferSize;
                long numRemainingRead = raf.length()  % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
                raf.close();
            }
        }
        bw.close();
    }

    public static void mergeFiles() {

        File[] files = new File[NUMBER_OF_OUTPUT_FILES];
        for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
            files[i-1] = new File(INPUT_FILE + "_Splits\split."+i+FILE_SUFFIX);
        }

        String mergedFilePath = INPUT_FILE + "_Splits\fullJoin" + FILE_SUFFIX;


        File mergedFile = new File(mergedFilePath);

        mergeFiles(files, mergedFile);
    }

    public static void mergeFiles(File[] files, File mergedFile) {

        FileWriter fstream = null;
        BufferedWriter out = null;
        try {
            fstream = new FileWriter(mergedFile, true);
             out = new BufferedWriter(fstream);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        for (File f : files) {
            System.out.println("merging: " + f.getName());
            FileInputStream fis;
            try {
                fis = new FileInputStream(f);
                BufferedReader in = new BufferedReader(new InputStreamReader(fis));

                String aLine;
                while ((aLine = in.readLine()) != null) {
                    out.write(aLine);
                    out.newLine();
                }

                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        try {
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    public static void main(String[] args) throws Exception {
//      splitFile();
        mergeFiles();
    }

    static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
        byte[] buf = new byte[(int) numBytes];
        int val = raf.read(buf);
        if (val != -1) {
            bw.write(buf);
        }
    }
}

问题出在代码最后的这个方法里:

// Version of readWrite from the question, quoted unchanged to point out the defect:
// it reads into a buffer of numBytes bytes and then writes that buffer to bw.
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
    byte[] buf = new byte[(int) numBytes];
    // read(buf) returns the number of bytes actually stored in buf, or -1 at end of file.
    int val = raf.read(buf);
    if (val != -1) {
        // Defect: the whole buffer is written, not just the val bytes that were read.
        bw.write(buf);
    }
}

写入时总是写出 numBytes 个字节,但 read 方法其实已经很有用地返回了:

the total number of bytes read into the buffer, or -1 if there is no more data because the end of this file has been reached.

因此,解决办法是改用 write 的另一个重载,只写出实际读到的字节:

// Write only the val bytes that read() actually placed in buf.
bw.write(buf, 0, val);

请使用您的 joinFiles 方法:如果想让文件内容保持原样,就不要用 Reader 逐行读取文件,因为行尾符可能因平台而异。

而应使用 InputStream 或 RandomAccessFile 将它们作为二进制文件读取,并使用 OutputStream 写入。

您的 joinFiles 方法中唯一的问题是它使用了 File.listFiles(),这不能保证返回文件的顺序。

我将您的 mergeFiles() 代码与 joinFiles() 组合在一起以完成这项工作(请记住从您的 main 方法中调用 joinFiles() 而不是 mergeFiles()):

/**
 * Concatenates the given files, in array order, into
 * {@code <INPUT_FILE>_Splits/fullJoin<FILE_SUFFIX>}.
 *
 * <p>The copy is binary and writes exactly the bytes returned by each read,
 * so the result is the plain concatenation of the inputs.
 *
 * @param files parts to join, in the order they must appear in the result
 * @throws Exception if a part cannot be read or the joined file cannot be written
 */
static void joinFiles(File[] files) throws Exception {
    byte[] buffer = new byte[8 * 1024];

    // Build the path with File(parent, child); a bare "\f" in a literal is a form feed.
    File joined = new File(INPUT_FILE + "_Splits", "fullJoin" + FILE_SUFFIX);

    try (BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(joined))) {
        for (File file : files) {
            try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
                int read;
                while ((read = raf.read(buffer)) != -1) {
                    bw.write(buffer, 0, read);
                }
            }
        }
    }
}

/**
 * Joins the parts {@code split.1} .. {@code split.<NUMBER_OF_OUTPUT_FILES>}
 * (each with {@code FILE_SUFFIX}) from the {@code <INPUT_FILE>_Splits} directory.
 *
 * <p>The parts are addressed by name so that they are joined in numeric order.
 *
 * @throws Exception if joining the parts fails
 */
public static void joinFiles() throws Exception {
    // File(parent, child) avoids hand-written separators and their escaping pitfalls.
    File splitDir = new File(INPUT_FILE + "_Splits");

    File[] files = new File[NUMBER_OF_OUTPUT_FILES];
    for (int i = 1; i <= NUMBER_OF_OUTPUT_FILES; i++) {
        files[i - 1] = new File(splitDir, "split." + i + FILE_SUFFIX);
    }

    joinFiles(files);
}