Java 尝试比较大文件时代码挂起

Java code hangs when try to compare huge files

我正在探索一个选项来比较 Java 中的两个文件并显示 html 中的差异。

下面是代码,我用的是-

import java.io.File;
import java.io.IOException;
 
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.StringsComparator;
 
public class FileDiff {
 
    public static void main(String[] args) throws IOException {
        // Read both files with line iterator.
        LineIterator file1 = FileUtils.lineIterator(new File("file-1.txt"), "utf-8");
        LineIterator file2 = FileUtils.lineIterator(new File("file-2.txt"), "utf-8");
 
        // Initialize visitor.
        FileCommandsVisitor fileCommandsVisitor = new FileCommandsVisitor();
 
        // Read file line by line so that comparison can be done line by line.
        while (file1.hasNext() || file2.hasNext()) {
            /*
             * In case both files have different number of lines, fill in with empty
             * strings. Also append newline char at end so next line comparison moves to
             * next line.
             */
            String left = (file1.hasNext() ? file1.nextLine() : "") + "\n";
            String right = (file2.hasNext() ? file2.nextLine() : "") + "\n";
 
            // Prepare diff comparator with lines from both files.
            StringsComparator comparator = new StringsComparator(left, right);
 
            if (comparator.getScript().getLCSLength() > (Integer.max(left.length(), right.length()) * 0.4)) {
                /*
                 * If both lines have atleast 40% commonality then only compare with each other
                 * so that they are aligned with each other in final diff HTML.
                 */
                comparator.getScript().visit(fileCommandsVisitor);
            } else {
                /*
                 * If both lines do not have 40% commanlity then compare each with empty line so
                 * that they are not aligned to each other in final diff instead they show up on
                 * separate lines.
                 */
                StringsComparator leftComparator = new StringsComparator(left, "\n");
                leftComparator.getScript().visit(fileCommandsVisitor);
                StringsComparator rightComparator = new StringsComparator("\n", right);
                rightComparator.getScript().visit(fileCommandsVisitor);
            }
        }
 
        fileCommandsVisitor.generateHTML();
    }
}
 
/*
 * Custom visitor for file comparison which stores comparison & also generates
 * HTML in the end.
 */
class FileCommandsVisitor implements CommandVisitor<Character> {
 
    // Spans with red & green highlights to put highlighted characters in HTML
    private static final String DELETION = "<span style=\"background-color: #FB504B\">${text}</span>";
    private static final String INSERTION = "<span style=\"background-color: #45EA85\">${text}</span>";
 
    private String left = "";
    private String right = "";
 
    @Override
    public void visitKeepCommand(Character c) {
        // For new line use <br/> so that in HTML also it shows on next line.
        String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
        // KeepCommand means c present in both left & right. So add this to both without
        // any
        // highlight.
        left = left + toAppend;
        right = right + toAppend;
    }
 
    @Override
    public void visitInsertCommand(Character c) {
        // For new line use <br/> so that in HTML also it shows on next line.
        String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
        // InsertCommand means character is present in right file but not in left. Show
        // with green highlight on right.
        right = right + INSERTION.replace("${text}", "" + toAppend);
    }
 
    @Override
    public void visitDeleteCommand(Character c) {
        // For new line use <br/> so that in HTML also it shows on next line.
        String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
        // DeleteCommand means character is present in left file but not in right. Show
        // with red highlight on left.
        left = left + DELETION.replace("${text}", "" + toAppend);
    }
 
    public void generateHTML() throws IOException {
 
        // Get template & replace placeholders with left & right variables with actual
        // comparison
        String template = FileUtils.readFileToString(new File("difftemplate.html"), "utf-8");
        String out1 = template.replace("${left}", left);
        String output = out1.replace("${right}", right);
        // Write file to disk.
        FileUtils.write(new File("finalDiff.html"), output, "utf-8");
        System.out.println("HTML diff generated.");
    }
}

对于较小的文件,此方法效果很好,在我的笔记本电脑上也能得到很好的结果。但是,如果文件大小更大(200MB)且有 50 万行,那么我的 IntelliJ 似乎会挂起。我笔记本电脑的 RAM 是 16GB。

我如何改进它以处理大文件以进行比较?

谢谢

您的写法FileCommandsVisitor可能会导致优化失败。您正在做的是为访问的每个字符添加字符串,例如:

left = left + toAppend;
right = right + toAppend;

这可能会导致您每次添加时都会生成一个字符串的新实例 - 到最后长度将近 200 MB 的字符串的新实例。您访问的每个角色都有一个新角色。旧的需要收集垃圾。如果您的 class 改为 StringBuilders,并且您使用 append() 方法,它可能会大大加快速度。有关详细信息,请阅读 String concatenation: concat() vs "+" operator

为清楚起见(根据评论,您现在两次错过了要点):

class FileCommandsVisitor implements CommandVisitor<Character> {

//StringBuilder as properties
private StringBuilder left = new StringBuilder();
private StringBuilder right = new StringBuilder();

@Override
public void visitKeepCommand(Character c) {
    String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
    // append to the StringBuilders where you would concat strings
    left.append(toAppend);
    right.append(toAppend);
}

//same as above for other methods

..

public void generateHTML() throws IOException {

    String template = FileUtils.readFileToString(new File("difftemplate.html"), "utf-8");
    //turn StringBuilders into Strings only when you actually need a String.
    String out1 = template.replace("${left}", left.toString());
    String output = out1.replace("${right}", right.toString());
    FileUtils.write(new File("finalDiff.html"), output, "utf-8");
    System.out.println("HTML diff generated.");
}

}

如果这没有帮助,并且它在运行时进行了优化 - 我看不出您的操作方式有任何其他根本性错误。比较大文件的操作并不便宜,它不会比从硬盘驱动器逐行读取两个文件的速度快。您仍然在让 FileCommandsVisitor 将两个差异保存在内存中而不是按原样写入,这意味着您的代码充其量只能区分一个大小的文件等于可用 RAM 的一半。但是我注意到,您从未提到实际需要多长时间,因此很难说您看到的时间是预期的还是异常的。