Java 尝试比较大文件时代码挂起
Java code hangs when try to compare huge files
我正在探索一个选项来比较 Java 中的两个文件并显示 html 中的差异。
下面是代码,我用的是-
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.StringsComparator;
public class FileDiff {
public static void main(String[] args) throws IOException {
// Read both files with line iterator.
LineIterator file1 = FileUtils.lineIterator(new File("file-1.txt"), "utf-8");
LineIterator file2 = FileUtils.lineIterator(new File("file-2.txt"), "utf-8");
// Initialize visitor.
FileCommandsVisitor fileCommandsVisitor = new FileCommandsVisitor();
// Read file line by line so that comparison can be done line by line.
while (file1.hasNext() || file2.hasNext()) {
/*
* In case both files have different number of lines, fill in with empty
* strings. Also append newline char at end so next line comparison moves to
* next line.
*/
String left = (file1.hasNext() ? file1.nextLine() : "") + "\n";
String right = (file2.hasNext() ? file2.nextLine() : "") + "\n";
// Prepare diff comparator with lines from both files.
StringsComparator comparator = new StringsComparator(left, right);
if (comparator.getScript().getLCSLength() > (Integer.max(left.length(), right.length()) * 0.4)) {
/*
* If both lines have atleast 40% commonality then only compare with each other
* so that they are aligned with each other in final diff HTML.
*/
comparator.getScript().visit(fileCommandsVisitor);
} else {
/*
* If both lines do not have 40% commanlity then compare each with empty line so
* that they are not aligned to each other in final diff instead they show up on
* separate lines.
*/
StringsComparator leftComparator = new StringsComparator(left, "\n");
leftComparator.getScript().visit(fileCommandsVisitor);
StringsComparator rightComparator = new StringsComparator("\n", right);
rightComparator.getScript().visit(fileCommandsVisitor);
}
}
fileCommandsVisitor.generateHTML();
}
}
/*
* Custom visitor for file comparison which stores comparison & also generates
* HTML in the end.
*/
class FileCommandsVisitor implements CommandVisitor<Character> {
// Spans with red & green highlights to put highlighted characters in HTML
private static final String DELETION = "<span style=\"background-color: #FB504B\">${text}</span>";
private static final String INSERTION = "<span style=\"background-color: #45EA85\">${text}</span>";
private String left = "";
private String right = "";
@Override
public void visitKeepCommand(Character c) {
// For new line use <br/> so that in HTML also it shows on next line.
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// KeepCommand means c present in both left & right. So add this to both without
// any
// highlight.
left = left + toAppend;
right = right + toAppend;
}
@Override
public void visitInsertCommand(Character c) {
// For new line use <br/> so that in HTML also it shows on next line.
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// InsertCommand means character is present in right file but not in left. Show
// with green highlight on right.
right = right + INSERTION.replace("${text}", "" + toAppend);
}
@Override
public void visitDeleteCommand(Character c) {
// For new line use <br/> so that in HTML also it shows on next line.
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// DeleteCommand means character is present in left file but not in right. Show
// with red highlight on left.
left = left + DELETION.replace("${text}", "" + toAppend);
}
public void generateHTML() throws IOException {
// Get template & replace placeholders with left & right variables with actual
// comparison
String template = FileUtils.readFileToString(new File("difftemplate.html"), "utf-8");
String out1 = template.replace("${left}", left);
String output = out1.replace("${right}", right);
// Write file to disk.
FileUtils.write(new File("finalDiff.html"), output, "utf-8");
System.out.println("HTML diff generated.");
}
}
对于较小的文件,此方法效果很好,在我的笔记本电脑上也能得到很好的结果。但是,如果文件大小更大(200MB)且有 50 万行,那么我的 IntelliJ 似乎会挂起。我笔记本电脑的 RAM 是 16GB。
我如何改进它以处理大文件以进行比较?
谢谢
您的写法FileCommandsVisitor
可能会导致优化失败。您正在做的是为访问的每个字符添加字符串,例如:
left = left + toAppend;
right = right + toAppend;
这可能会导致您每次添加时都会生成一个字符串的新实例 - 到最后长度将近 200 MB 的字符串的新实例。您访问的每个角色都有一个新角色。旧的需要收集垃圾。如果您的 class 改为 StringBuilder
s,并且您使用 append()
方法,它可能会大大加快速度。有关详细信息,请阅读 String concatenation: concat() vs "+" operator
为清楚起见(根据评论,您现在两次错过了要点):
class FileCommandsVisitor implements CommandVisitor<Character> {
//StringBuilder as properties
private StringBuilder left = new StringBuilder();
private StringBuilder right = new StringBuilder();
@Override
public void visitKeepCommand(Character c) {
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// append to the StringBuilders where you would concat strings
left.append(toAppend);
right.append(toAppend);
}
//same as above for other methods
..
public void generateHTML() throws IOException {
String template = FileUtils.readFileToString(new File("difftemplate.html"), "utf-8");
//turn StringBuilders into Strings only when you actually need a String.
String out1 = template.replace("${left}", left.toString());
String output = out1.replace("${right}", right.toString());
FileUtils.write(new File("finalDiff.html"), output, "utf-8");
System.out.println("HTML diff generated.");
}
}
如果这没有帮助,并且它在运行时进行了优化 - 我看不出您的操作方式有任何其他根本性错误。比较大文件的操作并不便宜,它不会比从硬盘驱动器逐行读取两个文件的速度快。您仍然在让 FileCommandsVisitor
将两个差异保存在内存中而不是按原样写入,这意味着您的代码充其量只能区分一个大小的文件等于可用 RAM 的一半。但是我注意到,您从未提到实际需要多长时间,因此很难说您看到的时间是预期的还是异常的。
我正在探索一个选项来比较 Java 中的两个文件并显示 html 中的差异。
下面是代码,我用的是-
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.StringsComparator;
public class FileDiff {
public static void main(String[] args) throws IOException {
// Read both files with line iterator.
LineIterator file1 = FileUtils.lineIterator(new File("file-1.txt"), "utf-8");
LineIterator file2 = FileUtils.lineIterator(new File("file-2.txt"), "utf-8");
// Initialize visitor.
FileCommandsVisitor fileCommandsVisitor = new FileCommandsVisitor();
// Read file line by line so that comparison can be done line by line.
while (file1.hasNext() || file2.hasNext()) {
/*
* In case both files have different number of lines, fill in with empty
* strings. Also append newline char at end so next line comparison moves to
* next line.
*/
String left = (file1.hasNext() ? file1.nextLine() : "") + "\n";
String right = (file2.hasNext() ? file2.nextLine() : "") + "\n";
// Prepare diff comparator with lines from both files.
StringsComparator comparator = new StringsComparator(left, right);
if (comparator.getScript().getLCSLength() > (Integer.max(left.length(), right.length()) * 0.4)) {
/*
* If both lines have atleast 40% commonality then only compare with each other
* so that they are aligned with each other in final diff HTML.
*/
comparator.getScript().visit(fileCommandsVisitor);
} else {
/*
* If both lines do not have 40% commanlity then compare each with empty line so
* that they are not aligned to each other in final diff instead they show up on
* separate lines.
*/
StringsComparator leftComparator = new StringsComparator(left, "\n");
leftComparator.getScript().visit(fileCommandsVisitor);
StringsComparator rightComparator = new StringsComparator("\n", right);
rightComparator.getScript().visit(fileCommandsVisitor);
}
}
fileCommandsVisitor.generateHTML();
}
}
/*
* Custom visitor for file comparison which stores comparison & also generates
* HTML in the end.
*/
class FileCommandsVisitor implements CommandVisitor<Character> {
// Spans with red & green highlights to put highlighted characters in HTML
private static final String DELETION = "<span style=\"background-color: #FB504B\">${text}</span>";
private static final String INSERTION = "<span style=\"background-color: #45EA85\">${text}</span>";
private String left = "";
private String right = "";
@Override
public void visitKeepCommand(Character c) {
// For new line use <br/> so that in HTML also it shows on next line.
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// KeepCommand means c present in both left & right. So add this to both without
// any
// highlight.
left = left + toAppend;
right = right + toAppend;
}
@Override
public void visitInsertCommand(Character c) {
// For new line use <br/> so that in HTML also it shows on next line.
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// InsertCommand means character is present in right file but not in left. Show
// with green highlight on right.
right = right + INSERTION.replace("${text}", "" + toAppend);
}
@Override
public void visitDeleteCommand(Character c) {
// For new line use <br/> so that in HTML also it shows on next line.
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// DeleteCommand means character is present in left file but not in right. Show
// with red highlight on left.
left = left + DELETION.replace("${text}", "" + toAppend);
}
public void generateHTML() throws IOException {
// Get template & replace placeholders with left & right variables with actual
// comparison
String template = FileUtils.readFileToString(new File("difftemplate.html"), "utf-8");
String out1 = template.replace("${left}", left);
String output = out1.replace("${right}", right);
// Write file to disk.
FileUtils.write(new File("finalDiff.html"), output, "utf-8");
System.out.println("HTML diff generated.");
}
}
对于较小的文件,此方法效果很好,在我的笔记本电脑上也能得到很好的结果。但是,如果文件大小更大(200MB)且有 50 万行,那么我的 IntelliJ 似乎会挂起。我笔记本电脑的 RAM 是 16GB。
我如何改进它以处理大文件以进行比较?
谢谢
您的写法FileCommandsVisitor
可能会导致优化失败。您正在做的是为访问的每个字符添加字符串,例如:
left = left + toAppend;
right = right + toAppend;
这可能会导致您每次添加时都会生成一个字符串的新实例 - 到最后长度将近 200 MB 的字符串的新实例。您访问的每个角色都有一个新角色。旧的需要收集垃圾。如果您的 class 改为 StringBuilder
s,并且您使用 append()
方法,它可能会大大加快速度。有关详细信息,请阅读 String concatenation: concat() vs "+" operator
为清楚起见(根据评论,您现在两次错过了要点):
class FileCommandsVisitor implements CommandVisitor<Character> {
//StringBuilder as properties
private StringBuilder left = new StringBuilder();
private StringBuilder right = new StringBuilder();
@Override
public void visitKeepCommand(Character c) {
String toAppend = "\n".equals("" + c) ? "<br/>" : "" + c;
// append to the StringBuilders where you would concat strings
left.append(toAppend);
right.append(toAppend);
}
//same as above for other methods
..
public void generateHTML() throws IOException {
String template = FileUtils.readFileToString(new File("difftemplate.html"), "utf-8");
//turn StringBuilders into Strings only when you actually need a String.
String out1 = template.replace("${left}", left.toString());
String output = out1.replace("${right}", right.toString());
FileUtils.write(new File("finalDiff.html"), output, "utf-8");
System.out.println("HTML diff generated.");
}
}
如果这没有帮助,并且它在运行时进行了优化 - 我看不出您的操作方式有任何其他根本性错误。比较大文件的操作并不便宜,它不会比从硬盘驱动器逐行读取两个文件的速度快。您仍然在让 FileCommandsVisitor
将两个差异保存在内存中而不是按原样写入,这意味着您的代码充其量只能区分一个大小的文件等于可用 RAM 的一半。但是我注意到,您从未提到实际需要多长时间,因此很难说您看到的时间是预期的还是异常的。