Word Frequency of each word in a 2GB txt file in UTF-8 Encoding in Java
I am working on a project where I need to find the frequency of each word in a large corpus of more than 100 million Bangla words. The file size is about 2 GB. I actually need the frequency counts of the 20 most frequent and the 20 least frequent words. I wrote the same code in PHP, but it took far too long (the code was still running after a week), so I am now trying to do it in Java.
The code is supposed to work like this:
- read one line from the corpus nahidd_filtered.txt
- split it on whitespace
- for each split word, read the whole frequency file freq3.txt
- if the word is found, increment its frequency count and store it back in that file
- else count = 1 (new word) and store the frequency count in that file
I read chunks of text from the nahidd_filtered.txt corpus in a loop, and the words with their frequencies are stored in freq3.txt. The freq3.txt file stores the frequency counts like this:
Word1 Frequency1 (single whitespace in between)
Word2 Frequency2
...........
In short, I need the 20 most frequent and the 20 least frequent words, together with their frequency counts, from a large UTF-8 encoded corpus file. Please check the code and tell me why it is not working, or suggest any other approach. Thanks a lot.
import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;

public class Main {

    private static String fileToString(String filename) throws IOException {
        FileInputStream inputStream = null;
        Scanner reader = null;
        inputStream = new FileInputStream(filename);
        reader = new Scanner(inputStream, "UTF-8");
        /*BufferedReader reader = new BufferedReader(new FileReader(filename));*/
        StringBuilder builder = new StringBuilder();
        // For every line in the file, append it to the string builder
        while (reader.hasNextLine()) {
            String line = reader.nextLine();
            builder.append(line);
        }
        reader.close();
        return builder.toString();
    }

    public static final String UTF8_BOM = "\uFEFF";

    private static String removeUTF8BOM(String s) {
        if (s.startsWith(UTF8_BOM)) {
            s = s.substring(1);
        }
        return s;
    }

    public static void main(String[] args) throws IOException {
        long startTime = System.nanoTime();
        System.out.println("-------------- Start Contents of file: ---------------------");
        FileInputStream inputStream = null;
        Scanner sc = null;
        String path = "C:/xampp/htdocs/thesis_freqeuncy_2/nahidd_filtered.txt";
        try {
            inputStream = new FileInputStream(path);
            sc = new Scanner(inputStream, "UTF-8");
            int countWord = 0;
            BufferedWriter writer = null;
            while (sc.hasNextLine()) {
                String word = null;
                String line = sc.nextLine();
                String[] wordList = line.split("\\s+");
                for (int i = 0; i < wordList.length; i++) {
                    word = wordList[i].replace("।", "");
                    word = word.replace(",", "").trim();
                    ArrayList<String> freqword = new ArrayList<>();
                    String freq = fileToString("C:/xampp/htdocs/thesis_freqeuncy_2/freq3.txt");
                    /*freqword = freq.split("\r?\n");*/
                    Collections.addAll(freqword, freq.split("\r?\n"));
                    int flag = 0;
                    String[] freqwordsp = null;
                    int k;
                    for (k = 0; k < freqword.size(); k++) {
                        freqwordsp = freqword.get(k).split("\\s+");
                        String word2 = freqwordsp[0];
                        word = removeUTF8BOM(word);
                        word2 = removeUTF8BOM(word2);
                        word.replaceAll("\\P{Print}", "");
                        word2.replaceAll("\\P{Print}", "");
                        if (word2.toString().equals(word.toString())) {
                            flag = 1;
                            break;
                        }
                    }
                    int count = 0;
                    if (flag == 1) {
                        count = Integer.parseInt(freqwordsp[1]);
                    }
                    count = count + 1;
                    word = word + " " + count + "\n";
                    freqword.add(word);
                    System.out.println(freqword);
                    writer = new BufferedWriter(new FileWriter("C:/xampp/htdocs/thesis_freqeuncy_2/freq3.txt"));
                    writer.write(String.valueOf(freqword));
                }
            }
            // writer.close();
            System.out.println(countWord);
            System.out.println("-------------- End Contents of file: ---------------------");
            long endTime = System.nanoTime();
            long totalTime = (endTime - startTime);
            System.out.println(TimeUnit.MINUTES.convert(totalTime, TimeUnit.NANOSECONDS));
            // note that Scanner suppresses exceptions
            if (sc.ioException() != null) {
                throw sc.ioException();
            }
        } finally {
            if (inputStream != null) {
                inputStream.close();
            }
            if (sc != null) {
                sc.close();
            }
        }
    }
}
First of all:
for each split word, read the whole frequency file freq3.txt
Don't do that! Disk I/O operations are very, very slow. Do you have enough memory to read the file into memory? It seems you do:
String freq = fileToString("C:/xampp/htdocs/thesis_freqeuncy_2/freq3.txt");
Collections.addAll(freqword, freq.split("\r?\n"));
If you really do need this file, load it once and work with it in memory. Also, in this case a map (word to frequency) will probably be more convenient than a list. Save the collection to disk once the counting is finished.
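A minimal sketch of that idea, assuming the freq3.txt layout described in the question (one word and its count per line, separated by a single space); the class and method names (FrequencyFileSketch, loadFrequencies, saveFrequencies) are only illustrative:
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

public class FrequencyFileSketch {

    // Load freq3.txt once into a map (word -> count); assumes "word count" per line.
    static Map<String, Integer> loadFrequencies(String path) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.trim().split("\\s+");
                if (parts.length == 2) {
                    map.put(parts[0], Integer.parseInt(parts[1]));
                }
            }
        }
        return map;
    }

    // Write the whole map back to disk once, after all counting is done.
    static void saveFrequencies(Map<String, Integer> map, String path) throws IOException {
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8))) {
            for (Map.Entry<String, Integer> e : map.entrySet()) {
                bw.write(e.getKey() + " " + e.getValue());
                bw.newLine();
            }
        }
    }

    public static void main(String[] args) throws IOException {
        Map<String, Integer> freq = loadFrequencies("freq3.txt");
        // While scanning the corpus, update counts in memory only, e.g.:
        freq.merge("word", 1, Integer::sum);
        saveFrequencies(freq, "freq3.txt");
    }
}
This way freq3.txt is read and written exactly once, and each word lookup is a constant-time map operation instead of a linear scan over the whole file.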
Next, you can buffer your input stream, which may improve performance significantly:
inputStream = new BufferedInputStream(new FileInputStream(path));
And don't forget to close your streams/readers/writers, either explicitly or with a try-with-resources statement.
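For example, a writer opened in a try-with-resources block is closed automatically even if an exception is thrown (this fragment assumes the usual java.io and java.nio.charset imports, and reuses the output path from the question):
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream("C:/xampp/htdocs/thesis_freqeuncy_2/freq3.txt"), StandardCharsets.UTF_8))) {
    writer.write("word 1");
    writer.newLine();
}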
In general, the code can be simplified considerably by using the appropriate APIs. For example:
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

public class DemoApplication {

    public static final String UTF8_BOM = "\uFEFF";

    private static String removeUTF8BOM(String s) {
        if (s.startsWith(UTF8_BOM)) {
            s = s.substring(1);
        }
        return s;
    }

    private static final String PATH = "words.txt";
    private static final String REGEX = " ";

    public static void main(String[] args) throws IOException {
        Map<String, Long> frequencyMap;
        // Read the corpus as UTF-8 and count every word in a single pass.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(PATH), StandardCharsets.UTF_8))) {
            frequencyMap = reader
                    .lines()
                    .flatMap(s -> Arrays.stream(s.split(REGEX)))
                    .map(DemoApplication::removeUTF8BOM)
                    .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
        }
        // Print the 20 least frequent words with their counts.
        frequencyMap
                .entrySet()
                .stream()
                .sorted(Comparator.comparingLong(Map.Entry<String, Long>::getValue))
                .limit(20)
                .forEach(System.out::println);
    }
}
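This prints the 20 least frequent words with their counts. For the 20 most frequent words, the same stream can be sorted with the comparator reversed, for example:
frequencyMap
        .entrySet()
        .stream()
        .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
        .limit(20)
        .forEach(System.out::println);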