使用 StreamTokenizer () 、 HashMap() 、 HashSet() 计算词频。在 Java 核心
Calculating Word Frequency Using StreamTokenizer () , HashMap() , HashSet(). in Java Core
import java.io.*;
import java.util.*;
class A {
public static void main(String args[]) throws Exception {
Console con = System.console();
String str;
int i=0;
HashMap map = new HashMap();
HashSet set = new HashSet();
System.out.println("Enter File Name : ");
str = con.readLine();
File f = new File(str);
f.createNewFile();
FileInputStream fis = new FileInputStream(str);
StreamTokenizer st = new StreamTokenizer(fis);
while(st.nextToken()!=StreamTokenizer.TT_EOF) {
String s;
switch(st.ttype) {
case StreamTokenizer.TT_NUMBER: s = st.nval+"";
break;
case StreamTokenizer.TT_WORD: s = st.sval;
break;
default: s = ""+((char)st.ttype);
}
map.put(i+"",s);
set.add(s);
i++;
}
Iterator iter = set.iterator();
System.out.println("Frequency Of Words :");
while(iter.hasNext()) {
String word;
int count=0;
word=(String)iter.next();
for(int j=0; j<i ; j++) {
String word2;
word2=(String)map.get(j+"");
if(word.equals(word2))
count++;
}
System.out.println(" WORD : "+ word+" = "+count);
}
System.out.println("Total Words In Files: "+i);
}
}
In This code First I have already created a text file which contains the following data :
@Hello Hii World# * c++ java salesforce
And the output of this code is :
**词频:
字数:# = 1
字数:@=1
WORD : c = 1
WORD : 销售人员 = 1
字数:* = 1
WORD : Hii = 1
字数:+=2
字数:java = 1
WORD : 世界 = 1
WORD : 你好 = 1
文件中的总字数:11**
where i am unable to find why this shows c++ as a seperate words . I
want to combine c++ as a single word as in the output
你可以这样做
// Create the file at path specified in the String str
// ...
HashMap<String, Integer> map = new HashMap<>();
InputStream fis = new FileInputStream(str);
Reader bufferedReader = new BufferedReader(new InputStreamReader(fis));
StreamTokenizer st = new StreamTokenizer(bufferedReader);
st.wordChars('+', '+');
while(st.nextToken() != StreamTokenizer.TT_EOF) {
String s;
switch(st.ttype) {
case StreamTokenizer.TT_NUMBER:
s = String.valueOf(st.nval);
break;
case StreamTokenizer.TT_WORD:
s = st.sval;
break;
default:
s = String.valueOf((char)st.ttype);
}
Integer val = map.get(s);
if(val == null)
val = 1;
else
val++;
map.put(s, val);
}
Set<String> keySet = map.keySet();
Iterator<String> iter = keySet.iterator();
System.out.println("Frequency Of Words :");
int sum = 0;
while(iter.hasNext()) {
String word = iter.next();
int count = map.get(word);
sum += count;
System.out.println(" WORD : " + word + " = " + count);
}
System.out.println("Total Words In Files: " + sum);
请注意,我已经使用泛型而不是 HashMap 和 Iterator 的原始版本更新了您的代码。此外,您用于 StreamTokenizer 的构造函数已被弃用。同时使用 map 和 set 是没有用的,因为您可以使用 .keySet() 方法迭代 map 的键集。映射现在从字符串(单词)到整数(单词数)。
无论如何,对于你做的这个例子,我认为一个简单的拆分方法会更合适。
有关 StreamTokenizer 的 wordChars 方法的更多信息,您可以查看 #wordChars(int, int)
import java.io.*;
import java.util.*;
class A {
public static void main(String args[]) throws Exception {
Console con = System.console();
String str;
int i=0;
HashMap map = new HashMap();
HashSet set = new HashSet();
System.out.println("Enter File Name : ");
str = con.readLine();
File f = new File(str);
f.createNewFile();
FileInputStream fis = new FileInputStream(str);
StreamTokenizer st = new StreamTokenizer(fis);
while(st.nextToken()!=StreamTokenizer.TT_EOF) {
String s;
switch(st.ttype) {
case StreamTokenizer.TT_NUMBER: s = st.nval+"";
break;
case StreamTokenizer.TT_WORD: s = st.sval;
break;
default: s = ""+((char)st.ttype);
}
map.put(i+"",s);
set.add(s);
i++;
}
Iterator iter = set.iterator();
System.out.println("Frequency Of Words :");
while(iter.hasNext()) {
String word;
int count=0;
word=(String)iter.next();
for(int j=0; j<i ; j++) {
String word2;
word2=(String)map.get(j+"");
if(word.equals(word2))
count++;
}
System.out.println(" WORD : "+ word+" = "+count);
}
System.out.println("Total Words In Files: "+i);
}
}
In This code First I have already created a text file which contains the following data :
@Hello Hii World# * c++ java salesforce
And the output of this code is :
**词频:
字数:# = 1
字数:@=1
WORD : c = 1
WORD : 销售人员 = 1
字数:* = 1
WORD : Hii = 1
字数:+=2
字数:java = 1
WORD : 世界 = 1
WORD : 你好 = 1
文件中的总字数:11**
where i am unable to find why this shows c++ as a seperate words . I want to combine c++ as a single word as in the output
你可以这样做
// Create the file at path specified in the String str
// ...
HashMap<String, Integer> map = new HashMap<>();
InputStream fis = new FileInputStream(str);
Reader bufferedReader = new BufferedReader(new InputStreamReader(fis));
StreamTokenizer st = new StreamTokenizer(bufferedReader);
st.wordChars('+', '+');
while(st.nextToken() != StreamTokenizer.TT_EOF) {
String s;
switch(st.ttype) {
case StreamTokenizer.TT_NUMBER:
s = String.valueOf(st.nval);
break;
case StreamTokenizer.TT_WORD:
s = st.sval;
break;
default:
s = String.valueOf((char)st.ttype);
}
Integer val = map.get(s);
if(val == null)
val = 1;
else
val++;
map.put(s, val);
}
Set<String> keySet = map.keySet();
Iterator<String> iter = keySet.iterator();
System.out.println("Frequency Of Words :");
int sum = 0;
while(iter.hasNext()) {
String word = iter.next();
int count = map.get(word);
sum += count;
System.out.println(" WORD : " + word + " = " + count);
}
System.out.println("Total Words In Files: " + sum);
请注意,我已经使用泛型而不是 HashMap 和 Iterator 的原始版本更新了您的代码。此外,您用于 StreamTokenizer 的构造函数已被弃用。同时使用 map 和 set 是没有用的,因为您可以使用 .keySet() 方法迭代 map 的键集。映射现在从字符串(单词)到整数(单词数)。
无论如何,对于你做的这个例子,我认为一个简单的拆分方法会更合适。
有关 StreamTokenizer 的 wordChars 方法的更多信息,您可以查看 #wordChars(int, int)