单词 统计续(补)
Posted goubb
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了单词 统计续(补)相关的知识,希望对你有一定的参考价值。
短语统计
我们在处理文本的时候,只需将提取出来的文本数据进行特殊的分割处理,比如将英文的“,”、“.”、“?”、“!”以及回车符设为分隔符,并将一些无用单词(比如 "a"、"it"、"the"、"and"、"this" 等)作为间断。
package analyse_word;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Set;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Set;
public class recognize_sentence
public static boolean useless(String str) throws FileNotFoundException
File file = new File("D:\\useless.txt");// 读取文件
String words[] = new String [100000];
int out_words[] = new int [100000];
if (!file.exists()) // 如果文件打不开或不存在则提示错误
System.out.println("文件不存在");
return false;
Scanner x = new Scanner(file);
HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
while (x.hasNextLine())
String line = x.nextLine();
String[] lineWords = line.split("[\n]");
Set<String> wordSet = hashMap.keySet();
for (int i = 0; i < lineWords.length; i++)
if (wordSet.contains(lineWords[i]))
Integer number = hashMap.get(lineWords[i]);
number++;
hashMap.put(lineWords[i], number);
else
hashMap.put(lineWords[i], 1);
Iterator<String> iterator = hashMap.keySet().iterator();
int max = 0,i=0;
while (iterator.hasNext())
String word = iterator.next();
if(!"".equals(word)&&word!=null&&!"a".equals(word)&&!"the".equals(word)&&!" ".equals(word))
if(str.indexOf(" "+word+" ")==0)
return true;
words[i]=word;
out_words[i]=hashMap.get(word);
i++;
return true;
public static boolean useless(String str) throws FileNotFoundException
File file = new File("D:\\useless.txt");// 读取文件
String words[] = new String [100000];
int out_words[] = new int [100000];
if (!file.exists()) // 如果文件打不开或不存在则提示错误
System.out.println("文件不存在");
return false;
Scanner x = new Scanner(file);
HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
while (x.hasNextLine())
String line = x.nextLine();
String[] lineWords = line.split("[\n]");
Set<String> wordSet = hashMap.keySet();
for (int i = 0; i < lineWords.length; i++)
if (wordSet.contains(lineWords[i]))
Integer number = hashMap.get(lineWords[i]);
number++;
hashMap.put(lineWords[i], number);
else
hashMap.put(lineWords[i], 1);
Iterator<String> iterator = hashMap.keySet().iterator();
int max = 0,i=0;
while (iterator.hasNext())
String word = iterator.next();
if(!"".equals(word)&&word!=null&&!"a".equals(word)&&!"the".equals(word)&&!" ".equals(word))
if(str.indexOf(" "+word+" ")==0)
return true;
words[i]=word;
out_words[i]=hashMap.get(word);
i++;
return true;
public static void recognize() throws FileNotFoundException
File file = new File("D:\\Englis_letters.txt");// 读取文件
if (!file.exists()) // 如果文件打不开或不存在则提示错误
System.out.println("文件不存在");
return;
Scanner x = new Scanner(file);
HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
while (x.hasNextLine())
String line = x.nextLine();
String[] lineWords = line.split("[\\t+;.,“”‘’?!\n+]");
Set<String> wordSet = hashMap.keySet();
for (int i = 0; i < lineWords.length; i++)
if (wordSet.contains(lineWords[i]))
Integer number = hashMap.get(lineWords[i]);
number++;
hashMap.put(lineWords[i], number);
else
hashMap.put(lineWords[i], 1);
Iterator<String> iterator = hashMap.keySet().iterator();
while (iterator.hasNext())
String word = iterator.next();
if(useless(word))
System.out.println(word);
File file = new File("D:\\Englis_letters.txt");// 读取文件
if (!file.exists()) // 如果文件打不开或不存在则提示错误
System.out.println("文件不存在");
return;
Scanner x = new Scanner(file);
HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
while (x.hasNextLine())
String line = x.nextLine();
String[] lineWords = line.split("[\\t+;.,“”‘’?!\n+]");
Set<String> wordSet = hashMap.keySet();
for (int i = 0; i < lineWords.length; i++)
if (wordSet.contains(lineWords[i]))
Integer number = hashMap.get(lineWords[i]);
number++;
hashMap.put(lineWords[i], number);
else
hashMap.put(lineWords[i], 1);
Iterator<String> iterator = hashMap.keySet().iterator();
while (iterator.hasNext())
String word = iterator.next();
if(useless(word))
System.out.println(word);
public static void main(String[] args) throws FileNotFoundException
recognize();
recognize();
以上是关于单词 统计续(补)的主要内容,如果未能解决你的问题,请参考以下文章
Elasticsearch Suggester API(自动补全)