统计文本中重复的内容

Posted panzer

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了统计文本中重复的内容相关的知识,希望对你有一定的参考价值。

 

1.统计一个文本文件中重复出现的行(按行统计,输出出现次数大于 1 的行)

package count;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Counts how often each line occurs in a text file (decoded as UTF-8) and prints
 * the lines that occur more than once.
 */
public class countWord {

    /**
     * Runs the duplicate-line report.
     *
     * @param args optional; when {@code args[0]} is present it is used as the file
     *             to scan, otherwise the original hard-coded sample path is used
     */
    public static void main(String[] args) {
        String filepath = (args != null && args.length > 0) ? args[0] : "F:\\A\\B.xml";
        count(filepath);
    }

    /**
     * Reads the given file line by line, tallies identical lines, and prints every
     * line that appears more than once, followed by the number of such lines.
     *
     * <p>Prints {@code "file not exist"} and returns when the path does not exist.
     * Any exception raised while reading is printed as a stack trace and not
     * propagated to the caller.
     *
     * @param filepath path of the file to scan
     */
    public static void count(String filepath) {
        try {
            File file = new File(filepath);
            if (!file.exists()) {
                System.out.println("file not exist");
                return;
            }

            // line text -> number of times that exact line was read
            Map<String, Integer> map = new HashMap<String, Integer>();

            // try-with-resources closes the reader (and the underlying file stream)
            // even when readLine() throws; the original never closed it.
            try (BufferedReader bufReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
                String line;
                while ((line = bufReader.readLine()) != null) {
                    Integer seen = map.get(line);
                    map.put(line, seen == null ? 1 : seen + 1);
                }
            }

            showMap(map);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints each entry whose count is greater than one as {@code line......count},
     * then a summary line with the number of entries printed.
     *
     * @param map line text mapped to its occurrence count; {@code null} prints nothing
     */
    private static void showMap(Map<String, Integer> map) {
        if (map == null) {
            return;
        }
        int count = 0;
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            // only lines seen more than once are reported
            if (entry.getValue() > 1) {
                System.out.println(entry.getKey() + "......" + entry.getValue());
                count++;
            }
        }
        System.out.println("重复两次的数据:" + count);
    }
}

 

2.统计两个文本文件中都出现的行(即第二个文件中与第一个文件重复的行)

package count;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Reports the lines of a second text file that also occur in a first text file.
 * Both files are decoded as UTF-8.
 */
public class countWordTowFile {

    /**
     * Runs the two-file comparison.
     *
     * @param args optional; when at least two arguments are present they are used
     *             as the first and second file path, otherwise the original
     *             hard-coded sample paths are used
     */
    public static void main(String[] args) {
        boolean hasPaths = args != null && args.length >= 2;
        String first = hasPaths ? args[0] : "F:\\A\\B.xml";
        String second = hasPaths ? args[1] : "C:\\D\\E.txt";
        count(first, second);
    }

    /**
     * Loads every line of the first file, then scans the second file and prints
     * each distinct line that is present in both, followed by how many such lines
     * were found.
     *
     * <p>Prints {@code "file not exist"} and returns when either path does not
     * exist. Any exception raised while reading is printed as a stack trace and
     * not propagated to the caller.
     *
     * @param filepath  path of the first file
     * @param filepath2 path of the second file
     */
    public static void count(String filepath, String filepath2) {
        try {
            File file = new File(filepath);
            File file2 = new File(filepath2);
            if (!file.exists() || !file2.exists()) {
                System.out.println("file not exist");
                return;
            }

            // lines of the first file; every value is stored as 1
            Map<String, Integer> map = new HashMap<String, Integer>();
            // lines of the second file that were also found in the first file
            Map<String, Integer> map2 = new HashMap<String, Integer>();

            // try-with-resources closes both readers (and their file streams) even
            // when reading fails; the original never closed either of them.
            try (BufferedReader bufReader = new BufferedReader(
                         new InputStreamReader(new FileInputStream(file), "UTF-8"));
                 BufferedReader bufReader2 = new BufferedReader(
                         new InputStreamReader(new FileInputStream(file2), "UTF-8"))) {
                String line;
                // read the contents of the first file
                while ((line = bufReader.readLine()) != null) {
                    map.put(line, 1);
                }
                // read the contents of the second file
                while ((line = bufReader2.readLine()) != null) {
                    Integer inFirst = map.get(line);
                    if (inFirst != null) {
                        // first-file values are always 1, so the recorded value is always 2
                        map2.put(line, inFirst + 1);
                    }
                }
            }

            showMap(map2);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints every entry as {@code line......value}, then a summary line with the
     * number of entries.
     *
     * @param map entries to print; {@code null} prints nothing
     */
    private static void showMap(Map<String, Integer> map) {
        if (map == null) {
            return;
        }
        int count = 0;
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + "......" + entry.getValue());
            count++;
        }
        System.out.println("重复两次的数据:" + count);
    }
}

 

以上是关于统计文本中重复的内容的主要内容,如果未能解决你的问题,请参考以下文章

干货通俗理解自然语言处理中N-Gram模型

DELPHI清除TXT文件内重复字符串

从 HTML 正文中提取文本片段(在 .NET 中)

如何获取html的文本内容[重复]

201671010432词频统计软件项目报告

iOS代码片段CodeSnippets