一个文本分词程序

Posted 风轻云淡走天涯

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了一个文本分词程序相关的知识,希望对你有一定的参考价值。

 WordMap类从分词库中读入分词 

将分词存入unordered_map<std::string, int> 中

#pragma once
#include<istream>
#include<unordered_map>
#include<string>
#include<ctime>
// Loads a word dictionary from a text file into an in-memory hash map.
class WordMap {
public:
    // Stores the dictionary path; the file itself is only read by init().
    WordMap(const std::string& filename);
    ~WordMap();
    // Reads the file named by m_filename into m_map.
    // Returns false when the file cannot be opened, true otherwise.
    bool init();
    // Maps each dictionary word to the integer that follows it on its line.
    std::unordered_map<std::string, int> m_map;
    // Path of the dictionary file, as passed to the constructor.
    std::string m_filename;

private:
    // Threshold in seconds between progress messages printed by init();
    // the constructor initialises it to 5.
    // NOTE(review): this member hides ::difftime from <ctime> inside the class.
    time_t difftime;
    // Formats the hour, minute and second fields of a tm, separated by ':'.
    std::string timestr( tm*);
};
#include"wordmap.h"
#include<fstream>
#include<iostream>
#include<sstream>
#include<ctime>
// Remembers the dictionary path and sets the progress-report threshold to
// 5 seconds. The file is not touched here.
WordMap::WordMap(const std::string& filename)
    : m_filename(filename),
      difftime(5)
{
}

// Nothing to release explicitly: every member cleans up after itself.
WordMap::~WordMap() = default;

// Loads the dictionary file named by m_filename into m_map.
//
// Every line is parsed as three whitespace-separated fields: an integer, the
// word, and a second integer. The word becomes the key and the second integer
// the mapped value. Lines that do not start with "<integer> <word>" (blank
// lines included) are skipped; a missing trailing integer is stored as 0.
//
// Start and end times are printed to stdout, plus a progress line whenever
// more than `difftime` seconds have passed since the previous report.
//
// Returns true once the whole file has been read, false (after writing a
// message to stderr) when the file cannot be opened.
//
// NOTE(review): localtime_s is called with the (tm*, const time_t*) argument
// order, which looks like the MSVC variant - confirm before porting.
bool WordMap::init()
{
    std::ifstream input(m_filename);
    if (!input.is_open())
    {
        std::cerr << "can't open file: " << m_filename << std::endl;
        return false;
    }

    time_t last;
    time(&last);
    tm nowtime;
    localtime_s(&nowtime, &last);
    std::cout << "开始初始化分词库,当前时间" << timestr(&nowtime) << std::endl;

    std::string line;
    while (std::getline(input, line))
    {
        std::istringstream fields(line);
        int num = 0;
        int num2 = 0;
        std::string word;

        // Without this check a blank or malformed line would insert an empty
        // key with an indeterminate value.
        if (!(fields >> num >> word))
        {
            continue;
        }
        // The trailing integer is optional; fall back to 0 when it is absent.
        if (!(fields >> num2))
        {
            num2 = 0;
        }
        m_map[word] = num2;

        time_t cur;
        time(&cur);
        if (cur - last > difftime)
        {
            std::cout << "已初始化分词个数:" << m_map.size() << std::endl;
            last = cur;
        }
    }

    time_t finished;
    time(&finished);
    localtime_s(&nowtime, &finished);
    std::cout << "结束初始化分词库,当前时间" << timestr(&nowtime) << std::endl;
    return true;
}

// Formats the time-of-day fields of *nowtime as zero-padded "HH:MM:SS".
//
// nowtime must point to a valid tm; only tm_hour, tm_min and tm_sec are read.
std::string WordMap::timestr(tm* nowtime)
{
    std::ostringstream out;
    out.fill('0');

    const int fields[] = { nowtime->tm_hour, nowtime->tm_min, nowtime->tm_sec };
    for (int i = 0; i < 3; ++i)
    {
        if (i != 0)
        {
            out << ':';
        }
        // The field width resets after every insertion, so set it each time.
        out.width(2);
        out << fields[i];
    }

    // Return the temporary directly; wrapping it in std::move only blocks
    // copy elision.
    return out.str();
}

从文本中读入,对文本进行分词,分词方法详见

http://yangshangchuan.iteye.com/blog/2031813

以下是实现

#pragma once
#include<string>
using std::string;
#include<vector>
using std::vector;
#include"wordmap.h"
// Segments the lines of a text file into words found in a WordMap dictionary.
class FindWord 
{
public:
    FindWord() {};
    ~FindWord() {};
    // Reads `filename` line by line and returns, in order of appearance, the
    // substrings that are keys of wordmap.m_map. Returns an empty vector when
    // the file cannot be opened.
    vector<string> GetKeyWords(const string& filename,const WordMap& wordmap);
private:
    // Maximum number of characters (not bytes) in one lookup window.
    int wsize = 5;
    // True when the two bytes at c fall in the ranges 0xB0-0xF7 / 0xA1-0xFE.
    // NOTE(review): presumably a GB2312/GBK double-byte Chinese character -
    // confirm the input encoding.
    bool ischinese(const char* c);
public:
    // Returns the length in bytes of the window of at most wsize characters
    // that starts at byte offset begpos of ostring.
    int getlocalfindstring(const string& ostring, int begpos);
};
FindWord 类的实现:
#include "findword.h"
#include<fstream>
#include<sstream>
#include<iostream>
using std::ifstream;
using std::istringstream;
// Segments the text file `filename` using the dictionary in `wordmap`.
//
// Each line is scanned left to right. At every position a window of at most
// wsize characters is examined and the longest prefix of that window that is
// a key of wordmap.m_map is taken as a word. When no prefix matches, the scan
// moves forward by exactly one character, so words starting later in the
// window are still found.
//
// Returns the matched words in order of appearance. Returns an empty vector
// (after writing a message to stderr) when the file cannot be opened.
//
// NOTE(review): character boundaries come from ischinese(), which appears to
// assume a GB2312/GBK-encoded input file - confirm.
vector<string> FindWord::GetKeyWords(const string & filename, const WordMap& wordmap)
{
    vector<string> l_keyword;
    ifstream inputfile(filename);
    if (!inputfile.is_open())
    {
        std::cerr << "can't open file: " << filename << std::endl;
        return l_keyword;
    }

    string line;
    while (std::getline(inputfile, line))
    {
        int begpos = 0;
        int length;
        while ((length = getlocalfindstring(line, begpos)) != 0)
        {
            // Byte width of the character at begpos; used to step over it
            // when nothing in the dictionary starts here.
            const int firstlen = ischinese(&line[begpos]) ? 2 : 1;

            // Grow the candidate one character at a time and remember the
            // longest dictionary hit. The width is computed before the
            // lookup so ischinese() is never called at the end of the window,
            // which may be the string terminator.
            int findlen = -1;
            int movelen = 0;
            while (movelen < length)
            {
                movelen += ischinese(&line[begpos + movelen]) ? 2 : 1;
                if (wordmap.m_map.find(line.substr(begpos, movelen)) != wordmap.m_map.end())
                {
                    findlen = movelen;
                }
            }

            if (findlen != -1)
            {
                l_keyword.push_back(line.substr(begpos, findlen));
                begpos += findlen;
            }
            else
            {
                begpos += firstlen;
            }
        }
    }
    return l_keyword;
}

// Reports whether the bytes at c[0] and c[1] form a double-byte character
// with a lead byte in 0xB0-0xF7 and a trail byte in 0xA1-0xFE.
// NOTE(review): these look like the GB2312 hanzi ranges - confirm the
// intended encoding.
//
// c must point into a NUL-terminated buffer. The trail byte is read only
// after the lead byte has been accepted, so pointing c at the terminator
// never reads past the end of the buffer.
bool FindWord::ischinese(const char* c)
{
    const unsigned char lead = static_cast<unsigned char>(c[0]);
    if (lead < 0xB0 || lead > 0xF7)
    {
        return false;
    }
    // A NUL trail byte (character cut off at the end of the string) fails the
    // range test below, so it needs no separate check.
    const unsigned char trail = static_cast<unsigned char>(c[1]);
    return trail >= 0xA1 && trail <= 0xFE;
}

int FindWord::getlocalfindstring(const string& ostring,int begpos)
{
    int size = wsize;
    int endpos = begpos;
    while (size > 0 && ostring[endpos])
    {
        if (ischinese(&ostring[endpos])) {
            endpos++;
        }
        size--;
        endpos++;
    }
    return endpos-begpos;
}

样例程序:
#include"wordmap.h"
#include<iostream>
#include<string>
#include"findword.h"
using std::string;
int main()
{

    WordMap m_wordmap("../../../word/word1.txt");
    FindWord m_findword;
    if (!m_wordmap.init()) { return 0; };
    vector<string> res= m_findword.GetKeyWords("../../../inputfile/1999.txt", m_wordmap);
    for (auto elems : res)
        std::cout << elems << " ";
    return 0;
}

github:https://github.com/wuzhuorui/kjct.git

以上是关于一个文本分词程序的主要内容,如果未能解决你的问题,请参考以下文章

OpenNLP:驾驭文本,分词那些事

机器学习之自然语言处理——中文分词jieba库详解(代码+原理)

201671010432词频统计软件项目报告

一个简单的文本聚类实现(python)

内审实务文本挖掘在串通投标行为识别中的应用

浅谈文本分析分词及关系图