基于COCA词频表的文本词汇分布测试工具v0.2

Posted banmei-brandy

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了基于COCA词频表的文本词汇分布测试工具v0.2相关的知识,希望对你有一定的参考价值。

update:

  • 简单整理了一下代码的组织。
  • 处理的单词封装成类,单词的修正,信息的显示都作为其内的方法。

 

写得还比较糙,工具本身可以封装,还有对于单词的变形基本没什么处理,以后有时间再改。

项目托管到github上了。https://github.com/MorpheusDong/TextVocabularyAnalyzer

 

TypeDefine.h

#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <algorithm>
#include <array>
#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>

using namespace std;

// Upper bound on the number of (frequency, word) records that main() reads
// from the COCA word list file.
#define COCA_WORDS_NUM                       20201U
// Number of initial-letter buckets; main() keeps one word map per bucket.
#define WORDS_HEAD_NUM                       26U
                                             
// Bucket index for each initial letter. WORDS_HEAD_x is the position of
// letter x in alphabet_str, so the two can be used interchangeably.
#define WORDS_HEAD_A                         0U
#define WORDS_HEAD_B                         1U
#define WORDS_HEAD_C                         2U
#define WORDS_HEAD_D                         3U
#define WORDS_HEAD_E                         4U
#define WORDS_HEAD_F                         5U
#define WORDS_HEAD_G                         6U
#define WORDS_HEAD_H                         7U
#define WORDS_HEAD_I                         8U
#define WORDS_HEAD_J                         9U
#define WORDS_HEAD_K                         10U
#define WORDS_HEAD_L                         11U
#define WORDS_HEAD_M                         12U
#define WORDS_HEAD_N                         13U
#define WORDS_HEAD_O                         14U
#define WORDS_HEAD_P                         15U
#define WORDS_HEAD_Q                         16U
#define WORDS_HEAD_R                         17U
#define WORDS_HEAD_S                         18U
#define WORDS_HEAD_T                         19U
#define WORDS_HEAD_U                         20U
#define WORDS_HEAD_V                         21U
#define WORDS_HEAD_W                         22U
#define WORDS_HEAD_X                         23U
#define WORDS_HEAD_Y                         24U
#define WORDS_HEAD_Z                         25U
                                             
// Number of entries in usual_w_out_of_COCA_str.
#define USUAL_WORD_NUM                       17U


// Frequency band of a word. The numeric values double as indices into the
// per-band counter array and into report_str, so they must stay contiguous
// and start at 0. WORD_LEVEL_NUM is not a band: it is the number of bands.
enum WordFrequencyType
{
    WORD_UNDER_4000     = 0,
    WORD_4000_6000      = 1,
    WORD_6000_8000      = 2,
    WORD_8000_10000     = 3,
    WORD_10000_12000    = 4,
    WORD_12000_14000    = 5,
    WORD_14000_16000    = 6,
    WORD_OVER_16000     = 7,
    WORD_NOT_FOUND_COCA = 8,
    WORD_LEVEL_NUM      = 9
};

// Alias used throughout the rest of the code base.
typedef WordFrequencyType TagWordFrequencyType;

// Lower-case alphabet. The character at position i is the initial letter
// handled by bucket i (see the WORDS_HEAD_x indices).
const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

// Report labels, indexed by TagWordFrequencyType.
// The last label starts with an escaped newline so that the "not found" line
// is visually separated from the frequency bands in the report.
const string report_str[WORD_LEVEL_NUM] = {
    "UNDER 4000: ",
    "4000-6000: ",
    "6000-8000: ",
    "8000-10000: ",
    "10000-12000: ",
    "12000-14000: ",
    "14000-16000: ",
    "16000-20000+: ",
    "\nNot found in COCA:"
};

// Usual words that are not included in COCA. A word matching one of these is
// counted as WORD_UNDER_4000 instead of "not found".
// NOTE(review): entries such as "s", "re" and "t" are presumably the tails of
// contractions ("it's", "they're", "won't") left over because the tokenizer
// splits on the apostrophe -- confirm before extending the list.
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
    "s","is","are","re","was","were",
    "an","won","t","has","had","been",
    "did","does","cannot","got","men"
};


#endif

 

TextVocabularyAnalyzer.h

#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_

#include "TypeDefine.h"

// Maps a COCA frequency value to its band; 0 means "not found in COCA".
TagWordFrequencyType frequency_classify(int wfrq);

// Adds one to the counter in wfrq_array that belongs to wfrq_tag.
void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);

// Returns true when c is an ASCII letter (a-z or A-Z).
bool isaletter(const char& c);

// Holds one word taken from the analysed text, stored in lower case, and
// offers the checks and fix-ups applied to it during the frequency analysis.
class CLetters
{
public:
    CLetters();
    ~CLetters();

    // Replaces the stored word with the characters of vw, lower-cased.
    void fill(vector<char>& vw);

    // Returns a copy of the stored word.
    const string word();

    // Returns the first character of the stored word.
    const char firstletter();

    // Prints a progress message naming the stored word.
    void processing();

    // Returns true when the stored word is listed in usual_w_out_of_COCA_str.
    bool usual_recheck();

    // Strips a trailing "s", "ed" or "ing" from the stored word.
    // Returns true when the word was changed.
    bool form_recheck();

private:
    string m_word;    // the current word, lower case
};



#endif // !_TEXT_VOCABULARY_ANALYZER_H_

 

TextVocabularyAnalyzer.cpp

/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include "TextVocabularyAnalyzer.h"

// Maps a COCA frequency value to its frequency band.
//
// wfrq    : frequency value of a word; 0 means the word was not found.
// returns : WORD_NOT_FOUND_COCA for 0, the matching 2000-wide band for
//           1..16000 (upper bounds inclusive, first band is 1..4000), and
//           WORD_OVER_16000 for every other value. Note that "every other
//           value" includes negative input, which is not rejected.
TagWordFrequencyType frequency_classify(const int wfrq)
{
    if (wfrq == 0)
    {
        return WORD_NOT_FOUND_COCA;
    }

    //everything outside 1..16000 ends up in the last band
    if (wfrq < 0 || wfrq > 16000)
    {
        return WORD_OVER_16000;
    }

    //from here on 1 <= wfrq <= 16000, so only the upper bound needs testing
    if (wfrq <= 4000)  { return WORD_UNDER_4000; }
    if (wfrq <= 6000)  { return WORD_4000_6000; }
    if (wfrq <= 8000)  { return WORD_6000_8000; }
    if (wfrq <= 10000) { return WORD_8000_10000; }
    if (wfrq <= 12000) { return WORD_10000_12000; }
    if (wfrq <= 14000) { return WORD_12000_14000; }
    return WORD_14000_16000;
}

// Adds one to the counter that belongs to the given frequency band.
//
// wfrq_array : per-band counters, indexed by TagWordFrequencyType.
// wfrq_tag   : band of the word being counted. Any value that is not one of
//              the eight real bands (WORD_UNDER_4000 .. WORD_OVER_16000) is
//              counted under WORD_NOT_FOUND_COCA.
void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
    const bool is_real_band = (wfrq_tag >= WORD_UNDER_4000) && (wfrq_tag <= WORD_OVER_16000);
    const TagWordFrequencyType slot = is_real_band ? wfrq_tag : WORD_NOT_FOUND_COCA;

    wfrq_array[slot] += 1;
}

// Tells whether a character is an ASCII letter.
//
// c       : character to test.
// returns : true for 'a'..'z' and 'A'..'Z', false for everything else
//           (digits, punctuation, whitespace, non-ASCII bytes).
bool isaletter(const char& c)
{
    //the bounds must be character literals, not identifiers
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}


//Class CLetters implementation
// Creates an object holding an empty word.
CLetters::CLetters()
    : m_word()
{
}

// Nothing to release: the only member cleans up after itself.
CLetters::~CLetters() = default;

// Replaces the stored word with the characters of vw, converted to lower case.
//
// vw : characters of the word, in order; it is only read, never modified.
void CLetters::fill(vector<char>& vw)
{
    m_word.assign(vw.begin(), vw.end());

    //Convert through a lambda instead of passing the bare name tolower:
    //  - with "using namespace std" the name is an overload set (the <locale>
    //    template is visible too), which some compilers cannot deduce from;
    //  - std::tolower is only defined for values representable as unsigned
    //    char, so the character is converted to unsigned char first.
    transform(m_word.begin(), m_word.end(), m_word.begin(),
        [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
}

// Returns a copy of the stored (lower-case) word.
const string CLetters::word()
{
    return string(m_word);
}

// Returns the first character of the stored word.
// For an empty word the result is the null character.
const char CLetters::firstletter()
{
    return m_word.empty() ? '\0' : m_word.front();
}

void CLetters::processing()
{
    cout << "Finding word "" << m_word << ""...	";
}


bool CLetters::usual_recheck()
{
    //check if the word is usual
    bool RetVal = false;
    for (int i = 0; i < USUAL_WORD_NUM; i++)
    {
        if (m_word == usual_w_out_of_COCA_str[i])
        {
            RetVal = true;
        }
        else
        {
            //do nothing
        }
    }
    return RetVal;
}

bool CLetters::form_recheck()
{
    bool RetVal = false;
    if (m_word.length() > 3)
    {
        char e1, e2, e3;
        e3 = m_word[m_word.length() - 3];    //last but two letter
        e2 = m_word[m_word.length() - 2];    //last but one letter
        e1 = m_word[m_word.length() - 1];    //last letter

        if (e1 == s)
        {
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else if (e2 == e && e1 == d)
        {
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else if (e3 == i && e2 == n && e1 == g)
        {
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else
        {
            //do nothing
        }
    }
    return RetVal;
}

 

main.cpp

/* main.cpp */

#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h"

int main()
{
    //file init
    ifstream COCA_txt("D:\COCA.txt");
    ifstream USER_txt("D:\JobsSpeech.txt");

    //time init
    clock_t startTime, endTime;
    double build_map_time = 0;
    double process_time = 0;

    startTime = clock();    //build time start

    //build COCA words map
    map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
    int readlines = 0;

    while (readlines < COCA_WORDS_NUM)
    {
        int frequency = 0; string word = "";
        COCA_txt >> frequency;
        COCA_txt >> word;

        //transform to lower uniformly 
        transform(word.begin(), word.end(), word.begin(), tolower);

        //import every word
        for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
        {
            //check word head 
            if (word[0] == alphabet_str[whead])
            {
                //if a word already exists, only load its lower frequency
                if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
                {
                    COCA_WordsList[whead].insert(make_pair(word, frequency));
                }
                else
                {
                    COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
                }
            }
            else
            {
                // do nothing
            }
        }
        readlines++;
    }

    endTime = clock();    //build time stop
    build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //user prompt
    cout << "COCA words list imported.
Press any key to start frequency analysis...
";
    cin.get();

    startTime = clock();    //process time start

    //find text words
    vector<char> content_read;
    CLetters word_readed;
    vector<int> frequecy_processed = { 0 };
    array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
    char char_read =  ;

    //get text char one by one
    while (USER_txt.get(char_read))
    {
        //only letters and ‘-‘ between letters will be received
        if (isaletter(char_read) || char_read == -)
        {
            content_read.push_back(char_read);
        }
        else
        {
            //char which is not a letter marks the end of a word
            if (!content_read.empty())    //skip single letter 
            {
                int current_word_frequency = 0;

                //assign letters to make the word
                word_readed.fill(content_read);
                word_readed.processing();

                cout << "Frequency:";
                //check the word‘s head and find its frequency in COCA list
                for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
                {
                    if (word_readed.firstletter() == alphabet_str[whead])
                    {
                        cout << COCA_WordsList[whead][word_readed.word()];
                        current_word_frequency = COCA_WordsList[whead][word_readed.word()];

                        //check if the word has been processed
                        if (current_word_frequency == 0)
                        {
                            //addtional check
                            if (word_readed.usual_recheck())
                            {
                                word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                            }
                            else if (word_readed.form_recheck())
                            {
                                current_word_frequency = COCA_WordsList[whead][word_readed.word()];    //try again
                                if (current_word_frequency > 0)
                                {
                                    frequecy_processed.push_back(current_word_frequency);
                                    word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                                }
                                else
                                {
                                    // do nothing
                                }
                            }
                            else
                            {
                                word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                            }
                        }
                        else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency)
                            == frequecy_processed.end())
                        {
                            //classify this word and make statistics
                            frequecy_processed.push_back(current_word_frequency);
                            word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                        }
                        else
                        {
                            // do nothing
                        }
                    }
                    else
                    {
                        //do nothing
                    }
                }
                cout << endl;

                content_read.clear();
            }
            else
            {
                //do nothing
            }
        }
    }

    endTime = clock();    //process time stop
    process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //calc whole words processed
    int whole_words_analyzed = 0;
    whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

    //report result
    cout << "
////////// Report ////////// 
";
    for (int i = 0;i< words_analysis_array.size();i++)
    {
        cout << report_str[i] <<"	"<< words_analysis_array[i] << " (";
        cout<<fixed<<setprecision(2)<<(float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
    }
    cout << "
Words totally analyzed: " << whole_words_analyzed << endl;

    //show run time
    cout << "Map build time: " << build_map_time*1000 << "ms.
";
    cout << "Process time: " << process_time*1000 << "ms.
";
    cout << "////////////////////////////" << endl;

    //close file
    COCA_txt.close();
    USER_txt.close();

    return 0;
}

 

以上是关于基于COCA词频表的文本词汇分布测试工具v0.2的主要内容,如果未能解决你的问题,请参考以下文章

中文词频统计

中文词频统计

中文词频统计

Java实现的词频统计

中文词频统计

中文词频统计