Text Vocabulary Distribution Analysis Tool Based on the COCA Word Frequency List, v0.2
Posted by banmei-brandy
Updates:
- Tidied up the code organization a bit.
- The word being processed is now wrapped in a class; correcting the word and displaying its information are methods of that class.
The code is still fairly rough: the tool itself could be packaged up better, and inflected word forms are barely handled yet; I'll improve that when I have time. A rough sketch of one possible direction for fuller suffix handling follows below.
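As a sketch only (not part of v0.2, and the helper name here is made up), inflection handling could generate several candidate base forms and look each of them up in the COCA map, instead of stripping a single ending the way form_recheck() does:

// Hypothetical helper, not in the repository: propose base forms for an inflected word.
#include <string>
#include <vector>

std::vector<std::string> candidate_base_forms(const std::string& w)
{
    std::vector<std::string> out;
    const size_t n = w.length();
    if (n > 3 && w.compare(n - 3, 3, "ies") == 0) out.push_back(w.substr(0, n - 3) + "y");  // studies -> study
    if (n > 2 && w.compare(n - 2, 2, "es") == 0)  out.push_back(w.substr(0, n - 2));        // boxes   -> box
    if (n > 1 && w.back() == 's')                 out.push_back(w.substr(0, n - 1));        // words   -> word
    if (n > 3 && w.compare(n - 3, 3, "ing") == 0) {
        out.push_back(w.substr(0, n - 3));        // looking -> look
        out.push_back(w.substr(0, n - 3) + "e");  // making  -> make
    }
    if (n > 2 && w.compare(n - 2, 2, "ed") == 0) {
        out.push_back(w.substr(0, n - 2));        // jumped  -> jump
        out.push_back(w.substr(0, n - 1));        // used    -> use
    }
    return out;
}

The caller would then try each candidate against the COCA map and keep the first rank it finds; the current form_recheck() only strips one ending and never restores letters such as a dropped final "e".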
The project is hosted on GitHub: https://github.com/MorpheusDong/TextVocabularyAnalyzer
TypeDefine.h
#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <iostream>
#include <fstream>
#include <string>
#include <array>
#include <vector>
#include <iterator>
#include <map>

using namespace std;

#define COCA_WORDS_NUM 20201U
#define WORDS_HEAD_NUM 26U

#define WORDS_HEAD_A 0U
#define WORDS_HEAD_B 1U
#define WORDS_HEAD_C 2U
#define WORDS_HEAD_D 3U
#define WORDS_HEAD_E 4U
#define WORDS_HEAD_F 5U
#define WORDS_HEAD_G 6U
#define WORDS_HEAD_H 7U
#define WORDS_HEAD_I 8U
#define WORDS_HEAD_J 9U
#define WORDS_HEAD_K 10U
#define WORDS_HEAD_L 11U
#define WORDS_HEAD_M 12U
#define WORDS_HEAD_N 13U
#define WORDS_HEAD_O 14U
#define WORDS_HEAD_P 15U
#define WORDS_HEAD_Q 16U
#define WORDS_HEAD_R 17U
#define WORDS_HEAD_S 18U
#define WORDS_HEAD_T 19U
#define WORDS_HEAD_U 20U
#define WORDS_HEAD_V 21U
#define WORDS_HEAD_W 22U
#define WORDS_HEAD_X 23U
#define WORDS_HEAD_Y 24U
#define WORDS_HEAD_Z 25U

#define USUAL_WORD_NUM 17U

typedef enum WordFrequencyType
{
    WORD_UNDER_4000 = 0,
    WORD_4000_6000,
    WORD_6000_8000,
    WORD_8000_10000,
    WORD_10000_12000,
    WORD_12000_14000,
    WORD_14000_16000,
    WORD_OVER_16000,
    WORD_NOT_FOUND_COCA,
    WORD_LEVEL_NUM
} TagWordFrequencyType;

const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

const string report_str[WORD_LEVEL_NUM] = {
    "UNDER 4000: ",
    "4000-6000: ",
    "6000-8000: ",
    "8000-10000: ",
    "10000-12000: ",
    "12000-14000: ",
    "14000-16000: ",
    "16000-20000+: ",
    "Not found in COCA: "
};

//for usual words not included in COCA
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] = {
    "s", "is", "are", "re", "was", "were",
    "an", "won", "t", "has", "had", "been",
    "did", "does", "cannot", "got", "men"
};

#endif
TextVocabularyAnalyzer.h
#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_

#include "TypeDefine.h"

extern TagWordFrequencyType frequency_classify(const int wfrq);
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
extern bool isaletter(const char& c);

class CLetters
{
private:
    string m_word;

public:
    CLetters();
    ~CLetters();
    void fill(vector<char>& vw);
    const string word();
    const char firstletter();
    void processing();
    bool usual_recheck();
    bool form_recheck();
};

#endif // !_TEXT_VOCABULARY_ANALYZER_H_
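For reference, here is a minimal, self-contained sketch of how the two free functions declared above fit together (the ranks are invented purely for illustration; this snippet is not part of the repository):

// Sketch: classify a few made-up COCA ranks and tally them per level.
#include "TextVocabularyAnalyzer.h"

int main()
{
    array<int, WORD_LEVEL_NUM> tally{ 0 };
    const int sample_ranks[] = { 120, 4500, 9800, 0 };    // 0 stands for "not found in COCA"

    for (int rank : sample_ranks) {
        word_frequency_analyze(tally, frequency_classify(rank));
    }

    // Expected: UNDER 4000 -> 1, 4000-6000 -> 1, 8000-10000 -> 1, Not found -> 1, rest 0.
    for (int i = 0; i < WORD_LEVEL_NUM; i++) {
        cout << report_str[i] << tally[i] << endl;
    }
    return 0;
}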
TextVocabularyAnalyzer.cpp
/* TextVocabularyAnalyzer.cpp */
#include <algorithm>
#include "TextVocabularyAnalyzer.h"

TagWordFrequencyType frequency_classify(const int wfrq)
{
    if (wfrq == 0)                          { return WORD_NOT_FOUND_COCA; }
    else if (wfrq > 0 && wfrq <= 4000)      { return WORD_UNDER_4000; }
    else if (wfrq > 4000 && wfrq <= 6000)   { return WORD_4000_6000; }
    else if (wfrq > 6000 && wfrq <= 8000)   { return WORD_6000_8000; }
    else if (wfrq > 8000 && wfrq <= 10000)  { return WORD_8000_10000; }
    else if (wfrq > 10000 && wfrq <= 12000) { return WORD_10000_12000; }
    else if (wfrq > 12000 && wfrq <= 14000) { return WORD_12000_14000; }
    else if (wfrq > 14000 && wfrq <= 16000) { return WORD_14000_16000; }
    else                                    { return WORD_OVER_16000; }
}

void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
    switch (wfrq_tag)
    {
    case WORD_UNDER_4000:  { wfrq_array[WORD_UNDER_4000] += 1;  break; }
    case WORD_4000_6000:   { wfrq_array[WORD_4000_6000] += 1;   break; }
    case WORD_6000_8000:   { wfrq_array[WORD_6000_8000] += 1;   break; }
    case WORD_8000_10000:  { wfrq_array[WORD_8000_10000] += 1;  break; }
    case WORD_10000_12000: { wfrq_array[WORD_10000_12000] += 1; break; }
    case WORD_12000_14000: { wfrq_array[WORD_12000_14000] += 1; break; }
    case WORD_14000_16000: { wfrq_array[WORD_14000_16000] += 1; break; }
    case WORD_OVER_16000:  { wfrq_array[WORD_OVER_16000] += 1;  break; }
    default:               { wfrq_array[WORD_NOT_FOUND_COCA] += 1; break; }
    }
}

bool isaletter(const char& c)
{
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
        return true;
    }
    else {
        return false;
    }
}

//Class CLetters realization
CLetters::CLetters()
{
    m_word = "";
}

CLetters::~CLetters()
{
    //do nothing
}

void CLetters::fill(vector<char>& vw)
{
    //store the word in lower-case form
    m_word.assign(vw.begin(), vw.end());
    transform(m_word.begin(), m_word.end(), m_word.begin(), ::tolower);
}

const string CLetters::word()
{
    return m_word;
}

const char CLetters::firstletter()
{
    return m_word[0];
}

void CLetters::processing()
{
    cout << "Finding word \"" << m_word << "\"... ";
}

bool CLetters::usual_recheck()
{
    //check if the word is a usual word not listed in COCA
    bool RetVal = false;
    for (int i = 0; i < USUAL_WORD_NUM; i++) {
        if (m_word == usual_w_out_of_COCA_str[i]) {
            RetVal = true;
        }
        else {
            //do nothing
        }
    }
    return RetVal;
}

bool CLetters::form_recheck()
{
    bool RetVal = false;
    if (m_word.length() > 3) {
        char e1, e2, e3;
        e3 = m_word[m_word.length() - 3];    //third-to-last letter
        e2 = m_word[m_word.length() - 2];    //second-to-last letter
        e1 = m_word[m_word.length() - 1];    //last letter
        if (e1 == 's') {
            //strip trailing "s"
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else if (e2 == 'e' && e1 == 'd') {
            //strip trailing "ed"
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else if (e3 == 'i' && e2 == 'n' && e1 == 'g') {
            //strip trailing "ing"
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else {
            //do nothing
        }
    }
    return RetVal;
}
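And a quick standalone check of the CLetters class itself, showing the call order main() relies on (the sample word is arbitrary, and this snippet is not part of the repository):

// Sketch: feed raw characters into CLetters and exercise the form fallback.
#include "TextVocabularyAnalyzer.h"

int main()
{
    vector<char> raw = { 'L', 'o', 'o', 'k', 'i', 'n', 'g' };

    CLetters w;
    w.fill(raw);                      // stored lower-cased as "looking"
    cout << w.word() << endl;         // prints "looking"
    cout << w.firstletter() << endl;  // prints 'l', used to pick the per-letter map

    if (w.form_recheck()) {           // strips the trailing "ing"
        cout << w.word() << endl;     // prints "look", which can be looked up again
    }
    return 0;
}

Since fill() lower-cases the characters up front, the per-letter maps built in main() only ever see lower-case keys.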
main.cpp
/* main.cpp */
#include <algorithm>    //for transform and find
#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h"

int main()
{
    //file init
    ifstream COCA_txt("D:\\COCA.txt");
    ifstream USER_txt("D:\\JobsSpeech.txt");

    //time init
    clock_t startTime, endTime;
    double build_map_time = 0;
    double process_time = 0;

    startTime = clock();    //build time start

    //build COCA words map
    map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
    int readlines = 0;
    while (readlines < COCA_WORDS_NUM) {
        int frequency = 0;
        string word = "";
        COCA_txt >> frequency;
        COCA_txt >> word;

        //transform to lower case uniformly
        transform(word.begin(), word.end(), word.begin(), ::tolower);

        //import every word
        for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++) {
            //check word head
            if (word[0] == alphabet_str[whead]) {
                //if a word already exists, only keep its lower frequency rank
                if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end()) {
                    COCA_WordsList[whead].insert(make_pair(word, frequency));
                }
                else {
                    COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
                }
            }
            else {
                //do nothing
            }
        }
        readlines++;
    }
    endTime = clock();    //build time stop
    build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //user prompt
    cout << "COCA words list imported. Press any key to start frequency analysis... ";
    cin.get();

    startTime = clock();    //process time start

    //find text words
    vector<char> content_read;
    CLetters word_readed;
    vector<int> frequecy_processed = { 0 };
    array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
    char char_read = ' ';

    //get text chars one by one
    while (USER_txt.get(char_read)) {
        //only letters and '-' between letters will be received
        if (isaletter(char_read) || char_read == '-') {
            content_read.push_back(char_read);
        }
        else {
            //a char which is not a letter marks the end of a word
            if (!content_read.empty()) {
                int current_word_frequency = 0;

                //assign letters to make the word
                word_readed.fill(content_read);
                word_readed.processing();
                cout << "Frequency:";

                //check the word's head and find its frequency in the COCA list
                for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++) {
                    if (word_readed.firstletter() == alphabet_str[whead]) {
                        cout << COCA_WordsList[whead][word_readed.word()];
                        current_word_frequency = COCA_WordsList[whead][word_readed.word()];

                        //check if the word has been processed
                        if (current_word_frequency == 0) {
                            //additional check
                            if (word_readed.usual_recheck()) {
                                word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                            }
                            else if (word_readed.form_recheck()) {
                                current_word_frequency = COCA_WordsList[whead][word_readed.word()];    //try again
                                if (current_word_frequency > 0) {
                                    frequecy_processed.push_back(current_word_frequency);
                                    word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                                }
                                else {
                                    //do nothing
                                }
                            }
                            else {
                                word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                            }
                        }
                        else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency) == frequecy_processed.end()) {
                            //classify this word and make statistics
                            frequecy_processed.push_back(current_word_frequency);
                            word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                        }
                        else {
                            //do nothing
                        }
                    }
                    else {
                        //do nothing
                    }
                }
                cout << endl;
                content_read.clear();
            }
            else {
                //do nothing
            }
        }
    }
    endTime = clock();    //process time stop
    process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //calc whole words processed
    int whole_words_analyzed = 0;
    whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

    //report result
    cout << "\n////////// Report //////////\n";
    for (int i = 0; i < words_analysis_array.size(); i++) {
        cout << report_str[i] << " " << words_analysis_array[i] << " (";
        cout << fixed << setprecision(2) << (float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
    }
    cout << "\nWords totally analyzed: " << whole_words_analyzed << endl;

    //show run time
    cout << "Map build time: " << build_map_time * 1000 << "ms.\n";
    cout << "Process time: " << process_time * 1000 << "ms.\n";
    cout << "////////////////////////////" << endl;

    //close files
    COCA_txt.close();
    USER_txt.close();

    return 0;
}
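Since main() reads the list with COCA_txt >> frequency; COCA_txt >> word;, the tool expects COCA.txt to be a plain-text file with one number (the word's rank in the frequency list) and one word per line, COCA_WORDS_NUM (20201) lines in total. Something along these lines, where the actual entries of course come from your own copy of the COCA list:

1 the
2 be
3 and
...

The input paths are hard-coded to D:\COCA.txt and D:\JobsSpeech.txt at the top of main(), so change them to wherever your word list and the text you want to analyze actually live.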