歌词VSM实现!!!
Posted 天下岂有长生不灭者
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了歌词VSM实现!!!相关的知识,希望对你有一定的参考价值。
主程序文件 VSM_SetUP.cpp:
1 /*[ar:歌词作者] 2 [ti:歌词(歌曲)的标题] 3 */ 4 //编写程序实现lrc(歌词) 文件的检索,检索模型要求采用向量空间模型。请将源程序和文档压缩后,一并上传。 5 #include<iostream> 6 7 #include"Document_Index.h" 8 9 using namespace std; 10 void main() { 11 cout << "*****本程序实现一个向量空间模型,对(D:\\暂时的)文件夹下的lrc文件进行遍历建立一个对应于歌曲名,作曲者,歌词主体的检索系统,请根据需要进行查询*****" << endl << endl; 12 cout << "*****检索出的文档编号对应的文档名字请在(检索结果.txt)内查找*****" << endl; 13 string query; 14 int select,result; 15 int isornot = 2; 16 Doc_Analysis doc_analysis; 17 doc_analysis.SETUP_Index(); 18 ReQuery getResult(&doc_analysis); 19 20 while (isornot>=1) { 21 cout << "输入查询词项:"; 22 if (isornot != 2) { 23 getline(cin, query); 24 } 25 getline(cin, query); 26 27 cout << "请选择查询模式(1为查歌曲,2为查歌手,3为查歌词主体):"; 28 cin >> select; 29 cout << "请选择返回结果的数量:"; 30 cin >> result; 31 getResult.Query(query, result, select); 32 cout << "is or not(1表示继续查询,0表示退出查询) ? " ; 33 cin >> isornot; 34 } 35 }
类的声明和实现分别保存在 Document_Index.h 和 Document_Index.cpp 中(下面先给出 Document_Index.h,分隔线之后是 Document_Index.cpp):
1 #pragma once 2 #include<iostream> 3 #include<fstream> 4 #include<vector> 5 #include<math.h> 6 #include<string> 7 #include<iomanip> 8 #include <stdio.h> 9 #include<io.h> 10 #include <windows.h> 11 using namespace std; 12 13 const static int Maxsize = 10000; 14 const static int maxsize = 100;//a line and a smalllist 15 16 //得分和相应的文档编号 17 struct ScoreandDoc { 18 float score; 19 int text_number; 20 }; 21 22 //包含词的所在的文档编号,词项在此文档出现的频率tf 23 struct Word_Doc { 24 int text_number; 25 int text_fre; 26 }; 27 28 //设计保存词项倒排记录头部的一个结构 29 struct Index_List {//存储每个词的头项,包含单词和指向倒排记录的指针,存储df,df是包含词项的文档的数目,同时也是倒排记录的长度。next指向下一个词项 30 float df; 31 string word; 32 vector<Word_Doc> head_docID;// = nullptr; 33 Index_List * next = nullptr; 34 }; 35 36 //文档检索类,VSM的主体 37 class Doc_Analysis { 38 39 string BTEMP[Maxsize]; 40 string TEMP[maxsize]; 41 int arsize;// 42 int tisize; 43 Index_List * arofMusic_idList; //歌曲作者索引的链表头 44 Index_List * tiofMusic_idList; //歌曲名字索引的链表头 45 Index_List * idList; //主体歌词倒排索引的链表头 46 int size; //文档的词项的数目,即文档长度 47 int allsize;//总的词数 48 int N; //歌词主体文档集的大小,共有N篇文档 49 int sizeofmusicname; //歌词名字和歌词作者的大小 50 string Inp_Temp_Lyrics[Maxsize]; //存放歌词主体 51 string ti_Temp[maxsize/5];//歌曲名称和歌曲作者的暂时存放之地 52 string ar_Temp[maxsize / 5];//歌曲名称和歌曲作者的暂时存放之地 53 public: 54 Doc_Analysis() { 55 size = 0; 56 allsize = 0; 57 arsize = 0; 58 tisize = 0; 59 N = 0; 60 idList = nullptr; 61 arofMusic_idList = nullptr; 62 tiofMusic_idList = nullptr; 63 }; 64 ~Doc_Analysis() { 65 66 }; 67 68 69 //把从文档中检索的词插入Inp_Temp_Words[Maxsize],如果不在就直接插入,如果已经存在则加一个 70 71 //打开文件输入歌词,对词项进行分析,把歌曲的作者和歌曲名取出,存入对应的倒排索引,但是这个索引很小,所以可以直接构建倒排索引表。参数分别为:文件名,文档的编号。先对文档内歌词的作者和调用分离出的词项最终存储在Inp_Temp_Words[Maxsize],返回文档的词数 72 int Doc_input(string filename, int number); 73 74 //被int Doc_input(string filename, int number);调用,将分离出的词项存储在temp_Words[]中,size表示其大小,j表示其从哪一个数开始放入 75 int Temp_Insert(string temp_words[],char T[],int &size); 76 77 //对此文档的词项的表进行归并排序(按字典序) 78 void Doc_mergesort(string *inputWord, string* Temp, int left, int 
right); 79 80 //将此次输入的文档分词排序后得到的词项表存入最终的倒排索引中 81 Index_List* insert_IndexList(string *inputWord, int n,int NofDoc, Index_List * idList); 82 83 //歌曲名,作者,歌词主体倒排索引总体构建 84 void SETUP_Index();// {}; 85 86 //返回最终查询的文档集大小 87 int SizeOfDocSet() { 88 return N; 89 } 90 91 //返回最终生成的歌词主体倒排索引表 92 Index_List* tiIndex_head() { 93 return tiofMusic_idList; 94 }; 95 96 //返回最终生成的歌ming倒排索引表 97 Index_List* DocIndex_head() { 98 return idList; 99 }; 100 101 //返回最终生成的作者倒排索引表 102 Index_List* arIndex_head() { 103 return arofMusic_idList; 104 }; 105 106 //此函数实现寻找指定文件夹下的指定后缀文件,并且保存其完整的路径 107 void GetAllFormatFiles(string path, vector<string>& files, string format); 108 // 109 }; 110 111 //对输入的查询词项进行分析,返回输入结果 112 class ReQuery { 113 Doc_Analysis* LMA;//歌词倒排索引的链表头 114 float *Scores;//每个查询词项的初始得分 115 float *arLength;//每个文档的长度 116 float *tiLength; 117 float *Length; 118 int N;//文档集大小 119 public: 120 ReQuery(Doc_Analysis* TEMP) {//得到Doc_Analysis返回的文档集长度和链表头 121 N = TEMP->SizeOfDocSet(); 122 Scores = new float[N]; 123 Length = new float[N]; 124 125 LMA = TEMP; 126 for (int i = 0; i < N; i++) { 127 Scores[i] = 0; 128 Length[i] = 0; 129 } 130 }; 131 ~ReQuery() { 132 delete[] Scores; 133 delete[] Length; 134 delete[] arLength; 135 delete[] tiLength; 136 }; 137 138 //查询所有词项,对所有倒排索引表遍历一次,将每个向量的长度计算出来,初始化得分数组 139 void initialLength(Index_List * idList, float *tempLength); 140 141 //输入查询歌词词项,输出查询结果,返回排名前k的文档编号,select 为选择的查询模式,1为查歌名,2为查歌手,3为查歌词主体 142 bool Query(string query, int k,int select=1); 143 144 //查询某一个词是否在其中,有则返回其df,无则返回0 145 int isInner(string elem, Index_List idList[],Index_List &nowTemp); 146 147 //某个词项对于suoyou文档的得分 148 float ScoreofaDoc(Index_List *idList, Index_List* word, int iQ); 149 150 //进行堆排序,将所有的得分进行排序 151 bool HeapSort(float Scores[], int n,int k); 152 153 }; 154 155 //堆排序,建立最大堆 156 class Max_Heap { 157 ScoreandDoc *Heap; 158 int size; 159 int n; 160 void siftdown(int elem); 161 public: 162 Max_Heap(int num, int max, ScoreandDoc *temp) { 163 n = num; 164 size = max; 165 Heap = temp; 
166 buildHeap(); 167 }; 168 void buildHeap() { 169 for (int i = n / 2 - 1; i >= 0; i--) 170 siftdown(i); 171 }; 172 int heapsize()const { 173 return n; 174 } 175 bool isLeaf(int pos)const { 176 return (pos >= n / 2) && (pos < n); 177 } 178 int leftchild(int pos)const { 179 return 2 * pos + 1; 180 } 181 int rightchild(int pos)const { 182 return 2 * pos + 2; 183 } 184 ScoreandDoc removemax(float it); 185 186 };
——————————————————————————————————————————————————————————————————————————————————
1 #pragma once 2 #include"Document_Index.h" 3 #include<iostream> 4 #include<fstream> 5 #include<vector> 6 #include<math.h> 7 #include<string> 8 #include<iomanip> 9 #include <stdio.h> 10 #include<io.h> 11 #include <windows.h> 12 using namespace std; 13 14 ///////////类:Doc_Analysis/////////////////////// 15 16 //打开文件输入歌词,对词项进行分析,把歌曲的作者和歌曲名取出,存入对应的倒排索引,但是这个索引很小,所以可以直接构建倒排索引表。参数分别为:文件名,文档的编号。调用分离出的词项最终存储在Inp_Temp_Words[Maxsize],返回歌词文档的词数 17 int Doc_Analysis::Doc_input(string filename, int number) { 18 ifstream fin(filename); 19 if (!fin.is_open()) { 20 exit(0); 21 } 22 N++; 23 // cout << "此文件夹文档数目:" << N << endl; 24 char c[maxsize] = { ‘\0‘ }; 25 int ic = 0, i = 2; 26 int numberofDoc = 0; 27 string str; 28 getline(fin, str); 29 30 //分离出作者 31 for (; str[i] != ‘]‘; i++) { 32 if (str[i] == ‘:‘&&str[i - 1] == ‘r‘&&str[i - 2] == ‘a‘) { 33 for (; str[i] != ‘]‘; i++) { 34 if (((int)str[i] >= 65) && ((int)str[i] <= 91)) 35 c[ic++] = (int)str[i] + 32; 36 else 37 c[ic++] = str[i]; 38 } 39 i--; 40 } 41 } 42 Temp_Insert(ar_Temp,c,arsize); 43 44 Doc_mergesort(ar_Temp, TEMP, 0, arsize - 1); 45 46 47 //分离出歌名 48 getline(fin, str); 49 i = 2; 50 ic = 0; 51 for (; str[i] != ‘]‘; i++) { 52 if (str[i] == ‘:‘&&str[i - 1] == ‘i‘&&str[i - 2] == ‘t‘) { 53 for (; str[i] != ‘]‘; i++) { 54 if (((int)str[i] >= 65) && ((int)str[i] <= 91)) 55 c[ic++] = (int)str[i] + 32; 56 else 57 c[ic++] = str[i]; 58 } 59 i--; 60 } 61 } 62 63 //cout << "歌名:" << c << endl; 64 Temp_Insert(ti_Temp,c, tisize); 65 66 //遍历整个geci文档主体,每次读取一行,然后进行分析 67 getline(fin, str); 68 69 do { 70 //cout <<"收到: " <<str << endl; 71 ic = 0; 72 for (i = 0; str[i] != ‘]‘; i++); 73 for (i++; str[i] != ‘\r‘&&str[i] != ‘\n‘&&str[i] != ‘\0‘; i++) { 74 75 //去掉引号后面的字符,但是如果是t的话就不去 76 if ((int)str[i] == 39) { 77 while (str[i] != ‘ ‘&&str[i] != ‘\r‘&&str[i] != ‘\n‘&&str[i] != ‘\0‘) { 78 i++; 79 if (str[i] == ‘t‘) { 80 i--; 81 break; 82 } 83 } 84 if (str[i] == ‘\r‘ || str[i] == ‘\n‘ || str[i] == ‘\0‘) 85 break; 86 } 87 88 //除去大小写 89 if (((int)str[i] 
>= 65) && ((int)str[i] <= 91)) 90 c[ic++] = (int)str[i] + 32; 91 else 92 c[ic++] = str[i]; 93 } 94 c[ic] = ‘\0‘; 95 96 Temp_Insert(Inp_Temp_Lyrics, c, numberofDoc); 97 getline(fin, str); 98 } while (!fin.eof()); 99 fin.close(); 100 allsize += numberofDoc; 101 102 //cout << "本文档最终分离出词数:" << size << endl; 103 size = numberofDoc; 104 return numberofDoc; 105 }; 106 107 //被int Doc_input(string filename, int number);调用,将分离出的词项存储在temp_Words[]中,size表示其大小 108 int Doc_Analysis::Temp_Insert(string temp_words[], char T[],int &size) { 109 const char *d = "[] -;,:/?!.()\"";//以这些字符为分界符[] -;,:/?!.()\" 110 char *p = NULL; 111 char *next_p = NULL; 112 p = strtok_s(T, d, &next_p); 113 while (p) 114 { 115 //cout << p << endl; 116 temp_words[size++] = p;//put the char* into temp table 117 p = strtok_s(NULL, d, &next_p); 118 } 119 120 return size; 121 }; 122 123 //对此文档的词项的表进行归并排序(按字典序) 124 void Doc_Analysis::Doc_mergesort(string *inputWord, string* Temp, int left, int right) { 125 int i, j, k, mid = (left + right) / 2; 126 if (left == right) 127 return; 128 Doc_mergesort(inputWord, Temp, left, mid); 129 Doc_mergesort(inputWord, Temp, mid + 1, right); 130 for (i = mid; i >= left; i--) 131 Temp[i] = inputWord[i]; 132 for (j = 1; j <= right - mid; j++) 133 Temp[right - j + 1] = inputWord[j + mid]; 134 for (i = left, j = right, k = left; k <= right; k++) 135 if (Temp[i]<= Temp[j]) 136 inputWord[k] = Temp[i++]; 137 else 138 inputWord[k] = Temp[j--]; 139 }; 140 141 //将此次输入的文档分词排序后得到的词项表存入最终的倒排索引中,numberofDoc为此文档分离出的词的数目,(不是词项)NofDoc为文档的编号 142 Index_List* Doc_Analysis::insert_IndexList(string *inputWord, int numberofDoc,int NofDoc, Index_List * idListx) { 143 int i = 0, j = 0; 144 Index_List* pre_idList = idListx,*idList=idListx, *idListHead = idListx; 145 //cout << "词数" << numberofDoc << endl; 146 if (i < numberofDoc) { 147 //cout << " 当前文档的词: " << inputWord[i] << endl; 148 while ((idList != nullptr)&&(i<numberofDoc)) {//将整个倒排索引在此遍历完全,在文档也未结束的情况下 149 
//1.词项和目前监测的节点值一样,则直接在其后的此词项的后面加上本文档的相关信息即可 150 if (inputWord[i] == idList->word) { 151 //cout << "此时词项" << inputWord[i] << "已存在索引表中" << endl; 152 Word_Doc *temp = new Word_Doc; 153 temp->text_number = NofDoc;//这个词项的文档编号,把所有相同的词项合并在一起 154 temp->text_fre = 0;//肯定已经在这个文档出现了一次 155 do { 156 temp->text_fre++; 157 i++; 158 if (i == numberofDoc) 159 break; 160 } while (inputWord[i] == idList->word);//只有当文档检测的词项不一样时退出 161 162 idList->df++;//出现该词项的文档数增1,应该为df 163 164 idList->head_docID.push_back(*temp); 165 166 /*cout << (idList->head_docID)[idList->head_docID.size() - 1].text_number << endl;*/ 167 168 pre_idList = idList;//前一个链表值 169 170 //print2(idList);//查看这个idList的具体值 171 172 idList = idList->next;//索引表下移 173 } 174 //2.当这个词项比当前索引的词项小时,说明词项肯定在倒排索引中排在当前词项的前面,则将其插入在其之前,注意区分第一个和中间的 175 else if (inputWord[i] < idList->word) { 176 //cout << inputWord[i] << "比索引表的——" << idList->word << " 小" << endl; 177 Index_List* newidList = new Index_List; 178 vector<Word_Doc> forID ;//因为是单独建一个词项的索引,故建立存储倒排索引的容器 179 Word_Doc *temp = new Word_Doc; 180 temp->text_number = NofDoc; 181 temp->text_fre = 0;//肯定已经出现过一次,把所有相同的词项合并在一起 182 if (idList->word==pre_idList->word) { 183 //cout << "这个词即将插入索引头。" << endl; 184 idListHead = newidList; 185 } 186 else { 187 pre_idList->next = newidList; 188 } 189 do { 190 temp->text_fre++; 191 i++; 192 if (i == numberofDoc) 193 break; 194 } while (inputWord[i] == inputWord[i - 1]);//只有当文档检测的词项不一样时退出 195 196 forID.push_back(*temp); 197 198 newidList->df = 1; 199 newidList->next = idList; 200 201 pre_idList = newidList; 202 newidList->word = inputWord[i-1]; 203 newidList->head_docID = forID; 204 205 206 } 207 //3.当目前文档的词比索引的词项大时,倒排索引表向后走 208 else { 209 //cout << inputWord[i] << "比索引表的——" << idList->word << " 大" << endl; 210 211 pre_idList = idList; 212 //cout << idList->word << endl; 213 idList = idList->next; 214 } 215 } 216 //idList==nullptr,,,if条件句成立意味着倒排索引表已经到达尾部,接下来的所有词项都大于索引表内任何词项,可以直接插入,注意区分第一个和中间的 217 while (i < numberofDoc) { 218 idList = new Index_List; 
219 if (idListHead == nullptr) {//如果是 220 pre_idList = idList; 221 idListHead = idList; 222 } 223 else { 224 pre_idList->next = idList; 225 } 226 227 vector<Word_Doc> forID;//建立存储这个词项的倒排索引的容器 228 Word_Doc *temp = new Word_Doc; 229 temp->text_number = NofDoc; 230 temp->text_fre = 0; 231 do { 232 temp->text_fre++; 233 i++; 234 if (i == numberofDoc) 235 break; 236 237 } while (inputWord[i] == inputWord[i - 1]);//把所有相同的词项合并在一起,只有词项不一致时才退出 238 forID.push_back(*temp); 239 240 idList->df = 1; 241 242 pre_idList = idList; 243 idList->word = inputWord[i-1]; 244 idList->head_docID = forID; 245 246 //print2(idList); 247 idList = idList->next; 248 } 249 } 250 //print1(idListHead); 251 return idListHead; 252 }; 253 254 //歌曲名,作者,歌词主体倒排索引总体构建 255 void Doc_Analysis::SETUP_Index(){ 256 int i; 257 string tx_filePath = "\0", filePath = "D:\\暂时的", distAll = "检索结果.txt", format = ".lrc"; 258 vector<string> files; 259 260 GetAllFormatFiles(filePath, files, format); 261 distAll = filePath + "\\" + distAll; 262 ofstream ofn(distAll); 263 int tsize = files.size(); 264 cout << "文件夹下的.lrc数目:" << tsize << endl;//查询出文件夹下文档的数目 265 for (i = 0; i < tsize; i++)//一次遍历,每检索一个文档将其存入相应的缓冲区,然后建立倒排索引 266 { 267 ofn <<"文档"<<i<<": "<< files[i] << endl; // 写入文件 268 Doc_input(files[i], i); 269 Doc_mergesort(ar_Temp, TEMP, 0, arsize-1); 270 Doc_mergesort(ti_Temp, TEMP, 0, tisize-1); 271 Doc_mergesort(Inp_Temp_Lyrics,BTEMP, 0, size-1); 272 273 //插入倒排索引 274 275 arofMusic_idList = insert_IndexList(ar_Temp, arsize, i, arofMusic_idList); 276 tiofMusic_idList = insert_IndexList(ti_Temp, tisize, i, tiofMusic_idList); 277 idList=insert_IndexList(Inp_Temp_Lyrics, size, i, idList); 278 279 280 arsize = 0; 281 tisize = 0; 282 size = 0; 283 284 //cout << "索引链表内容如下:" << endl; 285 //cout << "作者:" << endl; 286 //print1(arofMusic_idList); 287 //cout << "歌名:" << endl; 288 //print1(tiofMusic_idList); 289 //cout << "主体:" << endl; 290 //print1(idList) 291 //insert_IndexList(ti_Temp, arsize, i, tiofMusic_idList); 292 
//insert_IndexList(Inp_Temp_Lyrics, arsize, i, idList); 293 } 294 //至此,索引构建完毕 295 /*cout << "主体:" << endl; 296 print1(idList);*/ 297 ofn <<endl<< "文件夹下的.lrc数目:" << tsize << endl; 298 cout << endl; 299 ofn << "检索出词数(非词项数):" << allsize << endl; 300 ofn.close(); 301 cout << "一共检索出词数(非词项数):" << allsize << endl; 302 cout << "歌曲名索引构建完毕!!!" << endl; 303 cout << "作曲者索引构建完毕!!!" << endl; 304 cout << "歌词主体索引构建完毕!!!" << endl; 305 cout << endl; 306 }; 307 308 //此函数实现寻找指定文件夹下的指定后缀文件,并且保存其完整的路径 309 void Doc_Analysis::GetAllFormatFiles(string path, vector<string>& files, string format) 310 { 311 //文件句柄 312 long hFile = 0; 313 //文件信息 314 struct _finddata_t fileinfo; 315 string p; 316 if ((hFile = _findfirst(p.assign(path).append("\\*" + format).c_str(), &fileinfo)) != -1) 317 { 318 do 319 { 320 if ((fileinfo.attrib & _A_SUBDIR)) 321 { 322 if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0) 323 { 324 files.push_back(p.assign(path).append("\\").append(fileinfo.name)); 325 GetAllFormatFiles(p.assign(path).append("\\").append(fileinfo.name), files, format); 326 } 327 } 328 else 329 { 330 files.push_back(p.assign(path).append("\\").append(fileinfo.name));; //将文件路径保存,也可以只保存文件名: p.assign(fileinfo.name) 331 } 332 } while (_findnext(hFile, &fileinfo) == 0); 333 334 _findclose(hFile); 335 } 336 }; 337 338 339 ////////////////////类:ReQuery//////////////////// 340 341 //查询所有词项,对倒排索引表遍历一次,初始化得分数组 342 343 void ReQuery::initialLength(Index_List * idList, float *tempLength) { 344 float idf;//记录log N/df 345 int size; 346 int i; 347 348 while (idList != nullptr) { 349 idf = log(N/idList->df) / log(10); 350 size = idList->head_docID.size(); 351 i = 0; 352 for (; i < size; i++) { 353 tempLength[idList->head_docID[i].text_number] += (idf*(1 + log(idList->head_docID[i].text_fre) / log(10)))*(idf*(1 + log(idList->head_docID[i].text_fre) / log(10))); 354 } 355 idList = idList->next; 356 } 357 for (i = 0; i < N; i++) 358 tempLength[i] = sqrt(tempLength[i]); 359 /*for (int i = 0; i < 
LMA->SizeOfDocSet();i++) 360 cout << "文档" << i <<" 长度为 "<< tempLength[i] << endl;*/ 361 } 362 363 //输入查询词项,输出查询结果,返回排名前k的文档编号,select 为选择的查询模式,1为查歌名,2为查歌手,3为查歌词主体 364 bool ReQuery::Query(string query, int k, int select) { 365 Index_List QUERY[10]; 366 char aa[50] = { ‘\0‘ }; 367 int ia = 0, i = 0, iQ = 0, j = 0; 368 369 for (; i<query.length(); i++) { 370 while (query[i] != ‘ ‘&& i < query.length()) { 371 if (((int)query[i] >= 65) && ((int)query[i] <= 91)) 372 aa[ia++] = query[i++] + 32; 373 else 374 aa[ia++] = query[i++]; 375 } 376 377 aa[ia] = ‘\0‘; 378 for (; j < iQ; j++) { 379 if (QUERY[j].word == aa) { 380 QUERY[j].df++; 381 j = -1; 382 break; 383 } 384 } 385 if (j != -1) { 386 QUERY[iQ++].word = aa; 387 QUERY[iQ-1].df = 1; 388 } 389 j = 0; 390 ia = 0; 391 //cout << f[ic1 - 1] << endl; 392 } 393 394 //查歌名 395 if (select == 1) { 396 initialLength(LMA->tiIndex_head(), Length); 397 //计算查询de得分 398 ScoreofaDoc(LMA->tiIndex_head(), QUERY, iQ); 399 } 400 //查作者 401 if (select == 2) { 402 initialLength(LMA->arIndex_head(), Length); 403 //计算查询de得分 404 ScoreofaDoc(LMA->arIndex_head(), QUERY, iQ); 405 } 406 //查歌词主体 407 if (select == 3) { 408 initialLength(LMA->DocIndex_head(), Length); 409 //计算查询de得分 410 ScoreofaDoc(LMA->DocIndex_head(), QUERY, iQ); 411 } 412 /*for (int i = 0; i < N; i++) { 413 cout << Scores[i] << endl; 414 }*/ 415 //对得分数组建堆,并且返回前K个 416 HeapSort(Scores, N, k); 417 return true; 418 }; 419 420 //词项对于suoyou文档的得分 421 float ReQuery::ScoreofaDoc(Index_List *idList, Index_List Tword[],int iQ) { 422 int size = 0; 423 int df; 424 float idf; 425 Word_Doc TEMPS; 426 Index_List nowTemp; 427 for (int i = 0; i < iQ;i++) { 428 429 if ((df = isInner(Tword[i].word,idList,nowTemp)) != 0) { 430 idf = log(N/df) / log(10); 431 size =nowTemp.head_docID.size(); 432 for (int j = 0; j < size; j++) { 433 TEMPS = (nowTemp.head_docID)[j]; 434 Scores[TEMPS.text_number] += ((idf)*Tword[i].df)*(idf*(1 + log(nowTemp.head_docID[j].text_fre) / log(10))); 435 } 436 } 437 } 438 cout <<endl 
<<"各文档依次得分:" << endl; 439 for (int i = 0; i < N; i++) { 440 if(Scores[i]!=0) 441 Scores[i] = Scores[i]/ Length[i]; 442 cout << Scores[i] <<" "; 443 } 444 return 0; 445 } 446 447 //查询某一个词是否在其中,有则返回其df,无则返回0 448 int ReQuery::isInner(string elem, Index_List idList[], Index_List &nowTemp) { 449 int i = 0; 450 //cout << "查找单词" << elem << endl; 451 while (idList != nullptr) { 452 if (idList[i].word == elem) { 453 //cout << "单词" << elem << "在其中,文档编号"<<idList[i].head_docID[0].text_number<<endl; 454 nowTemp = idList[i]; 455 return idList->df; 456 } 457 if (idList->word > elem) 458 return 0; 459 idList = idList->next; 460 }; 461 return 0; 462 } 463 464 //进行堆排序,将所有的得分进行排序,找出前k个,n为数组大小 465 bool ReQuery::HeapSort(float Scores[],int n,int k) { 466 float doc = -1; 467 int i = 0; 468 ScoreandDoc *TScores = new ScoreandDoc[n]; 469 for (; i < n; i++) { 470 TScores[i].score = Scores[i]; 471 TScores[i].text_number = i; 472 } 473 474 Max_Heap H(n, n, TScores); 475 i = 0; 476 cout << endl << endl << "---------------------向您推荐如下文档--------------------------------" << endl; 477 for (; i < k; i++) { 478 ScoreandDoc temp= H.removemax(doc); 479 cout << "文档编号:" << temp.text_number << " 得分:" << temp.score << endl; 480 } 481 482 cout << endl; 483 delete[]TScores; 484 return true; 485 486 }; 487 488 //////////////////////////////lei Max_Heap/////////////////////////// 489 //堆建立 490 491 //建立整堆 492 void Max_Heap::siftdown(int pos) { 493 while (!isLeaf(pos)) { 494 int j = leftchild(pos); 495 int rc = rightchild(pos); 496 if ((rc < n) && (Heap[j].score < Heap[rc].score)) { 497 j = rc; 498 } 499 if (!(Heap[pos].score < Heap[j].score)) 500 return; 501 ScoreandDoc xxx = Heap[pos]; 502 Heap[pos] = Heap[j]; 503 Heap[j] = xxx; 504 pos = j; 505 } 506 }; 507 508 509 //每次找出移除最大的 510 ScoreandDoc Max_Heap::removemax(float it) { 511 //if (n == 0) 512 // return ; 513 ScoreandDoc xxx = Heap[--n]; 514 Heap[n] = Heap[0]; 515 Heap[0] = xxx; 516 if (n != 0) 517 siftdown(0); 518 //it = Heap[n].score; 519 return 
Heap[n]; 520 }
以上是关于歌词VSM实现!!!的主要内容,如果未能解决你的问题,请参考以下文章