词频统计（未完成，错误）

Posted 2021-01-28 astonc
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了词频统计（未完成，错误）相关的知识，希望对你有一定的参考价值。
#include<iostream>
#include<cstdio>
#include<cmath>
#include<cstdlib>
#include<cstring>
#include<algorithm>
#include<vector>
using namespace std;
// Maximum number of characters kept per word; GetAWord truncates longer words to this length.
#define KEYLENGTH 15
// Capacity basis (in characters) of the temporary word buffer used by GetAWord.
#define MAXWORDLEN 80
// Upper bound on the candidates examined by NextPrime.
#define MAXTABLESIZE 100000
// Fixed-size buffer for one key: KEYLENGTH characters plus the string terminator.
typedef char ET[KEYLENGTH+1];
// Bucket position inside the table, as produced by Hash.
typedef int Index;
// Pointer to a chain node; declared before struct LNode so the node can link to itself.
typedef struct LNode* PtrToLNode;
// One node of a bucket chain. The same struct is used both for the per-bucket
// head nodes (the Heads array in TblNode) and for the word nodes linked behind them.
struct LNode{
    // The stored word (copied in by InsertAndCount); unused text for head nodes.
    ET Data;
    // Next node in the same bucket, or NULL at the end of the chain.
    PtrToLNode Next;
    // For a word node: how many times the word has been seen.
    // For a head node: incremented by InsertAndCount each time a new word is added to the bucket.
    int Count;
};
// Pointer to a single node, used when locating or walking entries.
typedef PtrToLNode Position;
// Pointer to the first element of a node sequence (used for the array of bucket heads).
typedef PtrToLNode List;
// Handle to a hash table; the table itself is allocated by CreateTable.
typedef struct TblNode *HashTable;
// Hash table header: bucket count plus the array of bucket head nodes.
struct TblNode{
    // Number of buckets; set by CreateTable to the result of NextPrime.
    int TableSize;
    // Array of TableSize head nodes; each head's Next points at the first word node of its bucket.
    List Heads;
};
// flag: set to 1 by main before reading and cleared by GetAWord when the '#' terminator is read.
// cnt:  number of distinct words, stored by Show and read back by main to decide how many lines to print.
int flag,cnt;
// Words selected by Show; main sorts this with cmp and prints its leading entries.
vector<LNode> v;
// Ordering predicate for sorting words: higher Count comes first; words with
// equal Count are ordered by ascending strcmp order of their text.
bool cmp(LNode a,LNode b){
    if(a.Count != b.Count)
        return a.Count > b.Count;
    return strcmp(a.Data, b.Data) < 0;
}
// Returns the smallest odd prime strictly greater than N (never less than 3).
//
// The search only examines candidates up to MAXTABLESIZE; if no prime is found
// in that range, the first odd candidate above MAXTABLESIZE is returned
// without being tested for primality.
//
// Primality is tested by integer trial division, so small candidates such as 3
// are recognised correctly and no floating-point square root is involved.
int NextPrime( int N ){
    // First odd number strictly greater than N.
    int p = (N%2) ? N+2 : N+1;
    // Guard against zero/negative requests: the smallest odd prime is 3.
    if(p < 3)
        p = 3;

    while(p <= MAXTABLESIZE){
        bool isPrime = true;
        // p is odd, so only odd divisors up to sqrt(p) need checking.
        // "i <= p / i" is the overflow-free form of "i * i <= p".
        for(int i = 3; i <= p / i; i += 2){
            if(p % i == 0){
                isPrime = false;
                break;
            }
        }
        if(isPrime)
            break;
        p += 2;
    }
    return p;
}
// Allocates an empty hash table.
//
// TableSize is a requested minimum; the actual bucket count is
// NextPrime(TableSize). Every bucket head starts with an empty key, no chain
// and a zero Count, so that InsertAndCount can safely increment the head's
// Count and Show can safely read it.
//
// Returns the new table. On allocation failure an error is printed to stderr
// and the program exits, so the result is never NULL.
HashTable CreateTable( int TableSize){
    HashTable H = (HashTable)malloc(sizeof(struct TblNode));
    if(H == NULL){
        fprintf(stderr, "CreateTable: out of memory\n");
        exit(EXIT_FAILURE);
    }
    H->TableSize = NextPrime(TableSize);

    H->Heads = (List)malloc(H->TableSize * sizeof(struct LNode));
    if(H->Heads == NULL){
        fprintf(stderr, "CreateTable: out of memory\n");
        free(H);
        exit(EXIT_FAILURE);
    }

    for(int i = 0; i < H->TableSize; i++){
        H->Heads[i].Data[0] = '\0';
        H->Heads[i].Next = NULL;
        // malloc does not zero memory; the per-bucket counter must start at 0.
        H->Heads[i].Count = 0;
    }
    return H;
}
// Maps a NUL-terminated key to a bucket index.
//
// The hash is built by shifting the accumulated value left by 5 bits and
// adding each character in turn (unsigned arithmetic, so overflow wraps),
// then reducing modulo TableSize. An empty key hashes to 0.
int Hash(const char* Key, int TableSize){
    unsigned int H = 0;
    while(*Key != '\0')
        H = (H<<5) + *Key++;
    return H % TableSize;
}
// Looks Key up in table H.
// Returns the chain node whose Data equals Key, or NULL when the key's bucket
// contains no such node.
Position Find(HashTable H, ET Key){
    // Start at the first real node of the bucket; the head node itself holds no word.
    Position node = H->Heads[Hash(Key, H->TableSize)].Next;

    for(; node != NULL; node = node->Next){
        if(strcmp(node->Data, Key) == 0)
            break;
    }
    return node;
}
// Records one occurrence of Key in table H.
//
// An empty key is ignored. If Key is already present its Count is incremented;
// otherwise a new node with Count 1 is pushed at the front of the key's bucket
// and the bucket head's Count (number of distinct words in that bucket) is
// incremented.
//
// On allocation failure an error is printed to stderr and the program exits.
void InsertAndCount(HashTable H, ET Key){
    if(Key[0] == '\0') return;

    Position P = Find(H, Key);
    if(P){
        P->Count++;
        return;
    }

    Position NewCell = (Position)malloc(sizeof(LNode));
    if(NewCell == NULL){
        fprintf(stderr, "InsertAndCount: out of memory\n");
        exit(EXIT_FAILURE);
    }
    // Bounded copy: Data holds at most KEYLENGTH characters plus the terminator.
    strncpy(NewCell->Data, Key, KEYLENGTH);
    NewCell->Data[KEYLENGTH] = '\0';
    NewCell->Count = 1;

    Index Pos = Hash(Key, H->TableSize);
    NewCell->Next = H->Heads[Pos].Next;
    H->Heads[Pos].Next = NewCell;
    H->Heads[Pos].Count++;
}
// Returns true when c may appear inside a word: a lower-case ASCII letter,
// a decimal digit, or an underscore. Upper-case letters are NOT accepted;
// callers are expected to lower-case the character first.
bool IsWordChar(char c){
    return (c >= 'a' && c <= 'z')
        || (c >= '0' && c <= '9')
        || c == '_';
}
// Reads the next word from standard input into `word`.
//
// Leading non-word characters are skipped. Letters are lower-cased before they
// are classified, so mixed-case input such as "heLLo" is read as one word.
// Only the first KEYLENGTH characters of a word are stored; the rest of the
// word is consumed and discarded. The character that ends the word is
// consumed as well.
//
// `word` is always left as a valid NUL-terminated string; it is empty when no
// word was found before the terminator.
//
// Side effect: sets the global `flag` to 0 when the '#' terminator is read or
// when input ends, telling the caller to stop reading.
void GetAWord(ET word){
    int len = 0;          // characters stored so far (never exceeds KEYLENGTH)
    bool inWord = false;  // true once the first word character has been seen
    char c;

    word[0] = '\0';
    for(;;){
        // Treat end-of-input like the '#' terminator so the loop cannot spin on EOF.
        if(scanf("%c", &c) != 1 || c == '#'){
            flag = 0;
            break;
        }
        if(c >= 'A' && c <= 'Z')
            c += 'a' - 'A';

        if(IsWordChar(c)){
            inWord = true;
            if(len < KEYLENGTH)
                word[len++] = c;
        }
        else if(inWord){
            // First non-word character after the word: the word is complete.
            break;
        }
    }
    word[len] = '\0';
}
// Prints the number of distinct words in H and collects the most frequent
// ones into the global vector `v`.
//
// Steps:
//   1. Walk every chain to count the distinct words and find the highest
//      word frequency (maxf).
//   2. Print the distinct-word count followed by a newline and store it in
//      the global `cnt`.
//   3. Build a histogram: diffwords[f] = number of words occurring f times.
//   4. Starting from maxf, lower the frequency threshold until at least
//      (int)(distinct * percent) words lie above it.
//   5. Append to `v` every word whose frequency is at or above the threshold
//      level reached, highest frequency first (within one frequency: in table
//      order). This may be slightly more than the requested share; the caller
//      decides how many entries to print.
//
// percent is a fraction (e.g. 0.10 for the top 10%).
void Show(HashTable H, double percent){
    int diffwordcount = 0;
    int maxf = 0;
    Position L;
    int i, j, k, lowerbound, count = 0;

    // Count nodes directly instead of summing the head counters, so the total
    // does not depend on how the head nodes were initialised.
    for(i = 0; i < H->TableSize; i++){
        for(L = H->Heads[i].Next; L; L = L->Next){
            diffwordcount++;
            if(maxf < L->Count) maxf = L->Count;
        }
    }
    printf("%d\n", diffwordcount);
    cnt = diffwordcount;

    // Histogram of word frequencies, indices 0..maxf, zero-initialised.
    std::vector<int> diffwords(maxf + 1, 0);
    for(i = 0; i < H->TableSize; i++){
        for(L = H->Heads[i].Next; L; L = L->Next)
            diffwords[L->Count]++;
    }

    lowerbound = (int)(diffwordcount * percent);
    for(i = maxf; i >= 1 && count < lowerbound; i--)
        count += diffwords[i];

    for(j = maxf; j >= i; j--){
        for(k = 0; k < H->TableSize; k++){
            for(L = H->Heads[k].Next; L; L = L->Next){
                if(j == L->Count){
                    //printf("%d:%-15s\n",L->Count ,L->Data );
                    struct LNode temp;
                    temp.Count = L->Count;
                    strcpy(temp.Data, L->Data);
                    temp.Next = NULL;   // the copy is detached from the table's chains
                    v.push_back(temp);
                }
            }
        }
    }
}
// Releases everything owned by table H: every chained word node, then the
// array of bucket heads, then the table header itself.
void DestoryTable(HashTable H){
    for(int bucket = 0; bucket < H->TableSize; ++bucket){
        Position node = H->Heads[bucket].Next;
        while(node != NULL){
            // Remember the successor before the current node is freed.
            Position following = node->Next;
            free(node);
            node = following;
        }
    }
    free(H->Heads);
    free(H);
}
// Reads words from standard input until the '#' terminator, counts them in a
// hash table, prints the number of distinct words, and then prints the most
// frequent tenth of them as "count:word" lines, ordered by cmp.
int main(){
    HashTable H;
    ET word;
    int TableSize = 100;

    H = CreateTable(TableSize);
    flag = 1;
    do{
        // Start every read from an empty word so that a call which finds no
        // word cannot cause the previous word to be counted a second time.
        word[0] = '\0';
        GetAWord(word);
        InsertAndCount(H, word);
    }while(flag);

    Show(H, 10.0/100);
    std::sort(v.begin(), v.end(), cmp);

    // Never index past the words Show actually collected.
    int limit = cnt / 10;
    if(limit > static_cast<int>(v.size()))
        limit = static_cast<int>(v.size());

    // NOTE(review): "%15s" right-aligns the word in a 15-character field;
    // confirm this padding is wanted by the expected output format.
    for(int i = 0; i < limit; i++)
        printf("%d:%15s\n", v[i].Count, v[i].Data);

    DestoryTable(H);
    return 0;
}
View Code
以上是关于词频统计（未完成，错误）的主要内容，如果未能解决你的问题，请参考以下文章