AC多模匹配+完整实现源码

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了AC多模匹配+完整实现源码相关的知识,希望对你有一定的参考价值。

前段时间同事讲了一下AC多模匹配的原理,就试着写了下代码,现在再看代码的时候有些地方连自己都快看不懂了,所以想做一个笔记

查找10M的日志文件,即使关键字位于最后一行,用时也不到0.2毫秒
能正确查找出ssss串中的s,ss,sss,ssss个数和位置,notepad无法实现

1、构建树

构建模式串(需要搜索的串)为{"hee","he", "h","she", "his" ,"her","hers"}的树。注:下划线序号表示层高

技术分享图片

 

 

构建过程也是查找的过程,举例讲解构建:hee过程,其他类似

             构建hee过程:

             1、构建h节点:输入串为h,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,不存在,插入

             2、构建第一个e节点:输入串为he,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,存在,以h节点作为SNode,查找是否存在值为e,层高为strlen(he)的节点,不存在,插入

             3、构建第二个e节点:输入串为hee,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,存在,以h节点作为SNode,查找是否存在值为e,层高为strlen(he)的节点,存在,以e节点作为SNode节点,查找是否存在值为e,层高为strlen(hee)的节点,不存在,插入。构建字符e为构建串的最后一个字符时,该节点为输出节点

构建一个串中某个字符的节点时,会将该字符的前面所有字符+该字符作为一个串输入,搜索树并插入。因为每个字符必须要知道前面的路径

 

2、构建失败节点

  1. 根节点的失败节点为根节点,第一层节点的失败节点为根节点
  2. 某个节点的失败节点为:以该节点的父节点的失败节点为起点,在其子节点中查找值与该节点相同的节点;如果能找到,找到的节点即为失败节点;如果找不到,则取当前起点的失败节点作为新的起点继续查找,直到在根节点下仍找不到时,失败节点为根节点

       已知: f(root) = root;f(h_1)=root;f(s_1)=root

          解: f(e_2) = g(f(h_1),e) = g(root,e) = root

                  f(e_3) = g(f(e_2),e) = g(root,e) = root

                  f(r_3) = g(f(e_2),r) = g(root,r) = root

                  f(s_4) = g(f(r_3),s) = g(root,s) = s_1

                  f(i_2) = g(f(h_1),i) = g(root,i) = root

                  f(s_3) = g(f(i_2),s) = g(root,s) = s_1

                  f(h_2) = g(f(s_1),h) = g(root,h) = h_1

                  f(e_3) = g(f(h_2),e)=g(h_1,e)=e_2 (此处的e_3指she分支中的e,与前面hee分支中的e_3不是同一个节点)

 

3、查找"ushers"中的模式串

            1、遍历"ushers",从根节点作为起始节点SNode,查找字符为u的节点,查找失败,以SNode的失败节点为起始节点查找下一个字符s

            2、以SNode作为起始节点,查找字符为s的节点,查找成功,判断节点是否为匹配节点,输出匹配值,以s节点作为起始节点查找h节点,直到遍历完"ushers"

 

大家可以手动试一下构建模式串为{"s","ss","sss","ssss"}的树,构建每个节点的失败节点,并在内容"ssss"中查找模式串

 

文字只讲了大致流程,还有很多细节没有描述出来,语言水平有限啊,细节大家看代码吧...

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

AC.h

#ifndef AC_H
#define AC_H
#define MAX_CHILD_LEN 128
#define MAX_OUTPUT    128
#include <string.h>
#include <stdio.h>
/**
 * One node (state) of the Aho-Corasick pattern tree.
 *
 * Ownership: the strings stored in outPut belong to the node and are released
 * with delete[] in the destructor, so they must come from new char[].
 * Child nodes are NOT released by the destructor; the code that built the
 * tree is responsible for deleting them.
 */
class Node
{
public:
    /** Creates a node for character ele with no parent, children, failure link or outputs. */
    Node(char ele){
        element = ele;
        parent = NULL;
        failNode = NULL;
        isMatchNode=false;
        // sizeof() covers the whole pointer array; passing the element count
        // would clear only the first MAX_CHILD_LEN bytes of it.
        memset(nodeList,0,sizeof(nodeList));
        childNum=0;
        high = 0;
        memset(outPut,0,sizeof(outPut));
        outPutNum = 0;
    }
    /** Logs the release and frees the output strings owned by this node. */
    ~Node(){
        printf("node release,high:%d value:%c\n",high,element);
        for(int i=0;i<outPutNum;i++){
            printf("release output str:%s\n",outPut[i]);
            // The strings are arrays, so the array form of delete is required.
            delete [] outPut[i];
            outPut[i] = NULL;
        }
        outPutNum = 0;
    }
    char element;                    // character on the edge leading from the parent to this node
    Node * parent;                   // parent node, NULL until the node is linked into a tree
    Node * nodeList[MAX_CHILD_LEN];  // children; only the first childNum entries are in use
    Node * failNode;                 // failure link, NULL until assigned
    bool isMatchNode;                // true when a pattern ends at this node
    int childNum;                    // number of used entries in nodeList
    int high;                        // depth of the node in the tree (0 for a freshly built node)
    char * outPut[MAX_OUTPUT];       // pattern strings reported when this node is reached
    int outPutNum;                   // number of used entries in outPut
};
// Callback type used by AC::match: matchedStr is the pattern string that was
// found, post is the index in the searched text that the matcher passes along
// with it (AC::match passes the index of the character it was processing when
// the match was reported).
typedef void(*Func)(char * matchedStr,int post);
class AC
{
public:
    AC() {root = new Node(NULL);root->failNode = root;}
    ~AC(){
        delete root;
        root = NULL;
    }
    void initTree(char * patterns[],int patLen){
        for(int i=0;i<patLen;i++){
            char * element = patterns[i];
            int eleLen = strlen(element);
            for(int j=0;j<eleLen;j++){
                bool isMatch = false;
                if(j == eleLen-1)
                    isMatch = true;
                char * p = new char[j+2];
                memset(p,‘\\0‘,j+2);
                strncpy(p,element,j+1);
                insert(p,isMatch);
            }
        }
    }
    void buildFailNode(){
        traceAllNodes(root);
    }
    void match(char * srcTxt,int txtLen,Func f){
        Node * startNode = root;
        for(int i=0;i<txtLen;i++){
            char e = srcTxt[i];
            bool isOk = false;
            for(int j=0;j<startNode->childNum;j++){
                Node * node = startNode->nodeList[j];
                if(node->element == e){
                    isOk = true;
                    startNode = node;
                    Node * failNode = node->failNode;
                    while(failNode!=root){
                        if(failNode->isMatchNode){
                            for(int k=0;k<failNode->outPutNum;k++)
                                f(failNode->outPut[k],i);
                        }
                        failNode = failNode->failNode;
                    }
                    if(node->isMatchNode){
                        for(int k=0;k<node->outPutNum;k++)
                            f(node->outPut[k],i);
                    }
                    break;
                }
            }
            if(!isOk){
                startNode = startNode->failNode;
                if(startNode!=root)
                    i--;
            }
        }
    }
    void deleteTree(){
        printf("delete tree--------------------------------\\n");
        traceDelNodes(root);
    }
    void printACTree(){
        printf("tree structure-----------------------------\\n");
        printf("high  value  match   failNode  childNum  children  outPutStr   \\n");
        tracePrintNodes(root);
    }
private:
    void insert(char * ele,bool isMatch){
        int eleLen = strlen(ele);
        //搜索ele最后一个元素节点是否存在,不存在则返回父节点
        int startH = 1;
        Node * pnode = NULL;
        if(!search(root,startH,ele,pnode)){
            Node * cnode = new Node(ele[eleLen-1]);
            cnode->high = eleLen;
            cnode->parent = pnode;
            cnode->isMatchNode = isMatch;
            pnode->nodeList[pnode->childNum]=cnode;
            pnode->childNum++;
            if(isMatch)
                cnode->outPut[cnode->outPutNum++]=ele;
        }
    }
    bool search(Node * pnode,int & index,char * ele,Node * &retNode){
        for(int i=0;i<pnode->childNum;i++){
            Node * node = pnode->nodeList[i];
            if(node->element == ele[index-1]){
                if(index == strlen(ele))
                    return true;
                index++;
                return search(node,index,ele,retNode);
            }
        }
        retNode = pnode;
        return false;
    }
    void initFailNode(Node * node){
        if(node->high == 1){
            node->failNode = root;
        }else{
            //以父节点的失败函数作为起点,node的element作为触发边得到node的失败函数
            Node * failNode = NULL;
            searchFailNode(node->parent,node,failNode);
            node->failNode = failNode;
        }
    }
    void traceAllNodes(Node * node){
        for(int i=0;i<node->childNum;i++){
            Node * cnode = node->nodeList[i];
            initFailNode(cnode);
            traceAllNodes(cnode);
        }
    }
    void searchFailNode(Node * pnode,Node * cnode,Node * & retNode){
        Node * failNode = pnode->failNode;
        for(int i=0;i<failNode->childNum;i++){
            Node * node = failNode->nodeList[i];
            if(node->element == cnode->element){
                retNode = node;
                return;
            }
        }
        //循环已经走完,说明没找到节点,如果已经搜索了根节点没找到,则返回根节点作为失败节点,否则继续搜索
        if(pnode->failNode == root)
            retNode = root;
        else
            searchFailNode(pnode->parent,pnode,retNode);
    }
    void tracePrintNodes(Node *node){
        for(int i=0;i<node->childNum;i++){
            Node * cnode = node->nodeList[i];
            printf("  %d     %c     %d       %d           %d        ",cnode->high,cnode->element,cnode->isMatchNode,cnode->failNode->high,cnode->childNum);
            for(int j=0;j<cnode->childNum;j++){
                Node * lnode = cnode->nodeList[j];
                printf("%c ",lnode->element);
            }
            printf("         ");
            for(int j=0;j<cnode->outPutNum;j++)
                printf("%s ",cnode->outPut[j]);
            printf("\\n");
            tracePrintNodes(cnode);
        }
    }
    void traceDelNodes(Node * node){
        for(int i=0;i<node->childNum;i++){
            Node * cnode = node->nodeList[i];
            traceDelNodes(cnode);
        }
        if(node!=root){
            delete node;
            node = NULL;
        }
    }
private:
    Node * root;
};
#endif // AC_H

-----------------------------------------------------------------------------------
main.cpp
#include "AC.h"
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <map>
/* Size in bytes (20 MiB) of the buffer that holds the file content.
 * Parenthesized so the macro expands as one value inside larger expressions. */
#define MAX_FILE_CONTENT  (1024*1024*20)
/**
 * Writes the current local time into timeOutPut as "YYYY-MM-DD HH:MM:SS:"
 * followed by the millisecond part (not zero-padded).
 *
 * timeOutPut  destination buffer
 * timeLen     size of the buffer in bytes
 *
 * The result is always NUL-terminated when timeLen > 0. If the clock cannot
 * be read or the buffer is too small for the date part, an empty string is
 * stored; a buffer too small for the milliseconds gets a truncated value.
 */
void currentTime(char * timeOutPut,int timeLen){
    if(timeOutPut == NULL || timeLen <= 0)
        return;
    timeOutPut[0] = '\0';
    // One clock read supplies both the seconds and the sub-second part, so
    // the two can never belong to different seconds.
    struct timeval tval;
    if(gettimeofday(&tval,NULL) != 0)
        return;
    time_t seconds = tval.tv_sec;
    struct tm * timeinfo = localtime(&seconds);
    if(timeinfo == NULL)
        return;
    size_t used = strftime(timeOutPut,(size_t)timeLen,"%Y-%m-%d %H:%M:%S:",timeinfo);
    if(used == 0){
        // strftime leaves the buffer unspecified when the text does not fit.
        timeOutPut[0] = '\0';
        return;
    }
    // tv_usec is not an int; convert explicitly so the format matches.
    snprintf(timeOutPut+used,(size_t)timeLen-used,"%ld",(long)(tval.tv_usec/1000));
}
// Match callback passed to AC::match: it is invoked once per reported match,
// so any counting or statistics are done out here by the caller.
// Prints the current time, the matched pattern and the position value supplied
// by the matcher (AC::match passes the index of the text character it was
// processing when the match was reported, despite the parameter's name).
void findCallBack(char * matchedStr,int startPos){
    char curTime[32]={0};
    currentTime(curTime,sizeof(curTime));
    printf("end time:%s  matchedStr:%s  matchedPos:%d\n",curTime,matchedStr,startPos);
}

/**
 * Reads the file fileName into fileContent as NUL-terminated text.
 *
 * fileContent must provide at least MAX_FILE_CONTENT bytes. At most
 * MAX_FILE_CONTENT-1 bytes are stored so that the terminator always fits;
 * anything beyond that is silently left unread.
 *
 * Returns true on success, false when the file cannot be opened or a read
 * fails (the errno value is printed in both cases).
 */
bool readFile(char * fileName,char * fileContent){
    if(fileName == NULL || fileContent == NULL)
        return false;
    int fd = open(fileName,O_RDONLY);
    if(-1 == fd){
        printf("open file error:%d\n",errno);
        return false;
    }
    const size_t capacity = (size_t)(MAX_FILE_CONTENT) - 1;
    size_t total = 0;
    bool ok = true;
    // read() may return fewer bytes than requested, so keep reading until
    // end of file or until the buffer is full.
    while(total < capacity){
        ssize_t got = read(fd,fileContent+total,capacity-total);
        if(got > 0){
            total += (size_t)got;
            continue;
        }
        if(got == 0)
            break;
        if(errno == EINTR)
            continue;
        printf("read file error:%d\n",errno);
        ok = false;
        break;
    }
    fileContent[total] = '\0';
    close(fd);
    return ok;
}
/**
 * Runs one complete search: builds the tree from the patLen patterns,
 * computes the failure links, prints the tree, prints the start time and then
 * searches the first txtLen characters of txt, reporting every match through
 * findCallBack. The tree is released before returning.
 */
void ac_func(char ** pattern,int patLen,char *txt,int txtLen){
    AC ac;
    ac.initTree(pattern,patLen);
    ac.buildFailNode();
    ac.printACTree();
    char curTime[32]={0};
    currentTime(curTime,sizeof(curTime));
    printf("searching tree-----------------------------\n");
    printf("start time of search:%s\n",curTime);
    ac.match(txt,txtLen,findCallBack);
    ac.deleteTree();
}
// Demo 1: searches the text "ushers" for a small set of overlapping patterns.
void test1(){
    char *txt = "ushers";
    char *pattern[]={"hee","he", "h","she", "his" ,"her","hers"};
    // Element count of the pattern table.
    const int patternCount = (int)(sizeof(pattern)/sizeof(pattern[0]));
    ac_func(pattern,patternCount,txt,(int)strlen(txt));
}
// Demo 2: searches a log file for four keywords.
void test2(){
    // The patterns are listed in the reverse of the order in which they occur
    // in the file; one of them is on the last line of the log.
    char * pattern[] = {"39_347990193541512029","39_347739859976612007","13_1002D375","2017-09-29 17:43:37:517"};
    // The content is too large for the stack, so it lives on the heap. One
    // extra zeroed byte guarantees a terminator for strlen() even when the
    // file fills all MAX_FILE_CONTENT bytes.
    char * txt = new char[MAX_FILE_CONTENT + 1];
    memset(txt,0,MAX_FILE_CONTENT + 1);
    // test.log is a log file of about 10M.
    if(readFile("../testfile/test.log",txt)){
        int patternLen = sizeof(pattern)/sizeof(char*);
        int txtLen = (int)strlen(txt);
        ac_func(pattern,patternLen,txt,txtLen);
    }
    // Released on every path, with the array form matching new[].
    delete [] txt;
    txt = NULL;
}
// Demo 3: the patterns contain one another and the text consists of the same
// repeated character; according to the original author, Notepad's search
// mishandles this kind of query.
void test3(){
    char *txt = "ssss";
    char *pattern[]={"s", "ss","sss","ssss"};
    // Element count of the pattern table.
    const int patternCount = (int)(sizeof(pattern)/sizeof(pattern[0]));
    ac_func(pattern,patternCount,txt,(int)strlen(txt));
}
// Entry point: runs the three demos in order and exits with status 0.
int main()
{
    void (*const demos[])() = {test1, test2, test3};
    for(unsigned idx = 0; idx < sizeof(demos)/sizeof(demos[0]); ++idx)
        demos[idx]();
    return 0;
}
----------------------------------------------------------------------------------------------------


运行结果部分截图

技术分享图片

技术分享图片

 

 




 

 

          


 









以上是关于AC多模匹配+完整实现源码的主要内容,如果未能解决你的问题,请参考以下文章

AC自动机详解

Aho-Corasick automaton

多模匹配算法之Aho-Corasick

AC自动机算法详解以及Java代码实现

AC自动机

AC自动机初步