前段时间同事讲了一下AC多模匹配的原理,就试着写了下代码,现在再看代码的时候有些地方连自己都快看不懂了,所以想做一个笔记
查找10M的日志文件,即使关键字位于最后一行,用时也不到0.2毫秒
能正确查找出ssss串中的s,ss,sss,ssss个数和位置,notepad无法实现
1、构建树
构建模式串(需要搜索的串)为{"hee","he", "h","she", "his" ,"her","hers"}的树。注:下划线序号表示层高
构建过程也是查找的过程,举例讲解构建:hee过程,其他类似
构建hee过程:
1、构建h节点:输入串为h,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,不存在,插入
2、构建第一个e节点:输入串为he,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,存在,以h节点作为SNode,查找是否存在值为e,层高为strlen(he)的节点,不存在,插入
3、构建第二个e节点:输入串为hee,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,存在,以h节点作为SNode,查找是否存在值为e,层高为strlen(he)的节点,存在,以e节点作为SNode节点,查找是否存在值为e,层高为strlen(hee)的节点,不存在,插入。构建字符e为构建串的最后一个字符时,该节点为输出节点
构建一个串中某个字符的节点时,会将该字符的前面所有字符+该字符作为一个串输入,搜索树并插入。因为每个字符必须要知道前面的路径
2、构建失败节点
- 根节点的失败节点为根节点,第一层节点的失败节点为根节点
- 某个节点的失败节点为:以该节点的父节点的失败节点为起点,查找值与该节点相同的子节点;如果能找到,找到的节点即为失败节点;如果找不到,则以起点的失败节点作为新的起点继续查找,直到在根节点下也找不到时,失败节点为根节点
已知: f(root) = root;f(h_1)=root;f(s_1)=root
解: f(e_2) = g(f(h_1),e) = g(root,e) = root
f(e_3) = g(f(e_2),e) = g(root,e) = root (此处e_3为hee中的第二个e)
f(r_3) = g(f(e_2),r) = g(root,r) = root
f(s_4) = g(f(r_3),s) = g(root,s) = s_1
f(i_2) = g(f(h_1),i) = g(root,i) = root
f(s_3) = g(f(i_2),s) = g(root,s) = s_1
f(h_2) = g(f(s_1),h) = g(root,h) = h_1
f(e_3) = g(f(h_2),e)=g(h_1,e)=e_2 (此处e_3为she中的e)
3、查找"ushers"中的模式串
1、遍历"ushers",从根节点作为起始节点SNode,查找字符为u的节点,查找失败,以SNode的失败节点为起始节点查找下一个字符s
2、以SNode作为起始节点,查找字符为s的节点,查找成功,判断节点是否为匹配节点,输出匹配值,以s节点作为起始节点查找h节点,直到遍历完"ushers"
大家可以手动试一下构建模式串为{"s","ss","sss","ssss"}的树,构建每个节点的失败节点,并在内容"ssss"中查找模式串
文字只讲了大致流程,还有很多细节没有描述出来,语言水平有限啊,细节大家看代码吧...
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
AC.h
#ifndef AC_H
#define AC_H
#define MAX_CHILD_LEN 128
#define MAX_OUTPUT 128
#include <string.h>
#include <stdio.h>
// One node of the pattern trie used by the AC matcher.
//
// A node stores the character on the edge leading to it, a link to its
// parent, its children, its failure link and the pattern strings reported
// when the node is reached.
//
// Ownership: the destructor releases the strings stored in outPut with
// delete[], so they must have been allocated with new char[]. It does NOT
// delete child nodes; whoever owns the tree must delete every node itself.
class Node
{
public:
    // Creates a childless, non-matching node for character `ele`.
    Node(char ele)
        : element(ele),
          parent(NULL),
          failNode(NULL),
          isMatchNode(false),
          childNum(0),
          high(0),
          outPutNum(0)
    {
        // memset takes a size in bytes, not an element count: sizeof() clears
        // every pointer slot instead of only the first MAX_CHILD_LEN bytes.
        memset(nodeList, 0, sizeof(nodeList));
        memset(outPut, 0, sizeof(outPut));
    }

    // Logs the release and frees the output strings owned by this node.
    ~Node(){
        printf("node release,high:%d value:%c\n",high,element);
        for(int i=0;i<outPutNum;i++){
            printf("release output str:%s\n",outPut[i]);
            // The strings come from new char[], so delete[] is required.
            delete[] outPut[i];
            outPut[i] = NULL;
        }
        outPutNum = 0;
    }

    char element;                    // character on the edge from parent to this node
    Node * parent;                   // parent node, NULL until linked into a tree
    Node * nodeList[MAX_CHILD_LEN];  // child nodes; only the first childNum entries are used
    Node * failNode;                 // failure link, NULL until assigned
    bool isMatchNode;                // true when a pattern ends at this node
    int childNum;                    // number of valid entries in nodeList
    int high;                        // depth of the node in the trie (0 until assigned)
    char * outPut[MAX_OUTPUT];       // owned pattern strings reported at this node
    int outPutNum;                   // number of valid entries in outPut
};
// Callback type used by AC::match, called once per reported pattern:
// matchedStr is the pattern string stored on the matching node and post is
// the text index that match() passes along (the index of the character being
// processed when the pattern was recognised).
typedef void(*Func)(char * matchedStr,int post);
class AC
{
public:
AC() {root = new Node(NULL);root->failNode = root;}
~AC(){
delete root;
root = NULL;
}
void initTree(char * patterns[],int patLen){
for(int i=0;i<patLen;i++){
char * element = patterns[i];
int eleLen = strlen(element);
for(int j=0;j<eleLen;j++){
bool isMatch = false;
if(j == eleLen-1)
isMatch = true;
char * p = new char[j+2];
memset(p,‘\\0‘,j+2);
strncpy(p,element,j+1);
insert(p,isMatch);
}
}
}
void buildFailNode(){
traceAllNodes(root);
}
void match(char * srcTxt,int txtLen,Func f){
Node * startNode = root;
for(int i=0;i<txtLen;i++){
char e = srcTxt[i];
bool isOk = false;
for(int j=0;j<startNode->childNum;j++){
Node * node = startNode->nodeList[j];
if(node->element == e){
isOk = true;
startNode = node;
Node * failNode = node->failNode;
while(failNode!=root){
if(failNode->isMatchNode){
for(int k=0;k<failNode->outPutNum;k++)
f(failNode->outPut[k],i);
}
failNode = failNode->failNode;
}
if(node->isMatchNode){
for(int k=0;k<node->outPutNum;k++)
f(node->outPut[k],i);
}
break;
}
}
if(!isOk){
startNode = startNode->failNode;
if(startNode!=root)
i--;
}
}
}
void deleteTree(){
printf("delete tree--------------------------------\\n");
traceDelNodes(root);
}
void printACTree(){
printf("tree structure-----------------------------\\n");
printf("high value match failNode childNum children outPutStr \\n");
tracePrintNodes(root);
}
private:
void insert(char * ele,bool isMatch){
int eleLen = strlen(ele);
//搜索ele最后一个元素节点是否存在,不存在则返回父节点
int startH = 1;
Node * pnode = NULL;
if(!search(root,startH,ele,pnode)){
Node * cnode = new Node(ele[eleLen-1]);
cnode->high = eleLen;
cnode->parent = pnode;
cnode->isMatchNode = isMatch;
pnode->nodeList[pnode->childNum]=cnode;
pnode->childNum++;
if(isMatch)
cnode->outPut[cnode->outPutNum++]=ele;
}
}
bool search(Node * pnode,int & index,char * ele,Node * &retNode){
for(int i=0;i<pnode->childNum;i++){
Node * node = pnode->nodeList[i];
if(node->element == ele[index-1]){
if(index == strlen(ele))
return true;
index++;
return search(node,index,ele,retNode);
}
}
retNode = pnode;
return false;
}
void initFailNode(Node * node){
if(node->high == 1){
node->failNode = root;
}else{
//以父节点的失败函数作为起点,node的element作为触发边得到node的失败函数
Node * failNode = NULL;
searchFailNode(node->parent,node,failNode);
node->failNode = failNode;
}
}
void traceAllNodes(Node * node){
for(int i=0;i<node->childNum;i++){
Node * cnode = node->nodeList[i];
initFailNode(cnode);
traceAllNodes(cnode);
}
}
void searchFailNode(Node * pnode,Node * cnode,Node * & retNode){
Node * failNode = pnode->failNode;
for(int i=0;i<failNode->childNum;i++){
Node * node = failNode->nodeList[i];
if(node->element == cnode->element){
retNode = node;
return;
}
}
//循环已经走完,说明没找到节点,如果已经搜索了根节点没找到,则返回根节点作为失败节点,否则继续搜索
if(pnode->failNode == root)
retNode = root;
else
searchFailNode(pnode->parent,pnode,retNode);
}
void tracePrintNodes(Node *node){
for(int i=0;i<node->childNum;i++){
Node * cnode = node->nodeList[i];
printf(" %d %c %d %d %d ",cnode->high,cnode->element,cnode->isMatchNode,cnode->failNode->high,cnode->childNum);
for(int j=0;j<cnode->childNum;j++){
Node * lnode = cnode->nodeList[j];
printf("%c ",lnode->element);
}
printf(" ");
for(int j=0;j<cnode->outPutNum;j++)
printf("%s ",cnode->outPut[j]);
printf("\\n");
tracePrintNodes(cnode);
}
}
void traceDelNodes(Node * node){
for(int i=0;i<node->childNum;i++){
Node * cnode = node->nodeList[i];
traceDelNodes(cnode);
}
if(node!=root){
delete node;
node = NULL;
}
}
private:
Node * root;
};
#endif // AC_H
-----------------------------------------------------------------------------------
main.cpp
#include "AC.h"
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <map>
// Size in bytes (20 MiB) of the buffer used for file contents. The value is
// parenthesized so the expansion stays a single operand inside larger
// expressions such as x / MAX_FILE_CONTENT.
#define MAX_FILE_CONTENT (1024*1024*20)
// Writes the current local time as "YYYY-MM-DD HH:MM:SS:mmm" (milliseconds
// zero-padded to three digits) into timeOutPut, which holds timeLen bytes.
// The result is always NUL-terminated; it is truncated when the buffer is
// too small for the milliseconds and empty when the date part does not fit
// or the time cannot be obtained. Nothing is written when timeOutPut is NULL
// or timeLen is not positive.
void currentTime(char * timeOutPut,int timeLen){
    if(timeOutPut == NULL || timeLen <= 0)
        return;
    timeOutPut[0] = '\0';

    // A single clock read supplies both the seconds and the milliseconds so
    // the two parts cannot come from different instants.
    struct timeval tval;
    if(gettimeofday(&tval,NULL) != 0)
        return;
    time_t seconds = tval.tv_sec;
    struct tm * timeinfo = localtime(&seconds);
    if(timeinfo == NULL)
        return;

    size_t used = strftime(timeOutPut,(size_t)timeLen,"%Y-%m-%d %H:%M:%S:",timeinfo);
    if(used == 0){
        // strftime leaves the buffer unspecified when the text does not fit.
        timeOutPut[0] = '\0';
        return;
    }
    // tv_usec is not an int; convert explicitly to match the format.
    snprintf(timeOutPut+used,(size_t)timeLen-used,"%03d",(int)(tval.tv_usec/1000));
}
// Match callback: statistics are gathered outside the matcher, and this
// function is called once per occurrence found. It prints the current time,
// the matched pattern and the position value handed over by the caller.
void findCallBack(char * matchedStr,int startPos){
    char curTime[32]={0};
    currentTime(curTime,sizeof(curTime));
    // "\n" (a real newline) so every match is printed on its own line.
    printf("end time:%s matchedStr:%s matchedPos:%d\n",curTime,matchedStr,startPos);
}
// Reads the file fileName into fileContent and NUL-terminates the data.
//
// fileContent must provide at least MAX_FILE_CONTENT bytes. At most
// MAX_FILE_CONTENT-1 bytes are read so that the terminator always fits; a
// longer file is truncated. Returns true on success; on failure an error
// message with errno is printed and false is returned.
bool readFile(char * fileName,char * fileContent){
    if(fileName == NULL || fileContent == NULL)
        return false;
    int fd = open(fileName,O_RDONLY);
    if(-1 == fd){
        printf("open file error:%d\n",errno);
        return false;
    }
    const size_t capacity = (size_t)(MAX_FILE_CONTENT) - 1;
    size_t total = 0;
    // read() may return fewer bytes than requested, so keep reading until
    // end of file or until the buffer is full.
    while(total < capacity){
        ssize_t got = read(fd,fileContent+total,capacity-total);
        if(got == 0)
            break;
        if(got < 0){
            if(errno == EINTR)
                continue;   // interrupted by a signal, retry
            printf("read file error:%d\n",errno);
            close(fd);
            return false;
        }
        total += (size_t)got;
    }
    fileContent[total] = '\0';
    close(fd);
    return true;
}
// Runs one complete search: builds the automaton from the patLen strings in
// pattern, prints the tree, prints the start time, scans the first txtLen
// characters of txt reporting matches through findCallBack, and finally
// deletes the tree.
void ac_func(char ** pattern,int patLen,char *txt,int txtLen){
    AC ac;
    ac.initTree(pattern,patLen);
    ac.buildFailNode();
    ac.printACTree();
    char curTime[32]={0};
    currentTime(curTime,sizeof(curTime));
    // "\n" (a real newline) so each message ends its line.
    printf("searching tree-----------------------------\n");
    printf("start time of search:%s\n",curTime);
    ac.match(txt,txtLen,findCallBack);
    ac.deleteTree();
}
// Searches the text "ushers" for the patterns hee, he, h, she, his, her, hers.
void test1(){
    // ac_func takes non-const char*, and binding a string literal to char*
    // is ill-formed since C++11, so the strings live in writable arrays.
    char patternText[][8] = {"hee","he", "h","she", "his" ,"her","hers"};
    const int patternLen = sizeof(patternText)/sizeof(patternText[0]);
    char * pattern[patternLen];
    for(int i=0;i<patternLen;i++)
        pattern[i] = patternText[i];
    char txt[] = "ushers";
    int txtLen = (int)strlen(txt);
    ac_func(pattern,patternLen,txt,txtLen);
}
// Searches a large log file for four keywords. The keywords appear in the
// file in the reverse of the order listed here, and one of them is on the
// last line of the log.
void test2(){
    // Writable arrays: ac_func and readFile take non-const char*, and binding
    // a string literal to char* is ill-formed since C++11.
    char patternText[][32] = {"39_347990193541512029","39_347739859976612007","13_1002D375","2017-09-29 17:43:37:517"};
    const int patternLen = sizeof(patternText)/sizeof(patternText[0]);
    char * pattern[patternLen];
    for(int i=0;i<patternLen;i++)
        pattern[i] = patternText[i];
    char fileName[] = "../testfile/test.log";   // test.log is a 10M log file

    // The content is too large for the stack, so use heap memory. One extra
    // zeroed byte guarantees strlen() finds a terminator even when the file
    // fills MAX_FILE_CONTENT bytes.
    const size_t bufLen = (size_t)(MAX_FILE_CONTENT) + 1;
    char * txt = new char[bufLen];
    memset(txt,0,bufLen);
    if(readFile(fileName,txt)){
        int txtLen = (int)strlen(txt);
        ac_func(pattern,patternLen,txt,txtLen);
    }
    // Released on every path, with delete[] to match new[].
    delete[] txt;
    txt = NULL;
}
// Patterns and text contain one another (s, ss, sss, ssss inside "ssss");
// notepad has a bug with this kind of search.
void test3(){
    // ac_func takes non-const char*, and binding a string literal to char*
    // is ill-formed since C++11, so the strings live in writable arrays.
    char patternText[][8] = {"s", "ss","sss","ssss"};
    const int patternLen = sizeof(patternText)/sizeof(patternText[0]);
    char * pattern[patternLen];
    for(int i=0;i<patternLen;i++)
        pattern[i] = patternText[i];
    char txt[] = "ssss";
    int txtLen = (int)strlen(txt);
    ac_func(pattern,patternLen,txt,txtLen);
}
// Entry point: runs the three demo searches in order and exits with 0.
int main()
{
    typedef void (*TestCase)();
    const TestCase cases[] = { test1, test2, test3 };
    const int caseCount = sizeof(cases)/sizeof(cases[0]);
    for(int idx=0;idx<caseCount;++idx)
        cases[idx]();
    return 0;
}
----------------------------------------------------------------------------------------------------
运行结果部分截图