c_cpp 使用后缀数组的Logest Repeated Substring(LRS)。复杂性:O(n.log(n))

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了c_cpp 使用后缀数组的Logest Repeated Substring(LRS)。复杂性:O(n.log(n))相关的知识,希望对你有一定的参考价值。

/*====================================================================
 Title: Computes Longest Repeated Substring from a given string
 Complexity: O(n.log(n))
 Author : Sudipto Chandra (Dipu)
 =====================================================================*/
#include <bits/stdc++.h>
using namespace std;
#define mem(a,b) memset(a, b, sizeof(a))
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i)
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i)
/*------------------------------------------------------------------------------------*/

int test, cas = 1;

const int SIZ = 10000050; // maximum possible size

int n; // text length
char T[SIZ]; // text
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array
int L[SIZ]; // used in counting sort
int Phi[SIZ]; // the jumping parameter
int LCP[SIZ]; // longest common prefix
int PLCP[SIZ]; // permuted longest common prefix

inline int getRA(int i)
{
    return (i < n) ? RA[i] : 0;
}

void radix_sort(int k)
{
    mem(L, 0);
    // count frequencies
    for(int i = 0; i < n; ++i)
    {
        L[getRA(i + k)]++;
    }
    // save first index of every characters
    int mx = max(n, 130);
    for(int i = 0, s = 0; i < mx; ++i)
    {
        int x = L[i];
        L[i] = s;
        s += x;
    }
    // build sorted tempSA
    for(int i = 0; i < n; ++i)
    {
        int& x = L[getRA(SA[i] + k)];
        tempSA[x++] = SA[i];
    }
    // copy tempSA to SA
    for(int i = 0; i < n; ++i)
    {
        SA[i] = tempSA[i];
    }
}
// text must ends with a $
void buildSA()
{
    // initialize
    n = strlen(T);
    T[n++] = '$', T[n] = 0; // append $
    for(int i = 0; i < n; ++i)
    {
        SA[i] = i;
        RA[i] = T[i];
    }

    // algorithm loop
    for(int k = 1; k < n; k <<= 1)
    {
        // sort by k-th ranks
        radix_sort(k);
        radix_sort(0);
        // compute new ranks
        tempRA[SA[0]] = 0;
        for(int i = 1, r = 0; i < n; ++i)
        {
            if(getRA(SA[i-1]) != getRA(SA[i])) {
                r++;
            }
            else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) {
                r++;
            }
            tempRA[SA[i]] = r;
        }
        for(int i = 0; i < n; ++i)
        {
            RA[i] = tempRA[i];
        }
        if(RA[SA[n - 1]] == n - 1) break;
    }
}

// Finds the Longest Common Prefix (LCP) between two adjacent suffixes
// excluding the first suffix using the Permuted LCP (PLCP) theorem.
void computeLCP()
{
    Phi[SA[0]] = -1;
    for(int i = 1; i < n; ++i)
    {
        Phi[SA[i]] = SA[i - 1];
    }

    for(int i = 0, L = 0; i < n; ++i)
    {
        if(Phi[i] == -1)
        {
            PLCP[i] = 0;
            continue;
        }
        // longest common prefix length
        L = max(L - 1, 0);
        while(T[i + L] == T[Phi[i] + L])
        {
            L++;
        }
        PLCP[i] = L;
    }

    for(int i = 0; i < n; ++i)
    {
        LCP[i] = PLCP[SA[i]];
    }
}

// LRS length is unique. But LRS substring may not be unique.
// This function returns alphabetically smallest substring
// that has been repeated twice or more.
string findLRS()
{
    int pos, lrs = 0;
    for(int i = 1; i < n; i++)
    {
        if(LCP[i] > lrs)
        {
            lrs = LCP[i];
            pos = i;
        }
    }

    if(lrs == 0)
    {
        return "";
    }

    string res = T + SA[pos];
    return res.substr(0, lrs);
}

int main()
{
    /*
        GATAGACA
        abcabxabcd
    */

    gets(T);
    buildSA();
    computeLCP();
    string res = findLRS();
    printf("Longest Repeated Substring: %s [size = %u]\n", res.data(), res.size());

    return 0;
}

以上是关于c_cpp 使用后缀数组的Logest Repeated Substring(LRS)。复杂性:O(n.log(n))的主要内容,如果未能解决你的问题,请参考以下文章

c_cpp 使用Radix Sort实现后缀数组。复杂性:O(n.log(n))

c_cpp 使用后缀数组查找最长公共前缀(LCP)。复杂性:SA = O(n.log(n)),LCP = O(n)

c_cpp 表达式树 - 中缀到后缀

c_cpp ç判断文件后缀是否符合要求的.c

c_cpp 关于数组使用的一切。

c_cpp 使用数组实现堆栈