Python Boyer-Moore 字符串搜索算法:Python 实现

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 Boyer-Moore 字符串搜索算法的 Python 实现及相关知识,希望对你有一定的参考价值。

# coding: utf-8
from copy import deepcopy


def generate_bad_char(string: str) -> {int: {str: int}}:
    """Build the per-position bad-character table for ``string``.

    For every index ``i`` of ``string`` the table holds a dictionary that maps
    each character occurring in ``string[:i]`` to the right-most index
    (strictly below ``i``) at which it occurs.

    :param string: the pattern to pre-process.
    :return: ``{i: {char: last index of char before i}}`` with one entry per
        index of ``string``; an empty dictionary for an empty string.
    """
    bad_char_dic: {int: {str: int}} = {}
    last_seen: {str: int} = {}
    for position, char in enumerate(string):
        # Snapshot *before* recording the current character, so that entry
        # ``position`` only knows about indices strictly to its left.
        # The values are plain ints, so a shallow copy is a full copy.
        bad_char_dic[position] = dict(last_seen)
        last_seen[char] = position

    return bad_char_dic


def generate_prefix(string: str) -> {str: int}:
    """Map each non-empty proper prefix of ``string`` to its last index.

    The full string itself is not included, so a string of length ``n``
    yields ``n - 1`` entries (none for lengths 0 and 1).

    :param string: the text whose prefixes are enumerated.
    :return: ``{prefix: index of the prefix's last character}``.
    """
    return {string[:end + 1]: end for end in range(len(string) - 1)}


def generate_good_suffix(string: str) -> [int]:
    """Build the good-suffix table for ``string``.

    Entry ``k`` (for ``1 <= k < len(string)``) answers: once the last ``k``
    characters of ``string`` have matched, at which index of ``string`` does
    the nearest earlier re-occurrence of that matched text end?

    * If the whole ``k``-character suffix occurs again ending before the last
      character, the entry is the end index of its right-most such
      occurrence.
    * Otherwise the entry is the end index of the longest shorter suffix that
      is also a prefix of ``string``.
    * ``-1`` means neither exists.

    Entry ``0`` is always ``-1`` and only keeps the list indexable by matched
    length.

    :param string: the pattern to pre-process.
    :return: list of ``max(len(string), 1)`` end indices as described above.
    """
    length = len(string)
    # A re-occurrence must end before the final character, otherwise it would
    # be the suffix itself. Slice once instead of on every iteration.
    without_last = string[:-1]

    good_suffix_dic: [int] = [-1]
    for suffix_length in range(1, length):
        current_suffix = string[length - suffix_length:]
        perfect_match = without_last.rfind(current_suffix)
        if perfect_match != -1:
            good_suffix_dic.append(perfect_match + suffix_length - 1)
            continue

        # No full re-occurrence: fall back to the longest strictly shorter
        # suffix that is also a prefix of the pattern (longest tried first).
        sub_suffix_last_seen = -1
        for sub_length in range(suffix_length - 1, 0, -1):
            if string.startswith(string[length - sub_length:]):
                sub_suffix_last_seen = sub_length - 1
                break
        good_suffix_dic.append(sub_suffix_last_seen)

    return good_suffix_dic


def search_for_keyword(haystack: str, needle: str) -> int:
    """Return the index of the first occurrence of ``needle`` in ``haystack``.

    Both arguments are normalised to strings before searching: a list is
    turned into the concatenation of ``str()`` of its elements, anything else
    is passed through ``str()``. The returned index refers to the normalised
    haystack string.

    Patterns of two or more characters that are shorter than the haystack are
    located with a right-to-left window comparison; on a mismatch the window
    advances by the larger of the bad-character and good-suffix shifts.

    :param haystack: text (or list of items) to search in.
    :param needle: text (or list of items) to search for.
    :return: start index of the first match, ``0`` for an empty needle, or
        ``-1`` when there is no match or either argument is ``None``.
    """
    if haystack is None or needle is None:
        return -1

    # isinstance (rather than an exact type check) so list subclasses are
    # joined element-wise too instead of being rendered as "[1, 2, 3]".
    if isinstance(haystack, list):
        haystack = ''.join(map(str, haystack))
    else:
        haystack = str(haystack)

    if isinstance(needle, list):
        needle = ''.join(map(str, needle))
    else:
        needle = str(needle)

    needle_length, haystack_length = len(needle), len(haystack)

    # Trivial cases are settled before any shift table is built.
    if needle_length > haystack_length:
        return -1
    if needle_length == haystack_length:
        return 0 if haystack == needle else -1
    if needle_length == 0:
        return 0
    if needle_length == 1:
        return haystack.find(needle)

    bad_char_dict = generate_bad_char(needle)
    good_suffix_dict = generate_good_suffix(needle)

    # Index in haystack aligned with the last character of the needle.
    window_end = needle_length - 1

    while window_end < haystack_length:
        # Compare right to left; ``matched`` is the length of the good suffix.
        matched = 0
        while (matched < needle_length
               and haystack[window_end - matched] == needle[needle_length - 1 - matched]):
            matched += 1

        if matched == needle_length:
            return window_end - needle_length + 1

        needle_pointer = needle_length - 1 - matched
        bad_char = haystack[window_end - matched]

        # Bad-character shift: line the mismatching haystack character up with
        # its right-most occurrence left of the mismatch (-1 if there is none,
        # which moves the needle completely past that character).
        bad_char_last_seen = bad_char_dict[needle_pointer].get(bad_char, -1)
        bad_char_shift = needle_pointer - bad_char_last_seen

        # Good-suffix shift: only meaningful once something has matched.
        good_suffix_shift = 0
        if matched > 0:
            good_suffix_position = good_suffix_dict[matched]
            good_suffix_shift = needle_length - 1 - good_suffix_position

        # bad_char_shift is always >= 1, so the window always advances.
        window_end += max(bad_char_shift, good_suffix_shift)

    return -1


if __name__ == '__main__':
    # Demo: search a long digit string for a shorter digit string and print
    # the index that search_for_keyword reports.
    demo_haystack = '671111091113298117115999711432117110973297103117106973210111032117110321129710697114'
    demo_needle = '111109111'
    print(search_for_keyword(demo_haystack, demo_needle))

以上是关于 Boyer-Moore 字符串搜索算法的 Python 实现的主要内容。如果未能解决你的问题,请参考以下文章:

哪个是更好的字符串搜索算法? Boyer-Moore 还是 Boyer Moore Horspool? [关闭]

哪个是更好的字符串搜索算法? Boyer-Moore 还是 Boyer Moore Horspool? [关闭]

字符串匹配的 Boyer-Moore 算法

字符串匹配算法之 ---- Boyer-Moore 算法

是否有适用于 Delphi 2010 String(UnicodeString)的 Boyer-Moore 字符串搜索、快速查找替换函数以及快速字符串计数函数?

字符串匹配的Boyer-Moore(BM)算法