ruby 法国Stemmer

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ruby 法国Stemmer相关的知识,希望对你有一定的参考价值。

# -*- encoding: utf-8 -*-
#
# Implementation of the stemming algorithm at http://snowball.tartarus.org/algorithms/french/stemmer.html
# Based on the javascript port made by Kasun Gajasinghe http://snowball.tartarus.org/otherlangs/french_javascript.txt
# 
# Testing:
#   It uses the file voc.txt (http://snowball.tartarus.org/algorithms/french/voc.txt)
#   and compares results with output.txt (http://snowball.tartarus.org/algorithms/french/output.txt)
#   
# At the moment it fails for 242 of the 20403 words; feel free to edit this gist.

# Returns the stem of a French word, following the Snowball French stemming
# algorithm (http://snowball.tartarus.org/algorithms/french/stemmer.html).
#
# The input is lower-cased and stripped of surrounding whitespace before any
# processing, so "Chanter", "chanter" and "chanter\n" all give the same stem.
#
# word - String holding a single French word.
#
# Returns a new lower-case String; the argument itself is never modified.
def stem(word)
    #    Letters in French include the following accented forms,
    #        â   à   ç   ë   é   ê   è   ï   î   ô   û   ù
    #    The following letters are vowels:
    #        a   e   i   o   u   y   â   à   ë   é   ê   è   ï   î   ô   û   ù

    # Normalise first. `original_word` is the reference used later to detect
    # whether a step altered the word, so it must be in the same normalised
    # form as the working copy (otherwise capitalised input, or input with a
    # trailing newline, would take the wrong steps).
    original_word = word.downcase.strip
    word = original_word.dup

    # Mark u/i/y that act as consonants with upper case so that the vowel
    # classes below ignore them; the final downcase undoes the marking.
    word.gsub!(/qu/, 'qU')
    word.gsub!(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/, '\1U\2')
    word.gsub!(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/, '\1I\2')
    word.gsub!(/([aeiouyâàëéêèïîôûù])y/, '\1Y')
    word.gsub!(/y([aeiouyâàëéêèïîôûù])/, 'Y\1')

    # Determine RV (only its start index is needed): after the third letter
    # for words starting with par/col/tap or with two vowels, otherwise after
    # the first vowel that is not the first letter, or the end of the word.
    if word =~ /^(par|col|tap)/ || word =~ /^[aeiouyâàëéêèïîôûù]{2}/
        rv_index = 3
    else
        first_vowel = word[1..-1].to_s =~ /[aeiouyâàëéêèïîôûù]/
        rv_index = first_vowel ? first_vowel + 2 : word.length
    end

    # R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
    # R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
    vowel_pos = word =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
    if vowel_pos
        r1_index = vowel_pos + 2
        r1 = word[r1_index..-1]
    else
        r1_index = word.length
        r1 = ''
    end

    vowel_pos = r1 =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
    # R2 is located relative to the unclamped R1 start computed above.
    r2_index = vowel_pos ? vowel_pos + 2 + r1_index : word.length

    # NOTE(review): the original port forces R1 to start no earlier than
    # index 3; kept as is -- confirm against the algorithm description.
    r1_index = 3 if r1_index < 3

    # Step 1: Standard suffix removal
    #
    # Every aN_index is the start of the matched suffix in the unmodified
    # word, except a12/a15 whose patterns also consume the preceding letter.

    a1_index = word =~ /(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/
    a2_index = word =~ /(atrice|ateur|ation|atrices|ateurs|ations)$/
    a3_index = word =~ /(logie|logies)$/
    a4_index = word =~ /(usion|ution|usions|utions)$/
    a5_index = word =~ /(ence|ences)$/
    a6_index = word =~ /(ement|ements)$/
    a7_index = word =~ /(ité|ités)$/
    a8_index = word =~ /(if|ive|ifs|ives)$/
    a9_index = word =~ /(eaux)$/
    a10_index = word =~ /(aux)$/
    a11_index = word =~ /(euse|euses)$/
    a12_index = word =~ /[^aeiouyâàëéêèïîôûù](issement|issements)$/
    a13_index = word =~ /(amment)$/
    a14_index = word =~ /(emment)$/
    a15_index = word =~ /[aeiouyâàëéêèïîôûù](ment|ments)$/

    # Prefixes are taken with exclusive ranges (0...idx): the inclusive form
    # 0..idx-1 turns into 0..-1 (the whole word) when idx is 0.
    if a1_index && a1_index >= r2_index
        word = word[0...a1_index]
    elsif a2_index && a2_index >= r2_index
        word = word[0...a2_index]
        a2_index2 = word =~ /(ic)$/
        if a2_index2 && a2_index2 >= r2_index
            word = word[0...a2_index2]
        else
            word.gsub!(/(ic)$/, 'iqU')
        end
    elsif a3_index && a3_index >= r2_index
        word.gsub!(/(logie|logies)$/, 'log')
    elsif a4_index && a4_index >= r2_index
        word.gsub!(/(usion|ution|usions|utions)$/, 'u')
    elsif a5_index && a5_index >= r2_index
        word.gsub!(/(ence|ences)$/, 'ent')
    elsif a6_index && a6_index >= rv_index
        word = word[0...a6_index]
        tmp = word =~ /(iv)$/
        if tmp && tmp >= r2_index
            word.gsub!(/(iv)$/, '')
            tmp = word =~ /(at)$/
            if tmp && tmp >= r2_index
                word.gsub!(/(at)$/, '')
            end
        elsif (a6_index2 = word =~ /(eus)$/)
            if a6_index2 >= r2_index
                word = word[0...a6_index2]
            elsif a6_index2 >= r1_index
                word = word[0...a6_index2] + 'eux'
            end
        elsif (tmp = word =~ /(abl|iqU)$/) && tmp >= r2_index
            word.gsub!(/(abl|iqU)$/, '')
        elsif (tmp = word =~ /(ièr|Ièr)$/) && tmp >= rv_index
            word.gsub!(/(ièr|Ièr)$/, 'i')
        end
    elsif a7_index && a7_index >= r2_index
        word = word[0...a7_index]
        if (a7_index2 = word =~ /(abil)$/)
            if a7_index2 >= r2_index
                word = word[0...a7_index2]
            else
                word = word[0...a7_index2] + 'abl'
            end
        elsif (a7_index3 = word =~ /(ic)$/)
            if a7_index3 >= r2_index
                word = word[0...a7_index3]
            else
                word.gsub!(/(ic)$/, 'iqU')
            end
        elsif (tmp = word =~ /(iv)$/) && tmp >= r2_index
            # "iv" is only removed when it lies inside R2 (the original
            # compared with != and so removed it almost unconditionally).
            word.gsub!(/(iv)$/, '')
        end
    elsif a8_index && a8_index >= r2_index
        word = word[0...a8_index]
        tmp = word =~ /(at)$/
        if tmp && tmp >= r2_index
            word.gsub!(/(at)$/, '')
            tmp = word =~ /(ic)$/
            if tmp && tmp >= r2_index
                word.gsub!(/(ic)$/, '')
            else
                word.gsub!(/(ic)$/, 'iqU')
            end
        end
    elsif a9_index
        # Anchored so that only the suffix is rewritten.
        word.sub!(/eaux$/, 'eau')
    elsif a10_index && a10_index >= r1_index
        # Anchored so that an earlier "aux" (e.g. "auxiliaux") is untouched.
        word.sub!(/aux$/, 'al')
    elsif a11_index
        if a11_index >= r2_index
            word = word[0...a11_index]
        elsif a11_index >= r1_index
            word = word[0...a11_index] + 'eux'
        end
    elsif a12_index && a12_index >= r1_index
        # a12_index points at the consonant before the suffix; keep it.
        word = word[0..a12_index]
    elsif a13_index && a13_index >= rv_index
        word.gsub!(/(amment)$/, 'ant')
    elsif a14_index && a14_index >= rv_index
        word.gsub!(/(emment)$/, 'ent')
    elsif a15_index && a15_index >= rv_index
        # a15_index points at the vowel before the suffix; keep it.
        word = word[0..a15_index]
    end

    # Step 2a: Verb suffixes beginning i
    #
    # Runs only when step 1 left the word untouched, or when the input ended
    # in one of the amment/emment/ment(s) suffixes.

    word_step1 = word.clone
    step_2a_done = false
    if original_word == word.downcase || original_word =~ /(amment|emment|ment|ments)$/
        step_2a_done = true
        b1_regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i
        tmp = word =~ b1_regex
        if tmp && tmp >= rv_index
            word.gsub!(b1_regex, '\1')
        end
    end

    # Step 2b: Other verb suffixes (only when step 2a ran and removed nothing)
    if step_2a_done && word_step1 == word
        b2_regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i
        tmp = word =~ b2_regex
        if tmp && tmp >= rv_index
            word.gsub!(b2_regex, '')
        else
            tmp = word =~ /(ions)$/
            if tmp && tmp >= r2_index
                word.gsub!(/(ions)$/, '')
            else
                b3_regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
                tmp = word =~ b3_regex
                if tmp && tmp >= rv_index
                    word.gsub!(b3_regex, '')
                else
                    b3_regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
                    tmp = word =~ b3_regex2
                    if tmp && tmp >= rv_index
                        word.gsub!(b3_regex2, '')
                    end
                end
            end
        end
    end

    if original_word != word.downcase
        # Step 3 (a previous step altered the word)
        if word =~ /Y$/
            word.gsub!(/Y$/, 'i')
        elsif word =~ /ç$/
            word.gsub!(/ç$/, 'c')
        end
    else
        # Step 4 (the word is still unchanged)
        # If the word ends s, not preceded by a, i, o, u, è or s, delete it
        tmp = word =~ /([^aiouès])s$/
        if tmp && tmp >= rv_index
            word.gsub!(/([^aiouès])s$/, '\1')
        end
        e1_index = word =~ /ion$/
        tmp = word =~ /[st]ion$/
        if e1_index && e1_index >= r2_index && tmp && tmp >= rv_index
            word = word[0...e1_index]
        else
            e2_index = word =~ /(ier|ière|Ier|Ière)$/
            if e2_index && e2_index >= rv_index
                word = word[0...e2_index] + 'i'
            else
                tmp = word =~ /e$/
                if tmp && tmp >= rv_index
                    word.gsub!(/e$/, '')
                elsif (tmp = word =~ /guë$/) && tmp >= rv_index
                    word.gsub!(/guë$/, 'gu')
                end
            end
        end
    end

    # Step 5: Undouble
    word.gsub!(/(en|on)(n)$/, '\1')
    word.gsub!(/(ett)$/, 'et')
    word.gsub!(/(el|eil)(l)$/, '\1')

    # Step 6: Un-accent
    word.gsub!(/[éè]([^aeiouyâàëéêèïîôûù]+)$/, 'e\1')

    # Lower-casing also removes the U/I/Y consonant markers.
    word.downcase.strip

end

# TESTS
# Reads voc.txt and compares the stem of each of its lines with the line at
# the same position in output.txt, printing every mismatch and a summary.

# The block form of File.open closes each file as soon as its lines are read.
# IO#readlines is used because IO#lines no longer exists on Ruby 3.
voc_lines = File.open('voc.txt', 'r:UTF-8') { |f| f.readlines }
expected_lines = File.open('output.txt', 'r:UTF-8') { |f| f.readlines }

errors = 0

voc_lines.each_with_index do |l, i|
    stemmed = stem(l)
    # A line missing from output.txt is reported as a mismatch against an
    # empty stem rather than crashing on nil.
    expected = expected_lines[i].to_s.strip
    if stemmed != expected
        # Strip the word so its trailing newline does not split the message.
        puts "Error: #{l.strip} expected: #{expected} actual: #{stemmed}"
        errors += 1
    end
end

puts "#{errors} error(s) found, tested #{expected_lines.length} words/stems"

以上是关于ruby 法国Stemmer的主要内容,如果未能解决你的问题,请参考以下文章

如何使用Stemmer或Lemmatizer来阻止特定的单词

NLTK替换和矫正单词代码示例

时尚|Ruby Fang:律政佳人到设计师,伟大都源于勇敢的开始

谁能告诉我法国朗格勒(langres)的概况?

法国和意大利的小城镇都有哪些????

法国红酒aop是啥等级