ruby 法国Stemmer
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ruby 法国Stemmer相关的知识,希望对你有一定的参考价值。
# -*- encoding: utf-8 -*-
#
# Implementation of the stemming algorithm at http://snowball.tartarus.org/algorithms/french/stemmer.html
# Based on the javascript port made by Kasun Gajasinghe http://snowball.tartarus.org/otherlangs/french_javascript.txt
#
# Testing:
# It uses the file voc.txt (http://snowball.tartarus.org/algorithms/french/voc.txt)
# and compares results with output.txt (http://snowball.tartarus.org/algorithms/french/output.txt)
#
# At the moment it fails for 242 of the 20403 words; feel free to edit this gist.
#
# Stems a single French word with the Snowball French algorithm referenced in
# the file header.
#
# @param word [String] word to stem; letter case and surrounding whitespace
#   (such as the newline of a line read from a file) are ignored
# @return [String] the stem, lower-cased and stripped
def stem(word)
  # Letters in French include the following accented forms,
  #   â à ç ë é ê è ï î ô û ù
  # The following letters are vowels:
  #   a e i o u y â à ë é ê è ï î ô û ù

  # Normalise before anything else. The untouched copy is compared with the
  # working word later on to find out whether step 1 removed something, so it
  # must be in the same (lower) case; otherwise capitalised input would skip
  # steps 2a/2b/4. Stripping keeps a trailing newline from being treated as a
  # non-vowel by the regexps below.
  word = word.downcase.strip
  original_word = word.dup

  # Uppercase some letters so that they are not treated as vowels later on
  word.gsub!(/qu/, 'qU')
  word.gsub!(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/, '\1U\2')
  word.gsub!(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/, '\1I\2')
  word.gsub!(/([aeiouyâàëéêèïîôûù])y/, '\1Y')
  word.gsub!(/y([aeiouyâàëéêèïîôûù])/, 'Y\1')

  # Determine RV (only its start index is needed)
  if word =~ /^(par|col|tap)/ || word =~ /^[aeiouyâàëéêèïîôûù]{2}/
    rv_index = 3
  else
    # First vowel that is not the first letter; RV starts right after it.
    # The match index is relative to word[1..-1], hence +2.
    rv_index = word[1..-1] =~ /[aeiouyâàëéêèïîôûù]/
    rv_index = rv_index ? rv_index + 2 : word.length
  end

  # R1 is the region after the first non-vowel following a vowel, or the end
  # of the word if there is no such non-vowel.
  # R2 is the region after the first non-vowel following a vowel in R1, or the
  # end of the word if there is no such non-vowel.
  r1_index = word =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
  r1 = ''
  if r1_index
    r1_index += 2
    r1 = word[r1_index..-1]
  else
    r1_index = word.length
  end

  r2_index = r1 =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
  r2_index = r2_index ? r2_index + 2 + r1_index : word.length

  # NOTE(review): this minimum of 3 is not part of the R1 definition quoted
  # above; it is kept from the original implementation. R2 was computed from
  # the unadjusted R1.
  r1_index = 3 if r1_index < 3

  # Step 1: Standard suffix removal
  # NOTE(review): the suffix groups are tried in this fixed order, not
  # longest-match-first, so e.g. "ement" wins over "issement".
  a1_index = word =~ /(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/
  a2_index = word =~ /(atrice|ateur|ation|atrices|ateurs|ations)$/
  a3_index = word =~ /(logie|logies)$/
  a4_index = word =~ /(usion|ution|usions|utions)$/
  a5_index = word =~ /(ence|ences)$/
  a6_index = word =~ /(ement|ements)$/
  a7_index = word =~ /(ité|ités)$/
  a8_index = word =~ /(if|ive|ifs|ives)$/
  a9_index = word =~ /(eaux)$/
  a10_index = word =~ /(aux)$/
  a11_index = word =~ /(euse|euses)$/
  # a12 and a15 point at the letter *preceding* the suffix
  a12_index = word =~ /[^aeiouyâàëéêèïîôûù](issement|issements)$/
  a13_index = word =~ /(amment)$/
  a14_index = word =~ /(emment)$/
  a15_index = word =~ /[aeiouyâàëéêèïîôûù](ment|ments)$/

  if a1_index && a1_index >= r2_index
    word = word[0..a1_index - 1]
  elsif a2_index && a2_index >= r2_index
    word = word[0..a2_index - 1]
    a2_index2 = word =~ /(ic)$/
    if a2_index2 && a2_index2 >= r2_index
      word = word[0..a2_index2 - 1]
    else
      word.gsub!(/(ic)$/, 'iqU')
    end
  elsif a3_index && a3_index >= r2_index
    word.gsub!(/(logie|logies)$/, 'log')
  elsif a4_index && a4_index >= r2_index
    word.gsub!(/(usion|ution|usions|utions)$/, 'u')
  elsif a5_index && a5_index >= r2_index
    word.gsub!(/(ence|ences)$/, 'ent')
  elsif a6_index && a6_index >= rv_index
    word = word[0..a6_index - 1]
    tmp = word =~ /(iv)$/
    if tmp && tmp >= r2_index
      word.gsub!(/(iv)$/, '')
      tmp = word =~ /(at)$/
      word.gsub!(/(at)$/, '') if tmp && tmp >= r2_index
    elsif (a6_index2 = word =~ /(eus)$/)
      if a6_index2 >= r2_index
        word = word[0..a6_index2 - 1]
      elsif a6_index2 >= r1_index
        word = word[0..a6_index2 - 1] + 'eux'
      end
    elsif (tmp = word =~ /(abl|iqU)$/) && tmp >= r2_index
      word.gsub!(/(abl|iqU)$/, '')
    elsif (tmp = word =~ /(ièr|Ièr)$/) && tmp >= rv_index
      word.gsub!(/(ièr|Ièr)$/, 'i')
    end
  elsif a7_index && a7_index >= r2_index
    word = word[0..a7_index - 1]
    if (a7_index2 = word =~ /(abil)$/)
      if a7_index2 >= r2_index
        word = word[0..a7_index2 - 1]
      else
        word = word[0..a7_index2 - 1] + 'abl'
      end
    elsif (a7_index3 = word =~ /(ic)$/)
      if a7_index3 >= r2_index
        word = word[0..a7_index3 - 1]
      else
        word.gsub!(/(ic)$/, 'iqU')
      end
    elsif (tmp = word =~ /(iv)$/) && tmp >= r2_index
      # "iv" is only deleted when it lies in R2 (was `!=`, which deleted it
      # almost everywhere, e.g. "activité" became "act").
      word.gsub!(/(iv)$/, '')
    end
  elsif a8_index && a8_index >= r2_index
    word = word[0..a8_index - 1]
    tmp = word =~ /(at)$/
    if tmp && tmp >= r2_index
      word.gsub!(/(at)$/, '')
      tmp = word =~ /(ic)$/
      if tmp && tmp >= r2_index
        word.gsub!(/(ic)$/, '')
      else
        word.gsub!(/(ic)$/, 'iqU')
      end
    end
  elsif a9_index
    # Anchored so that only the suffix that was tested above is rewritten
    word.gsub!(/(eaux)$/, 'eau')
  elsif a10_index && a10_index >= r1_index
    word.gsub!(/(aux)$/, 'al')
  elsif a11_index
    if a11_index >= r2_index
      word = word[0..a11_index - 1]
    elsif a11_index >= r1_index
      word = word[0..a11_index - 1] + 'eux'
    end
  elsif a12_index && a12_index >= r1_index
    # Keep the preceding non-vowel (inclusive range on purpose)
    word = word[0..a12_index]
  elsif a13_index && a13_index >= rv_index
    word.gsub!(/(amment)$/, 'ant')
  elsif a14_index && a14_index >= rv_index
    word.gsub!(/(emment)$/, 'ent')
  elsif a15_index && a15_index >= rv_index
    # Keep the preceding vowel (inclusive range on purpose)
    word = word[0..a15_index]
  end

  # Step 2a: Verb suffixes beginning i
  # Runs when step 1 left the word unchanged, or when the original word ended
  # in amment/emment/ment/ments.
  word_step1 = word.dup
  step_2a_done = false
  if original_word == word.downcase || original_word =~ /(amment|emment|ment|ments)$/
    step_2a_done = true
    # /i makes the pattern match the upper-cased marker letters as well
    b1_regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i
    tmp = word =~ b1_regex
    word.gsub!(b1_regex, '\1') if tmp && tmp >= rv_index
  end

  # Step 2b: Other verb suffixes (only when step 2a ran and removed nothing)
  if step_2a_done && word_step1 == word
    b2_regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i
    tmp = word =~ b2_regex
    if tmp && tmp >= rv_index
      word.gsub!(b2_regex, '')
    else
      tmp = word =~ /(ions)$/
      if tmp && tmp >= r2_index
        word.gsub!(/(ions)$/, '')
      else
        # Same suffix list twice: first with a preceding "e" that is removed
        # together with the suffix, then without it.
        b3_regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
        tmp = word =~ b3_regex
        if tmp && tmp >= rv_index
          word.gsub!(b3_regex, '')
        else
          b3_regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
          tmp = word =~ b3_regex2
          word.gsub!(b3_regex2, '') if tmp && tmp >= rv_index
        end
      end
    end
  end

  if original_word != word.downcase
    # Step 3 (the word was altered by an earlier step)
    if word =~ /Y$/
      word.gsub!(/Y$/, 'i')
    elsif word =~ /ç$/
      word.gsub!(/ç$/, 'c')
    end
  else
    # Step 4 (nothing was removed so far)
    # If the word ends s, not preceded by a, i, o, u, è or s, delete it
    tmp = word =~ /([^aiouès])s$/
    word.gsub!(/([^aiouès])s$/, '\1') if tmp && tmp >= rv_index

    e1_index = word =~ /ion$/
    tmp = word =~ /[st]ion$/
    if e1_index && e1_index >= r2_index && tmp && tmp >= rv_index
      word = word[0..e1_index - 1]
    else
      e2_index = word =~ /(ier|ière|Ier|Ière)$/
      if e2_index && e2_index >= rv_index
        word = word[0..e2_index - 1] + 'i'
      else
        tmp = word =~ /e$/
        if tmp && tmp >= rv_index
          word.gsub!(/e$/, '')
        elsif (tmp = word =~ /guë$/) && tmp >= rv_index
          word.gsub!(/guë$/, 'gu')
        end
      end
    end
  end

  # Step 5: Undouble
  word.gsub!(/(en|on)(n)$/, '\1')
  word.gsub!(/(ett)$/, 'et')
  word.gsub!(/(el|eil)(l)$/, '\1')

  # Step 6: Un-accent
  word.gsub!(/[éè]([^aeiouyâàëéêèïîôûù]+)$/, 'e\1')

  # Downcasing turns the U/I/Y markers back into ordinary letters
  word.downcase.strip
end
# TESTS
# Reads voc.txt and compares the stem of each word with the matching line of
# output.txt, printing every mismatch and a final summary.
# File.readlines closes the files itself (File#lines is gone in Ruby 3).
voc_lines = File.readlines('voc.txt', encoding: 'UTF-8')
expected_lines = File.readlines('output.txt', encoding: 'UTF-8')
errors = 0
voc_lines.each_with_index do |line, i|
  # Strip the line terminator so it neither reaches the stemmer nor splits
  # the error message over two lines.
  word = line.strip
  stemmed = stem(word)
  # A missing line in output.txt counts as a mismatch instead of raising
  expected_stem = expected_lines[i].to_s.strip
  next if stemmed == expected_stem

  puts "Error: #{word} expected: #{expected_stem} actual: #{stemmed}"
  errors += 1
end
puts "#{errors} error(s) found, tested #{expected_lines.length} words/stems"
以上是关于ruby 法国Stemmer的主要内容,如果未能解决你的问题,请参考以下文章
如何使用Stemmer或Lemmatizer来阻止特定的单词