本拼写检查器基于朴素贝叶斯方法编写，贝叶斯公式及其原理在此不再详述，直接上代码。
import collections
import re
import sys


def words(text):
    """Return every run of ASCII letters in *text*, lower-cased, in order."""
    return re.findall('[a-z]+', text.lower())


def train(features):
    """Count occurrences of each item in *features*.

    The returned defaultdict starts every count at 1, so a word seen k times
    ends up with k + 1 and a word that was never seen reads as 1.
    """
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


def _load_model(path):
    """Train the frequency model from the text file at *path*.

    A missing file produces an empty model and a notice on stderr instead of
    an exception, so the module stays importable without the corpus.  The
    file is read with the platform's default text encoding.
    """
    try:
        with open(path) as corpus:
            text = corpus.read()
    except FileNotFoundError:
        print('corpus file not found: %s; using an empty model' % path,
              file=sys.stderr)
        text = ''
    return train(words(text))


alphabet = 'abcdefghijklmnopqrstuvwxyz'
CORPUS_PATH = 'D:/big.txt'
NWORDS = _load_model(CORPUS_PATH)


def edits1(word):
    """Return the set of strings one edit away from *word*.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a letter of ``alphabet``, or inserting a
    letter of ``alphabet`` at any position.  Replacing a character with
    itself is included, so *word* is in the result when it is non-empty and
    made of letters from ``alphabet``.
    """
    n = len(word)
    deletions = {word[:i] + word[i + 1:] for i in range(n)}
    transpositions = {
        word[:i] + word[i + 1] + word[i] + word[i + 2:] for i in range(n - 1)
    }
    alterations = {
        word[:i] + c + word[i + 1:] for i in range(n) for c in alphabet
    }
    insertions = {
        word[:i] + c + word[i:] for i in range(n + 1) for c in alphabet
    }
    return deletions | transpositions | alterations | insertions


def known_edits2(word):
    """Return the words in NWORDS that are two edits away from *word*."""
    return {e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS}


def known(words):
    """Return the subset of *words* that appear in NWORDS."""
    return {w for w in words if w in NWORDS}


def correct(word):
    """Return the most frequent known candidate for *word*.

    Candidates are tried in order: the word itself if known, known words one
    edit away, known words two edits away, and finally the word unchanged.
    """
    candidates = (known([word]) or known(edits1(word))
                  or known_edits2(word) or [word])
    # .get() rather than NWORDS[w]: indexing a defaultdict inserts the missing
    # key, which would make an unknown word "known" on every later call.
    return max(candidates, key=lambda w: NWORDS.get(w, 1))


if __name__ == '__main__':
    print(correct('tess'))
以我现在的水平，贝叶斯的原理还是能看懂的，但编辑距离还不是很懂，以后再慢慢弄懂。