NLTK替换和矫正单词代码示例
Posted 光英的记忆
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了NLTK替换和矫正单词代码示例相关的知识,希望对你有一定的参考价值。
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
# import enchant
# from nltk.metrics import edit_distance
# Stemming: reduce each word to a stem with NLTK's Porter stemmer.
stemmer = PorterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))

# Lemmatization: look up the dictionary form of each word through WordNet.
# The second call passes pos='v' so 'cooking' is treated as a verb; the other
# calls rely on the lemmatizer's default part of speech.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', pos='v'))
print(lemmatizer.lemmatize('cookbooks'))

# Run the same word through both tools to compare them. The results are
# printed (a bare expression statement would silently discard them when the
# file is run as a script), and the stemmer created above is reused.
print(stemmer.stem('believes'))
print(lemmatizer.lemmatize('believes'))
# Regex-based word replacement: each entry is (pattern, replacement template).
# Order matters: the specific contractions are listed before the generic
# suffix rules so that, e.g., "won't" becomes "will not" instead of being
# split by the generic n't rule into "wo not".
replacement_patterns = [
    (r"won't", 'will not'),
    (r"can't", 'cannot'),
    (r"i'm", 'i am'),
    (r"ain't", 'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]


class RegexpReplacer(object):
    """Apply an ordered list of regex substitutions to a piece of text.

    Args:
        patterns: iterable of ``(regex, replacement)`` pairs. Each regex is
            compiled once at construction time. Defaults to the module-level
            ``replacement_patterns`` (English contraction expansion).
    """

    def __init__(self, patterns=replacement_patterns):
        # Compile up front so replace() does no pattern parsing per call.
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern applied in order.

        Substitutions are sequential: each pattern runs on the output of the
        previous one, so earlier rules take precedence over later ones.
        """
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        return s
# Demo: run two sample sentences through the default replacer and print
# each result on its own line.
replacer = RegexpReplacer()
for sentence in (
    "can't is a contraction",
    "I should've done that thing I didn't do",
):
    print(replacer.replace(sentence))
# 移除重复字符
# 使用Enchant进行拼写校正
# pip install pyenchant
# 替换同义词
# from replacers import WordReplacer
# replacer = WordReplacer({'bday': 'birthday'})
# replacer.replace('bday')
# 使用反义词替换否定形式
以上是关于NLTK替换和矫正单词代码示例的主要内容,如果未能解决你的问题,请参考以下文章