Python3自然语言处理——获得文本语料库和词汇资源
Posted 引文空间
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python3自然语言处理——获得文本语料库和词汇资源相关的知识,希望对你有一定的参考价值。
1.获取文本语料库
古腾堡语料库
> import nltk
> nltk.corpus.gutenberg.fileids()
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
from nltk.corpus import gutenberg
gutenberg.fileids()
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
>>> from nltk.corpus import webtext
>>> for fileid in webtext.fileids():
print(fileid)
firefox.txt
grail.txt
overheard.txt
pirates.txt
singles.txt
wine.txt
> chatroom=nps_chat.posts('10-19-20s_706posts.xml')
> chatroom
[['now', 'im', 'left', 'with', 'this', 'gay', 'name'], [':P'], ...]
布朗语料库
from nltk.corpus import brown
brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
路透社语料库
> from nltk.corpus import reuters
> reuters.fileids()
(此处省略2198行)
> reuters.categories()
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']
就职演说语料库
from nltk.corpus import inaugural
inaugural.fileids()
['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt', '1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', '1949-Truman.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', '1965-Johnson.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt', '1985-Reagan.txt', '1989-Bush.txt', '1993-Clinton.txt', '1997-Clinton.txt', '2001-Bush.txt', '2005-Bush.txt', '2009-Obama.txt', '2013-Obama.txt', '2017-Trump.txt']
标注文本语料库
其他语言的语料库
载入你自己的语料库
C:\Users\Administrator\Desktop\examples
from nltk.corpus import PlaintextCorpusReader
>>> corpus_root=r'C:\Users\Administrator\Desktop\examples'
wordlist=PlaintextCorpusReader(corpus_root, '.*')
wordlist.fileids()
['1.txt', '2.txt']
wordlist.words('1.txt')
['There', 'was', 'an', 'old', 'man', 'who', 'lived', ...]
2.条件频率分布
def plot(self, *args, **kwargs):
    """
    Plot the given samples from the conditional frequency distribution.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    :param samples: The samples to plot
    :type samples: list
    :param title: The title for the graph
    :type title: str
    :param conditions: The conditions to plot (default is all)
    :type conditions: list
    """
    try:
        import matplotlib.pyplot as plt  # local import: matplotlib is optional
    except ImportError:
        raise ValueError(
            'The plot function requires matplotlib to be installed.'
            'See http://matplotlib.org/'
        )

    cumulative = _get_kwarg(kwargs, 'cumulative', False)
    percents = _get_kwarg(kwargs, 'percents', False)
    # Drop any requested condition that is not actually present in self.
    conditions = [c for c in _get_kwarg(kwargs, 'conditions', self.conditions()) if c in self]
    title = _get_kwarg(kwargs, 'title', '')
    samples = _get_kwarg(
        kwargs, 'samples', sorted(set(v
                                      for c in conditions
                                      for v in self[c]))
    )  # this computation could be wasted
    if "linewidth" not in kwargs:
        kwargs["linewidth"] = 2
    ax = plt.gca()
    if (len(conditions) != 0):
        freqs = []
        for condition in conditions:
            if cumulative:
                # freqs is a list of lists: one frequency series per condition
                freqs.append(list(self[condition]._cumulative_frequencies(samples)))
                ylabel = "Cumulative Counts"
                legend_loc = 'lower right'
                if percents:
                    # BUG FIX: the original iterated over `freqs` (a list of
                    # lists) and divided by `freqs[len(freqs) - 1]` (a list),
                    # raising "TypeError: unsupported operand type(s) for /:
                    # 'list' and 'list'".  Normalize the series just appended
                    # by its own final cumulative count (the total) instead.
                    # NOTE(review): a condition whose total is 0 still raises
                    # (ZeroDivisionError); the original raised here too.
                    total = freqs[-1][-1]
                    freqs[-1] = [f / total * 100 for f in freqs[-1]]
                    ylabel = "Cumulative Percents"
            else:
                freqs.append([self[condition][sample] for sample in samples])
                ylabel = "Counts"
                legend_loc = 'upper right'
            # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
        i = 0
        for freq in freqs:
            kwargs['label'] = conditions[i]  # one legend entry per condition
            i += 1
            ax.plot(freq, *args, **kwargs)
        ax.legend(loc=legend_loc)
        ax.grid(True, color="silver")
        ax.set_xticks(range(len(samples)))
        ax.set_xticklabels([text_type(s) for s in samples], rotation=90)
        if title:
            ax.set_title(title)
        ax.set_xlabel("Samples")
        ax.set_ylabel(ylabel)
    plt.show()

    return ax
def tabulate(self, *args, **kwargs):
    """
    Tabulate the given samples from the conditional frequency distribution.

    :param samples: The samples to plot
    :type samples: list
    :param conditions: The conditions to plot (default is all)
    :type conditions: list
    :param cumulative: A flag to specify whether the freqs are cumulative (default = False)
    :type title: bool
    """
    cumulative = _get_kwarg(kwargs, 'cumulative', False)
    conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
    # Default sample set: every value seen under any requested condition.
    # This default is computed even when the caller supplies samples, so
    # the work may be wasted.
    default_samples = sorted({v for c in conditions if c in self for v in self[c]})
    samples = _get_kwarg(kwargs, 'samples', default_samples)

    # One row of counts per condition; track the widest numeric cell so
    # every column lines up.
    col_width = max(len("%s" % s) for s in samples)
    freqs = {}
    for cond in conditions:
        if cumulative:
            freqs[cond] = list(self[cond]._cumulative_frequencies(samples))
        else:
            freqs[cond] = [self[cond][s] for s in samples]
        col_width = max(col_width, max(len("%d" % f) for f in freqs[cond]))

    # Header row: a blank corner cell, then the sample labels.
    corner_width = max(len("%s" % c) for c in conditions)
    print(' ' * corner_width, end=' ')
    for s in samples:
        print("%*s" % (col_width, s), end=' ')
    print()
    # Body: one right-aligned row of counts per condition.
    for cond in conditions:
        print("%*s" % (corner_width, cond), end=' ')
        for f in freqs[cond]:
            print("%*d" % (col_width, f), end=' ')
        print()
# Mathematical operators
def __add__(self, other):
    """
    Add counts from two ConditionalFreqDists.

    Returns a new ConditionalFreqDist; neither operand is modified.
    """
    if not isinstance(other, ConditionalFreqDist):
        return NotImplemented
    result = ConditionalFreqDist()
    for cond in self.conditions():
        # BUG FIX: the original indexed `other[cond]` unconditionally.
        # ConditionalFreqDist autovivifies on indexing (as the
        # `result[cond][elem] = count` below relies on), so that lookup
        # inserted an empty entry into *other* for every condition it
        # lacked -- silently mutating an operand of `+`.  Test membership
        # first instead.
        if cond in other:
            newfreqdist = self[cond] + other[cond]
        else:
            # NOTE(review): .copy() keeps any non-positive counts that
            # Counter-style addition would drop; FreqDist counts are
            # normally positive, so this should be equivalent -- confirm.
            newfreqdist = self[cond].copy()
        if newfreqdist:
            result[cond] = newfreqdist
    for cond in other.conditions():
        if cond not in self.conditions():
            # Copy only the strictly positive counts from conditions
            # that exist solely in `other`.
            for elem, count in other[cond].items():
                if count > 0:
                    result[cond][elem] = count
    return result
> import nltk
> from nltk.corpus import brown
> cfd=nltk.ConditionalFreqDist(
(genre,word)
for genre in brown.categories()
for word in brown.words(categories=genre))
> genres=['news','religion','hobbies','science_fiction','romance','humor']
> modals=['can','could','may','might','must','will']
> cfd.tabulate(conditions=genres,samples=modals)
can could may might must will
news 93 86 66 38 50 389
religion 82 59 78 12 54 71
hobbies 268 58 131 22 83 264
science_fiction 16 49 4 12 8 16
romance 74 193 11 51 45 43
humor 16 30 8 8 9 13
import nltk
from nltk.corpus import inaugural
cfd=nltk.ConditionalFreqDist(
(target,fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in['america','citizen']
if w.lower().startswith(target))
cfd.plot()
import nltk
from nltk.corpus import udhr
languages=['Chickasaw','English','German_Deutsch','Greenlandic_Inuktikut','Hungarian_Magyar','Ibibio_Efik']
cfd=nltk.ConditionalFreqDist(
(lang,len(word))
for lang in languages
for word in udhr.words(lang+'-Latin1'))
cfd.plot(cumulative=True)
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\NLP\5.py", line 9, in <module>
cfd.plot(cumulative=True,percents=True)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\nltk\probability.py", line 1935, in plot
freqs[-1] = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\nltk\probability.py", line 1935, in <listcomp>
freqs[-1] = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
TypeError: unsupported operand type(s) for /: 'list' and 'list'
unsupported operand type(s) for /: 'list' and 'list'
import nltk
names=nltk.corpus.names
cfd=nltk.ConditionalFreqDist(
(fileid,name[-1])
for fileid in names.fileids()
for name in names.words(fileid))
cfd.plot()
以上是关于Python3自然语言处理——获得文本语料库和词汇资源的主要内容,如果未能解决你的问题,请参考以下文章