Python 词频统计
Posted
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了 Python 词频统计相关的知识，希望对你有一定的参考价值。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import plac
import os
import io
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
from pathlib import Path
from spacy.strings import StringStore
from spacy.attrs import ORTH
from spacy.util import get_lang_class
def count_freqs(Language, input_loc, output_loc):
    """Tokenize one text file and write its per-token frequencies.

    Args:
        Language: Language class; its ``Defaults.create_tokenizer()`` is
            used to build the tokenizer.
        input_loc: Path object of the text file to read.
        output_loc: Path object of the frequency file to write. Each line
            holds a count and a token separated by a tab character.
    """
    print('Counting frequencies %s' % str(input_loc))
    tokenizer = Language.Defaults.create_tokenizer()
    counts = PreshCounter()
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, so non-ASCII corpora behave the same on every platform.
    doc = tokenizer(input_loc.read_text(encoding='utf8'))
    doc.count_by(ORTH, counts=counts)
    # Write UTF-8 as well, matching the encoding of the merged output file.
    with output_loc.open('w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            # Whitespace-only tokens are dropped from the frequency table.
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
def parallelize(func, iterator, n_jobs):
    """Call ``func(*item)`` for every item of ``iterator`` via joblib.

    Args:
        func: Callable applied to each unpacked item.
        iterator: Iterable of argument tuples.
        n_jobs: Value forwarded to ``Parallel(n_jobs=...)``.
    """
    jobs = (delayed(func)(*args) for args in iterator)
    runner = Parallel(n_jobs=n_jobs)
    runner(jobs)
def merge_counts(freqs_dir, out_loc):
    """Merge per-file frequency files into a single frequency table.

    Every file in ``freqs_dir`` is expected to contain lines made of a
    count and a word separated by a tab. The merged file written to
    ``out_loc`` has three tab-separated columns per line: the summed
    count, the number of input lines the word appeared on (one per file
    when each file lists a word once), and the word itself.

    Args:
        freqs_dir: Path object of the directory holding the frequency files.
        out_loc: Location of the merged output file.

    Raises:
        ValueError: If a non-blank line has no tab separator or its count
            is not an integer.
    """
    string_map = StringStore()
    counts = PreshCounter()
    doc_counts = PreshCounter()
    for loc in freqs_dir.iterdir():
        # Read as UTF-8 explicitly so the input side agrees with the
        # UTF-8 output below regardless of the locale's default encoding.
        with loc.open('r', encoding='utf8') as file_:
            for line in file_:
                line = line.strip()
                # A blank line cannot be unpacked into (freq, word); skip it
                # rather than aborting the whole merge.
                if not line:
                    continue
                freq, word = line.split('\t', 1)
                orth = string_map.add(word)
                counts.inc(orth, int(freq))
                doc_counts.inc(orth, 1)
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%d\t%s\n' % (count, doc_counts[orth], string))
@plac.annotations(
    lang=("Language to tokenize", "positional"),
    input_loc=("Directory path for input text files", "positional", None, Path),
    freqs_dir=("Directory for intermediate frequency files", "positional", None, Path),
    output_loc=("Location for output file", "positional", None, Path),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(lang, input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    """Count token frequencies for every input file, then merge the results.

    Args:
        lang: Language identifier passed to ``get_lang_class``.
        input_loc: Directory containing the input text files.
        freqs_dir: Directory for the per-file frequency files; created if
            it does not exist.
        output_loc: Location of the merged output file.
        n_jobs: Number of parallel workers.
        skip_existing: When true, inputs whose frequency file already
            exists are not recounted.
    """
    Language = get_lang_class(lang)
    os.makedirs(freqs_dir, exist_ok=True)
    tasks = []
    for input_path in input_loc.iterdir():
        # Path objects are always truthy, so a plain truthiness check can
        # never filter anything. Skip entries that are not regular files
        # (e.g. sub-directories), which cannot be read as text.
        if not input_path.is_file():
            continue
        filename = input_path.name
        # NOTE(review): only the file name is rewritten ('bz2' -> 'freq');
        # count_freqs reads the input as plain text, so compressed inputs
        # presumably need to be decompressed beforehand -- confirm.
        output_path = freqs_dir / filename.replace('bz2', 'freq')
        if skip_existing and output_path.exists():
            continue
        tasks.append((Language, input_path, output_path))
    if tasks:
        parallelize(count_freqs, tasks, n_jobs)
    print("Merge")
    merge_counts(freqs_dir, output_loc)
# Script entry point: hand main() to plac, which builds the command-line
# interface from the annotations declared on it.
if __name__ == '__main__':
    plac.call(main)
以上是关于 Python 词频统计的主要内容，如果未能解决你的问题，请参考以下文章：