python 字频率

Posted

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了 Python 字频率相关的知识，希望对你有一定的参考价值。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

import plac
import os
import io
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
from pathlib import Path

from spacy.strings import StringStore
from spacy.attrs import ORTH

from spacy.util import get_lang_class


def count_freqs(Language, input_loc, output_loc):
    """Tokenize one text file and write its per-token frequencies.

    The whole file at ``input_loc`` is read, tokenized with the tokenizer
    created from ``Language.Defaults`` and counted by ``ORTH``. Every counted
    string that is not pure whitespace is written to ``output_loc`` as one
    tab-separated ``freq, string`` line.

    Args:
        Language: Language class exposing ``Defaults.create_tokenizer()``.
        input_loc: ``pathlib.Path`` of the text file to read.
        output_loc: ``pathlib.Path`` of the frequency file to write.
    """
    print('Counting frequencies %s' % str(input_loc))
    tokenizer = Language.Defaults.create_tokenizer()

    counts = PreshCounter()
    # Explicit UTF-8: the default would be the platform's locale encoding,
    # which may be unable to decode the input text.
    doc = tokenizer(input_loc.read_text(encoding='utf8'))
    doc.count_by(ORTH, counts=counts)

    # Written as UTF-8 so merge_counts (which reads UTF-8) can always load it.
    with output_loc.open('w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            # Whitespace-only tokens would break the tab-separated format.
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))


def parallelize(func, iterator, n_jobs):
    """Call ``func(*item)`` for every item of ``iterator`` via joblib.

    Args:
        func: Callable to run for each item.
        iterator: Iterable of argument tuples, unpacked into ``func``.
        n_jobs: Value passed to ``joblib.Parallel`` as ``n_jobs``.
    """
    Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator)


def merge_counts(freqs_dir, out_loc):
    """Merge every frequency file in ``freqs_dir`` into a single file.

    Each input line is expected to hold ``freq`` and ``word`` separated by a
    tab (the format written by ``count_freqs``). For every word the
    frequencies are summed and a second counter is incremented once per
    input line mentioning the word. The result is written to ``out_loc`` as
    tab-separated ``count, doc_count, word`` lines.

    Args:
        freqs_dir: ``pathlib.Path`` of the directory with frequency files.
        out_loc: Location of the merged output file.
    """
    string_map = StringStore()
    counts = PreshCounter()
    doc_counts = PreshCounter()
    for loc in freqs_dir.iterdir():
        # Same encoding as the writer in count_freqs, independent of locale.
        with loc.open('r', encoding='utf8') as file_:
            for line in file_:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing on the unpack.
                    continue
                freq, word = line.split('\t', 1)
                orth = string_map.add(word)
                counts.inc(orth, int(freq))
                doc_counts.inc(orth, 1)
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%d\t%s\n' % (count, doc_counts[orth], string))


@plac.annotations(
    lang=("Language to tokenize", "positional"),
    input_loc=("Directory path for input text files", "positional", None, Path),
    freqs_dir=("Directory for intermediate frequency files", "positional", None, Path),
    output_loc=("Location for output file", "positional", None, Path),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(lang, input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    """Count token frequencies for each input file, then merge the results.

    Args:
        lang: Language identifier passed to ``get_lang_class``.
        input_loc: Directory whose files are tokenized.
        freqs_dir: Directory for the per-file frequency files; created if
            it does not exist.
        output_loc: Location of the merged output file.
        n_jobs: Number of parallel workers.
        skip_existing: If true, inputs whose frequency file already exists
            are not recounted.
    """
    Language = get_lang_class(lang)
    os.makedirs(freqs_dir, exist_ok=True)
    tasks = []
    for input_path in input_loc.iterdir():
        # Path objects are always truthy, so test the entry itself:
        # directories and other non-files cannot be read as text.
        if not input_path.is_file():
            continue
        output_path = freqs_dir / input_path.name.replace('bz2', 'freq')
        if skip_existing and output_path.exists():
            continue
        tasks.append((Language, input_path, output_path))

    if tasks:
        parallelize(count_freqs, tasks, n_jobs)

    print("Merge")
    merge_counts(freqs_dir, output_loc)


# Script entry point: plac.call invokes main with the arguments it parses
# from the command line.
if __name__ == '__main__':
    plac.call(main)

以上是关于 Python 字频率的主要内容，如果未能解决你的问题，请参考以下文章：

PHP 字频率计数

Python 自然语言处理字频统计

基于汉字字频特征实现99.99%准确率的新闻文本分类器

基于汉字字频特征实现99.99%准确率的新闻文本分类器

基于汉字字频特征实现99.99%准确率的新闻文本分类器

javascript 字频#js