# Tokenize Yelp reviews with Python (spaCy).
# NOTE: this file was recovered from a scraped blog post; the original
# Chinese preamble said: "Introductory note: this article, compiled by the
# cha138.com editors, introduces tokenizing Yelp reviews with Python and
# is offered for reference."
# -*- coding: utf-8 -*-
"""Script to tokenize yelp reviews with spaCy.
Input
=====
We use the review file in the yelp academic dataset
for this gist See https://www.yelp.com/dataset_challenge.
``` python
{
u'business_id': u'2aFiy99vNLklCx3T_tGS9A',
u'cool': 0,
u'date': u'2011-10-10',
u'funny': 0,
u'review_id': u'NxL8SIC5yqOdnlXCg18IBg',
u'stars': 5,
u'text': u"If you enjoy service by someone who is as...",
u'type': u'review',
u'useful': 0,
u'user_id': u'KpkOkG6RIf4Ra25Lhhxf1A',
}
```
Output
======
A pandas Dataframe where each row corresponds to single
sentence of a review.
review_id sent_num tokens
32621 B8xpcb3VRV8BtJ8YM17_HQ 3 [Nobody, pushed, me, to, get, gel, color, ,, and, I, 've, found, some, ...
20817 RKTKuMOxsimvWRfI9p-J0g 0 [This, is, a, default, corporate, watering, hole, for, the, nearby, off...
"""
from __future__ import division
from __future__ import unicode_literals
import simplejson as json
import codecs
import pandas as pd # optional
import tqdm # optional
import spacy
from itertools import izip
from pprint import pprint
# Path to the reviews file of the yelp's academic dataset.
REVIEWS_FILE = 'path/to/file'
def load_data(path, test_frac=0.3, n_reviews=None):
    """Load yelp reviews into a DataFrame of per-sentence tokens.

    :param path: Path to the reviews file of the yelp academic dataset.
    :type path: str
    :param test_frac: Unused; retained for backward compatibility with
        existing callers.
    :type test_frac: float
    :param n_reviews: Maximum number of reviews to read. If None we
        read all reviews.
    :type n_reviews: int or NoneType
    :returns: DataFrame with one row per sentence, columns
        ``review_id``, ``sent_num``, ``tokens``.
    :rtype: pandas.DataFrame
    """
    nlp = spacy.load('en')
    # We only want to use the parser for this task. This
    # lets us skip POS-tagging and NER detection.
    nlp.pipeline = [nlp.parser]
    return pd.DataFrame(
        iter_file_sent_tokens(
            # BUG FIX: was `path=paths.reviews` — `paths` is undefined
            # in this module; the function's own `path` argument was
            # silently ignored and the call raised NameError.
            path=path,
            nlp=nlp,
            n_reviews=n_reviews,
        ),
    )
def iter_file_sent_tokens(path, nlp, n_reviews=None):
    """Walk the review file and yield one dict per sentence.

    :param path: Path to the reviews file.
    :type path: str
    :param nlp: Spacy tokenizer with parsing.
    :type nlp: spacy.en.English
    :param n_reviews: Maximum number of reviews to iter over. If
        None we iter over all reviews.
    :type n_reviews: int or NoneType
    :returns: Iterator over a dict. See sample output.
    :rtype: iter(dict)
    Sample output::
        {'review_id': 'iamid', 'sent_num': 0, 'tokens': ['Every', 'villain', 'is', 'lemons']}
    """
    # The file is read twice in lockstep: once for the json metadata
    # (review ids) and once for the raw text fed to spaCy's batched
    # pipeline. izip keeps the two streams aligned review-by-review.
    metadata_stream = iter_review_dict(path=path, n_reviews=n_reviews)
    doc_stream = nlp.pipe(
        iter_review_text(path=path, n_reviews=n_reviews),
        batch_size=2500,
        n_threads=4,
    )
    for review_dict, review_doc in izip(metadata_stream, doc_stream):
        for sentence_index, sentence in enumerate(review_doc.sents):
            yield {
                'review_id': review_dict['review_id'],
                'sent_num': sentence_index,
                'tokens': map(unicode, sentence),
            }
def iter_review_dict(path, n_reviews=None):
    """Yield each line of the review file parsed as a json dict.

    Stops early after ``n_reviews`` reviews when ``n_reviews`` is
    truthy; otherwise iterates the whole file. Progress is reported
    via tqdm (``total`` is only used for the progress-bar estimate).
    """
    with codecs.open(path, encoding='utf-8') as infile:
        seen = 0
        for line in tqdm.tqdm(infile, total=n_reviews):
            if n_reviews and seen == n_reviews:
                return
            seen += 1
            yield json.loads(line)
def iter_review_text(path, n_reviews=None):
    """Yield only the raw ``text`` field of each review in the file."""
    for record in iter_review_dict(path=path, n_reviews=n_reviews):
        yield record['text']
# Script entry point: tokenize the first 10000 reviews into a DataFrame
# of sentences and show a sample. NOTE: test_frac is accepted by
# load_data but never used inside it.
df = load_data(
    path=REVIEWS_FILE,
    test_frac=0.3,
    n_reviews=10000,
)
# Python 2 print statement (consistent with the izip/unicode usage above).
print df.head(2)
#               review_id  sent_num                                             tokens
# 32621  B8xpcb3VRV8BtJ8YM17_HQ  3  [Nobody, pushed, me, to, get, gel, color, ,, and, I, 've, found, some, ...
# 20817  RKTKuMOxsimvWRfI9p-J0g  0  [This, is, a, default, corporate, watering, hole, for, the, nearby, off...
# --- End of gist. The scraped article's footer ("The above covers
# 'python Tokenize Yelp'; if this did not solve your problem, see the
# following related articles") and its link list are preserved below as
# comments so the file remains valid Python:
#   - NLTK Python word_tokenize [duplicate]
#   - Error installing TA-Lib on Ubuntu: Command "/usr/bin/python -u -c
#     "import setuptools, tokenize;__file..."