'''
Feature extraction from text
# + The sklearn.feature_extraction.text submodule gathers utilities to build feature vectors from text documents.
# > **feature_extraction.text.CountVectorizer([…])** Convert a collection of text documents to a matrix of token counts.
# > **feature_extraction.text.HashingVectorizer([…])** Convert a collection of text documents to a matrix of token occurrences.
# > **feature_extraction.text.TfidfTransformer([…])** Transform a count matrix to a normalized tf or tf-idf representation.
# > **feature_extraction.text.TfidfVectorizer([…])** Convert a collection of raw documents to a matrix of TF-IDF features.
'''
## **CountVectorizer**
# Convert a collection of text documents to a matrix of token counts.
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Toy corpus; it is also the input of the TfidfVectorizer example further down.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# Fit the vectorizer on the corpus and build the token-count matrix in one call.
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# script keeps running on both. The value is discarded, as in a notebook cell.
if hasattr(vectorizer, 'get_feature_names_out'):
    vectorizer.get_feature_names_out()
else:
    vectorizer.get_feature_names()
# Densify the count matrix; the result is likewise discarded when run as a script.
X.toarray()
## TfidfTransformer
# Transform a count matrix to a normalized tf or tf-idf representation.
from sklearn.feature_extraction.text import TfidfTransformer
# Instantiate the transformer with idf smoothing switched off (smooth_idf=False).
# NOTE(review): `transformer` is only constructed here; it is not fitted or
# applied anywhere in the visible part of this file.
transformer = TfidfTransformer(smooth_idf=False)
## TfidfVectorizer
# Combines all the options of CountVectorizer and TfidfTransformer in a single
# model: converts a collection of raw documents to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
# Rebinds the module-level name `vectorizer`; the CountVectorizer instance
# created earlier in the file is no longer reachable through it afterwards.
vectorizer = TfidfVectorizer()
# Fit on the same `corpus` list defined above. The returned matrix is not
# assigned to anything, so it is discarded when this runs as a script.
vectorizer.fit_transform(corpus)