from nlpbook import get_train_test_data
from sklearn.feature_extraction.text import CountVectorizer
# Load the training and test data.
train_df, test_df = get_train_test_data()

# Build the bag-of-characters feature extractor. With
# `analyzer="char"` the `CountVectorizer` tallies individual
# characters rather than words, and `lowercase=False` stops it from
# folding upper case letters into lower case, so "A" and "a" stay
# separate features.
vectorizer = CountVectorizer(lowercase=False, analyzer="char")

# Fit the extractor on the reviews. `fit` is handed the single
# "review" column rather than the whole table: feature extraction is
# done one column at a time, so we select the column we want it
# applied to.
train_reviews = train_df["review"]
vectorizer.fit(train_reviews)

# Turn only the first review into its bag of characters, then
# convert the sparse matrix to a numpy array so the individual counts
# are visible.
first_review_counts = vectorizer.transform(train_reviews.head(1))
first_review_counts.toarray()
# Output:
# array([[ 0, 0, 0, 275, 0, 6, 0, 0, 0, 1, 5, 1, 1,
#         0, 0, 25, 4, 16, 8, 0, 2, 0, 0, 0, 0, 0,
#         0, 1, 2, 0, 0, 8, 0, 8, 0, 0, 2, 5, 0,
#         1, 0, 0, 3, 2, 3, 0, 0, 3, 5, 2, 1, 1,
#         0, 2, 1, 5, 1, 0, 3, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 99, 24, 23, 50, 166, 31, 21, 64, 84, 1,
#         5, 61, 27, 89, 92, 30, 1, 94, 90, 122, 29, 13, 17,
#         0, 35, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0]])