Problem statement: Most restaurants ask customers for reviews, and based on those reviews a restaurant can improve customer satisfaction.
So reviews play a vital role in the successful growth of a restaurant.
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

# List every file under the Kaggle input directory so the available
# datasets are visible in the notebook output.
# NOTE(review): the notebook flattening lost the loop-body indentation in
# the original; restored here so the cell is valid Python again.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
1. Text Preprocessing
2. Text Visualization
3. Sentiment Analysis
4. Sentiment Modeling
# Libraries for visualization, NLP preprocessing, sentiment scoring,
# and classical ML modeling.
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud
# Silence library warnings and widen pandas output for notebook readability.
filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
# Render floats with two decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
1. Text Preprocessing
# Quick structural overview of the dataset.
# NOTE(review): df is loaded in an earlier cell not shown in this chunk;
# presumably it contains at least a "Review" text column — confirm upstream.
df.head()
df.info()
df.shape
Normalizing Case Folding
# Inspect the reviews after the case-folding step applied in a prior cell.
df.head()
Punctuations
Stopwords
It allows us to get rid of commonly used words.
Rarewords
We drop words according to their frequencies.
Tokenization
Tokenization breaks sentences into smaller parts (tokens).
Lemmatization / Stemming
Lemmatization/stemming is the process of reducing words to their root form.
2. Text Visualization
Calculate the term frequencies
# Rank terms by frequency, most frequent first.
# NOTE(review): the tf frequency table is built in a cell not shown here.
tf.sort_values("tf", ascending = False)
Barplot
Wordcloud
# Render a word cloud of the concatenated review text with default settings.
text = " ".join(df.Review)
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Regenerate the word cloud with tighter rendering limits and a white
# background, then persist the image as a file in the working directory.
cloud_options = {
    "max_font_size": 50,
    "max_words": 100,
    "background_color": "white",
}
wordcloud = WordCloud(**cloud_options).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
wordcloud.to_file("wordcloud.png")
3. Sentiment Analysis
# NLTK's VADER analyzer; polarity_scores returns a dict of
# neg/neu/pos/compound scores for the given sentence.
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("The food was awesome")
df.head()
4. Sentiment Modeling
Create the target
Apply for all data
# Encode the string sentiment labels as integer class codes for modeling
# (LabelEncoder assigns codes in sorted label order).
df["sentiment_label"] = LabelEncoder().fit_transform(df["sentiment_label"])
Count Vectors
converts text to a term count vector.
# Bag-of-words: each document becomes a vector of raw term counts.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(X)
# FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
vectorizer.get_feature_names_out()[10:15]
X_count.toarray()[10:15]
TF-IDF
It is the weight factor calculated with the statistical method that shows the importance of a term in the document.
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features.
tf_idf_word_vectorizer = TfidfVectorizer()
X_tf_idf_word = tf_idf_word_vectorizer.fit_transform(X)
# 2- and 3-gram TF-IDF features.
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range=(2, 3))
# BUG FIX: the original called fit_transform on the word-level vectorizer
# here, so the n-gram feature matrix was never actually produced.
X_tf_idf_ngram = tf_idf_ngram_vectorizer.fit_transform(X)
Logistic Regression
# Fit a logistic regression on the word-level TF-IDF features and report
# 5-fold cross-validated accuracy.
log_model = LogisticRegression().fit(X_tf_idf_word, y)
accuracy_scores = cross_val_score(
    log_model, X_tf_idf_word, y, scoring="accuracy", cv=5
)
accuracy_scores.mean()

# Draw one random review (e.g. to score it with the fitted model later).
random_review = pd.Series(df["Review"].sample(1).values)
random_review
Random Forests
TF-IDF Word-Level
# Random-forest baselines on both TF-IDF feature sets, each evaluated
# with 5-fold cross-validation.
rf_model = RandomForestClassifier().fit(X_tf_idf_word, y)
word_level_cv = cross_val_score(rf_model, X_tf_idf_word, y, cv=5, n_jobs=-1)
print(word_level_cv.mean())

rf_model = RandomForestClassifier().fit(X_tf_idf_ngram, y)
ngram_level_cv = cross_val_score(rf_model, X_tf_idf_ngram, y, cv=5, n_jobs=-1)
print(ngram_level_cv.mean())
Hyperparameter Optimization
# Grid-search the random forest on the count features, refit with the best
# parameters, and report cross-validated accuracy / F1 / ROC-AUC.
rf_model = RandomForestClassifier(random_state=17)

# FIX: max_features="auto" was removed in scikit-learn 1.3;
# "sqrt" is the equivalent setting for classifiers.
rf_params = {"max_depth": [5, 8, None],
             "max_features": [5, 7, "sqrt"],
             "min_samples_split": [2, 5, 8, 20],
             "n_estimators": [100, 200, 500]}

rf_best_grid = GridSearchCV(rf_model,
                            rf_params,
                            cv=5,
                            n_jobs=-1,
                            verbose=True).fit(X_count, y)
rf_best_grid.best_params_

# Refit a final model with the winning hyperparameters.
rf_final = rf_model.set_params(**rf_best_grid.best_params_, random_state=17).fit(X_count, y)

cv_results = cross_validate(rf_final, X_count, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()
cv_results['test_f1'].mean()
cv_results['test_roc_auc'].mean()




