# Utility Libraries
import re
import numpy as np
import pandas as pd
from collections import Counter

# Plotting Libraries
import matplotlib.pyplot as plt

# nltk Libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# sklearn Libraries
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Transformer Libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

c:\Users\tjmaz\anaconda3\envs\PyTorchGPU\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

# Load the review text and drop duplicates while keeping the file's row order.
# A set-based dedupe would reorder the reviews differently on every interpreter
# run (string hashing is randomised), which makes the clustering results below
# non-reproducible even though KMeans is seeded. Missing reviews are dropped
# because the regex-based preprocessing only accepts strings.
comments = pd.read_csv("Restaurant_Reviews.tsv", sep="\t")["Review"]
comments = comments.dropna().astype(str).drop_duplicates().tolist()
comments[:10]

['I hate to disagree with my fellow Yelpers, but my husband and I were so disappointed with this place.',
 "Just don't know why they were so slow.",
 "Best fish I've ever had in my life!",
 "I've lived here since 1979 and this was the first (and last) time I've stepped foot into this place.",
 'Sauce was tasteless.',
 'first time there and might just be the last.',
 'The food, amazing.',
 'We waited for forty five minutes in vain.',
 'Now this dish was quite flavourful.',
 'Very Very Disappointed ordered the $35 Big Bay Plater.']

def preprocess_comments(comments):
    """Return a cleaned copy of every comment, in the same order.

    Each comment is processed as follows:
      1. URL-like tokens (starting with ``http``, ``www`` or ``https``) are removed.
      2. Every character that is not an ASCII letter or whitespace is removed.
         Apostrophes are removed too, so contractions collapse into one token
         (e.g. "don't" becomes "dont").
      3. The text is lower-cased and tokenised with ``word_tokenize``.
      4. English stop words are discarded and the rest is lemmatized.
      5. The surviving tokens are joined with single spaces.
    """
    english_stop_words = set(stopwords.words("english"))
    wordnet = WordNetLemmatizer()

    def clean_comment(comment):
        without_urls = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)
        letters_only = re.sub(r"[^a-zA-Z\s]", '', without_urls)
        kept = []
        for token in word_tokenize(letters_only.lower()):
            if token in english_stop_words:
                continue
            kept.append(wordnet.lemmatize(token))
        return ' '.join(kept)

    return list(map(clean_comment, comments))
processed_comments = preprocess_comments(comments)
processed_comments[:10]

['hate disagree fellow yelpers husband disappointed place',
 'dont know slow',
 'best fish ive ever life',
 'ive lived since first last time ive stepped foot place',
 'sauce tasteless',
 'first time might last',
 'food amazing',
 'waited forty five minute vain',
 'dish quite flavourful',
 'disappointed ordered big bay plater']

# Encode every preprocessed comment with the all-MiniLM-L6-v2 sentence-transformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(processed_comments)

# Two-component PCA projection, used only as plot coordinates; the clustering
# functions below are fitted on the full `embeddings`.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

def get_sentiments(comments):
    """Return the VADER ``compound`` score of each comment, in input order.

    Every item is converted with ``str`` before scoring, so non-string entries
    are scored on their string form.
    """
    analyzer = SentimentIntensityAnalyzer()
    return [analyzer.polarity_scores(str(text))["compound"] for text in comments]
sentiments = get_sentiments(comments)

def perform_and_plot_kmeans(original_data, pca_data, k):
    """Fit K-Means with ``k`` clusters and show the result on a 2-D scatter plot.

    Args:
        original_data: Feature matrix the model is fitted on.
        pca_data: Array whose first two columns are used as x/y plot
            coordinates; rows are coloured by their cluster label.
        k: Number of clusters.

    Returns:
        ``(kmeans, cluster_labels)``: the fitted estimator and the label
        assigned to each row of ``original_data``.
    """
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(original_data)

    xs, ys = pca_data[:, 0], pca_data[:, 1]
    plt.figure(figsize=(8, 6))
    points = plt.scatter(xs, ys, c=cluster_labels, cmap='viridis')
    plt.title(f'2D PCA Projection of Word Embeddings with K-Means Clustering (k={k})')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    # legend_elements() yields (handles, labels), one entry per cluster colour.
    plt.legend(*points.legend_elements(), title="Clusters", loc='upper right')
    plt.grid(True)
    plt.show()

    return kmeans, cluster_labels

def get_top_n_closest_comments_to_centers(embeddings, cluster_labels, comments, kmeans, n_comments):
    """Rank each cluster's comments by distance to the cluster centre and print the closest.

    Args:
        embeddings: 2-D array-like with one row per comment.
        cluster_labels: Cluster index of each row; a list or an array is accepted.
        comments: Comments aligned row-for-row with ``embeddings``.
        kmeans: Fitted estimator exposing ``cluster_centers_``.
        n_comments: How many of the closest comments to print per cluster
            (values below zero print nothing).

    Returns:
        A list with one entry per cluster centre. Entry ``i`` holds ALL comments
        labelled ``i``, ordered from nearest to farthest from centre ``i``
        (an empty list when no comment carries that label).
    """
    # Coerce to arrays so a plain list of labels compares element-wise; with a
    # list, `labels == cluster` is a single False and every cluster looks empty.
    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)
    cluster_centers = kmeans.cluster_centers_

    all_sorted_comments = []
    for cluster, center in enumerate(cluster_centers):
        cluster_indices = np.flatnonzero(cluster_labels == cluster)
        # Exact Euclidean distance of every member to its own centre.
        distances = np.linalg.norm(embeddings[cluster_indices] - center, axis=1)
        # Stable sort keeps equidistant comments in their original order.
        nearest_first = cluster_indices[np.argsort(distances, kind="stable")]
        sorted_comments = [comments[idx] for idx in nearest_first]
        all_sorted_comments.append(sorted_comments)

        print(f"Top {n_comments} closest comments to Cluster {cluster + 1} center:")
        for comment in sorted_comments[:max(n_comments, 0)]:
            print(f"- {comment}")
        print("")
    return all_sorted_comments

def summarize_comments(comments, cluster_labels, min_length = 1, max_length = 10):
    """Produce one short t5-base summary per cluster.

    Args:
        comments: Sequence indexed by cluster id; ``comments[c]`` is the list of
            comment strings for cluster ``c``. Only the leading 1/50th of each
            list is summarised (at least one comment).
        cluster_labels: Cluster id of every comment; only the distinct ids are used.
        min_length: Minimum summary length passed to the summarizer.
        max_length: Maximum summary length passed to the summarizer.

    Returns:
        A list of summary strings ordered by ascending cluster id, so that
        position ``i`` describes cluster ``i``. A cluster whose selected text
        is empty yields ``""``.
    """
    # Build the model once: loading it inside the loop repeats the expensive
    # initialisation for every cluster without changing the result.
    model_name = "t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

    summarized_comments = []
    # Sorted so the output order does not depend on set iteration order.
    for cluster in sorted(set(cluster_labels)):
        selected_comments = comments[cluster]
        # `len // 50` is 0 for clusters with fewer than 50 comments; always
        # keep at least one so the summarizer never receives empty input.
        keep = max(1, len(selected_comments) // 50)
        input_text = " ".join(selected_comments[:keep]).strip()
        if not input_text:
            summarized_comments.append("")
            continue
        # Round-trip through the tokenizer to cut the text to at most 512 tokens.
        encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        input_text = tokenizer.decode(encoded_input["input_ids"][0], skip_special_tokens=True)
        summary = summarizer(input_text, max_length=max_length, min_length=min_length, do_sample=False)
        summarized_comments.append(summary[0]['summary_text'])
    return summarized_comments

def plot_and_print_comments(embeddings, embeddings_2d, comments, k_clusters, n_comments = 10):
    """Cluster the embeddings, print representative comments and summarise each cluster.

    Chains three steps: K-Means clustering with a scatter plot, printing the
    ``n_comments`` comments closest to each cluster centre, and summarising the
    per-cluster comment lists.

    Returns:
        ``(kmeans, cluster_labels, sorted_comments, summarized_comments)``.
    """
    fitted_model, labels = perform_and_plot_kmeans(embeddings, embeddings_2d, k_clusters)
    ranked_by_cluster = get_top_n_closest_comments_to_centers(
        embeddings, labels, comments, fitted_model, n_comments
    )
    summaries = summarize_comments(ranked_by_cluster, labels)
    return fitted_model, labels, ranked_by_cluster, summaries

def plot_cluster_bar_chart(summarized_comments, cluster_labels, sentiments):
    """Draw a grouped bar chart of cluster size and mean sentiment.

    For each cluster two bars are shown: the percentage of all comments that
    fall in the cluster, and the cluster's mean sentiment multiplied by 100.
    The x tick labels are the entries of ``summarized_comments``.

    Args:
        summarized_comments: One label string per cluster, indexed by cluster id.
        cluster_labels: Cluster id of every comment; ids are looked up as
            ``0 .. number_of_distinct_labels - 1``.
        sentiments: Sentiment score of every comment, aligned with
            ``cluster_labels``.
    """
    tick_labels = list(summarized_comments)

    # Share of comments per cluster.
    label_counts = Counter(cluster_labels)
    clusters = list(range(len(label_counts)))
    counts = [label_counts[cluster] for cluster in clusters]
    total_count = sum(counts)
    percentages = [(count / total_count) * 100 for count in counts]

    # Mean sentiment per cluster, scaled by 100 to share the percentage axis.
    scores_by_cluster = {cluster: [] for cluster in clusters}
    for label, score in zip(cluster_labels, sentiments):
        scores_by_cluster[label].append(score)
    avg_sentiments = []
    for cluster in clusters:
        scores = scores_by_cluster[cluster]
        avg_sentiments.append((sum(scores) / len(scores)) * 100)

    x = np.arange(len(clusters))
    width = 0.4
    left_positions = x - width/2
    right_positions = x + width/2

    plt.figure(figsize=(12, 6))
    plt.bar(left_positions, percentages, width, label='Percentage of Comments', color='skyblue')
    plt.bar(right_positions, avg_sentiments, width, label='Average Sentiment', color='orange')

    plt.xlabel('Cluster')
    plt.ylabel('Values')
    plt.title('Cluster Analysis: Percentage and Average Sentiment')
    plt.xticks(x, tick_labels, rotation=30)
    plt.legend()

    # Value annotations just above each bar.
    for left, right, percentage, avg_sentiment in zip(left_positions, right_positions, percentages, avg_sentiments):
        plt.text(left, percentage + 0.5, f'{percentage:.2f}%', ha='center', va='bottom', fontsize=10)
        plt.text(right, avg_sentiment + 0.05, f'{avg_sentiment:.2f}%', ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()

# k = 2: cluster the embeddings, print the comments nearest each centre and
# summarise each cluster, then chart cluster share and mean sentiment.
kmeans_2, cluster_labels_2, sorted_comments_2, summarized_comments_2 = plot_and_print_comments(embeddings, embeddings_2d, comments, k_clusters=2)
plot_cluster_bar_chart(summarized_comments_2, cluster_labels_2, sentiments)

Top 10 closest comments to Cluster 1 center:
- As for the service, I thought it was good.
- Terrible service!
- Service was fantastic.
- And service was super friendly.
- this place is good.
- This place is great!!!!!!!!!!!!!!
- The service was poor and thats being nice.
- This place is amazing!
- Cant say enough good things about this place.
- Awful service.

Top 10 closest comments to Cluster 2 center:
- The food, amazing.
- This is an Outstanding little restaurant with some of the Best Food I have ever tasted.
- Great service and food.
- Food was great and so was the serivce!
- Now this dish was quite flavourful.
- Service was exceptional and food was a good as all the reviews.
- Fantastic food!
- Great food.
- Everything was good and tasty!
- The food was excellent and service was very good.

Device set to use cuda:0
Device set to use cuda:0

# k = 3: cluster the embeddings, print the comments nearest each centre and
# summarise each cluster, then chart cluster share and mean sentiment.
kmeans_3, cluster_labels_3, sorted_comments_3, summarized_comments_3 = plot_and_print_comments(embeddings, embeddings_2d, comments, k_clusters=3)
plot_cluster_bar_chart(summarized_comments_3, cluster_labels_3, sentiments)

Top 10 closest comments to Cluster 1 center:
- As for the service, I thought it was good.
- Terrible service!
- I can't wait to go back.
- Won't go back.
- We won't be going back.
- Service sucks.
- Very poor service.
- I wouldn't return.
- Service stinks here!
- We won't be going back anytime soon!

Top 10 closest comments to Cluster 2 center:
- The food, amazing.
- Now this dish was quite flavourful.
- Food was great and so was the serivce!
- Great service and food.
- This is an Outstanding little restaurant with some of the Best Food I have ever tasted.
- Fantastic food!
- Great food.
- Service was exceptional and food was a good as all the reviews.
- Everything was good and tasty!
- Food was delicious!

Top 10 closest comments to Cluster 3 center:
- This place is amazing!
- This place is pretty good, nice little vibe in the restaurant.
- Great place fo take out or eat in.
- This place is great!!!!!!!!!!!!!!
- This wonderful experience made this place a must-stop whenever we are in town again.
- this place is good.
- I would not recommend this place.
- This is a GREAT place to eat!
- Wow... Loved this place.
- Pretty awesome place.

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0

# k = 4: cluster the embeddings, print the comments nearest each centre and
# summarise each cluster, then chart cluster share and mean sentiment.
kmeans_4, cluster_labels_4, sorted_comments_4, summarized_comments_4 = plot_and_print_comments(embeddings, embeddings_2d, comments, k_clusters=4)
plot_cluster_bar_chart(summarized_comments_4, cluster_labels_4, sentiments)

Top 10 closest comments to Cluster 1 center:
- Service is quick and friendly.
- And service was super friendly.
- Service was fine and the waitress was friendly.
- As for the service, I thought it was good.
- Waitress was a little slow in service.
- The service was poor and thats being nice.
- But the service was beyond bad.
- Terrible service!
- Very poor service.
- Service was good and the company was better!

Top 10 closest comments to Cluster 2 center:
- Now this dish was quite flavourful.
- Everything was good and tasty!
- Everything was fresh and delicious!
- From what my dinner companions told me...everything was very fresh with nice texture and taste.
- Extremely Tasty!
- It lacked flavor, seemed undercooked, and dry.
- Not much flavor to them, and very poorly constructed.
- To my disbelief, each dish qualified as the worst version of these foods I have ever tasted.
- It was extremely "crumby" and pretty tasteless.
- It's too bad the food is so damn generic.

Top 10 closest comments to Cluster 3 center:
- I can't wait to go back.
- Won't ever go here again.
- this place is good.
- We loved the place.
- This place is amazing!
- This place is great!!!!!!!!!!!!!!
- Wow... Loved this place.
- This place has it!
- Cant say enough good things about this place.
- I will never go back to this place and will never ever recommended this place to anyone!

Top 10 closest comments to Cluster 4 center:
- The food was excellent and service was very good.
- Great service and food.
- Service was exceptional and food was a good as all the reviews.
- Great food and great service in a clean and friendly setting.
- This is an Outstanding little restaurant with some of the Best Food I have ever tasted.
- Good food , good service .
- Food was great and so was the serivce!
- Great food and awesome service!
- The food, amazing.
- Phenomenal food, service and ambiance.

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0

# k = 5: cluster the embeddings, print the comments nearest each centre and
# summarise each cluster, then chart cluster share and mean sentiment.
kmeans_5, cluster_labels_5, sorted_comments_5, summarized_comments_5 = plot_and_print_comments(embeddings, embeddings_2d, comments, k_clusters=5)
plot_cluster_bar_chart(summarized_comments_5, cluster_labels_5, sentiments)

Top 10 closest comments to Cluster 1 center:
- We got sitting fairly fast, but, ended up waiting 40 minutes just to place our order, another 30 minutes before the food arrived.
- At least 40min passed in between us ordering and the food arriving, and it wasn't that busy.
- This is was due to the fact that it took 20 minutes to be acknowledged, then another 35 minutes to get our food...and they kept forgetting things.
- -Drinks took close to 30 minutes to come out at one point.
- We sat another ten minutes and finally gave up and left.
- We literally sat there for 20 minutes with no one asking to take our order.
- The real disappointment was our waiter.
- I kept looking at the time and it had soon become 35 minutes, yet still no food.
- Similarly, the delivery man did not say a word of apology when our food was 45 minutes late.
- I also decided not to send it back because our waitress looked like she was on the verge of having a heart attack.

Top 10 closest comments to Cluster 2 center:
- Now this dish was quite flavourful.
- Everything was good and tasty!
- Everything was fresh and delicious!
- Extremely Tasty!
- From what my dinner companions told me...everything was very fresh with nice texture and taste.
- Not much flavor to them, and very poorly constructed.
- It lacked flavor, seemed undercooked, and dry.
- To my disbelief, each dish qualified as the worst version of these foods I have ever tasted.
- It was extremely "crumby" and pretty tasteless.
- It's too bad the food is so damn generic.

Top 10 closest comments to Cluster 3 center:
- This place is amazing!
- this place is good.
- This place is great!!!!!!!!!!!!!!
- I will never go back to this place and will never ever recommended this place to anyone!
- Wow... Loved this place.
- We loved the place.
- Cant say enough good things about this place.
- I can't wait to go back.
- Won't ever go here again.
- This place has it!

Top 10 closest comments to Cluster 4 center:
- The food was excellent and service was very good.
- Great service and food.
- Service was exceptional and food was a good as all the reviews.
- This is an Outstanding little restaurant with some of the Best Food I have ever tasted.
- Great food and great service in a clean and friendly setting.
- Good food , good service .
- Food was great and so was the serivce!
- The food, amazing.
- Great food and awesome service!
- Phenomenal food, service and ambiance.

Top 10 closest comments to Cluster 5 center:
- Service is quick and friendly.
- And service was super friendly.
- As for the service, I thought it was good.
- The service was poor and thats being nice.
- But the service was beyond bad.
- Service was fantastic.
- Service was fine and the waitress was friendly.
- Terrible service!
- Service was good and the company was better!
- Very poor service.

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Your max_length is set to 10, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)

def plot_elbow_curve(data, max_k=10):
    """Plot K-Means inertia for every k from 1 to ``max_k`` (inclusive).

    Args:
        data: Feature matrix to cluster.
        max_k: Largest number of clusters to try.
    """
    ks = range(1, max_k + 1)
    # One seeded fit per k; inertia_ is read from the fitted estimator.
    inertia = [KMeans(n_clusters=k, random_state=42).fit(data).inertia_ for k in ks]

    plt.figure(figsize=(8, 6))
    plt.plot(ks, inertia, marker='o', color='b', linestyle='--')
    plt.title('Elbow Curve for K-Means Clustering')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.xticks(ks)
    plt.grid(True)
    plt.show()

def plot_silhouette_scores(data, max_k=10):
    """Plot the silhouette score of K-Means for every k from 2 to ``max_k`` (inclusive).

    Args:
        data: Feature matrix to cluster.
        max_k: Largest number of clusters to try.
    """
    ks = range(2, max_k + 1)

    def score_for(k):
        # Seeded fit, then score the resulting partition on the same data.
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(data)
        return silhouette_score(data, labels)

    silhouette_scores = [score_for(k) for k in ks]

    plt.figure(figsize=(8, 6))
    plt.plot(ks, silhouette_scores, marker='o', color='b', linestyle='--')
    plt.title('Silhouette Scores for K-Means Clustering')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.xticks(ks)
    plt.grid(True)
    plt.show()

def gap_statistic(X, max_k=10, n_refs=10, random_state=42):
    """Plot the gap statistic of K-Means for every k from 1 to ``max_k`` (inclusive).

    For each k the gap is ``mean(log(W_ref)) - log(W_data)``, where ``W`` is the
    K-Means inertia and the reference datasets are drawn uniformly inside the
    per-feature bounding box of ``X``. A larger gap means the data clusters more
    tightly than structureless data would.

    Args:
        X: 2-D feature matrix to cluster.
        max_k: Largest number of clusters to try.
        n_refs: Number of uniform reference datasets averaged per k.
        random_state: Seed for both the reference sampling and K-Means, so
            repeated calls draw the same curve.
    """
    data = np.asarray(X, dtype=float)
    rng = np.random.default_rng(random_state)
    lower, upper = data.min(axis=0), data.max(axis=0)
    # Guards log(0) when a fit reaches zero inertia.
    tiny = np.finfo(float).tiny

    def log_inertia(points, k):
        fitted = KMeans(n_clusters=k, random_state=random_state).fit(points)
        return np.log(max(fitted.inertia_, tiny))

    ks = range(1, max_k + 1)
    gaps = np.zeros(len(ks))
    for position, k in enumerate(ks):
        # Expected log-dispersion under the "no cluster structure" reference.
        reference_logs = [
            log_inertia(rng.uniform(lower, upper, size=data.shape), k)
            for _ in range(n_refs)
        ]
        gaps[position] = np.mean(reference_logs) - log_inertia(data, k)

    plt.plot(ks, gaps)
    plt.title('Gap Statistic for Optimal Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Gap Statistic')
    plt.show()

def plot_davies_bouldin(data, max_k=10):
    """Plot the Davies-Bouldin index of K-Means for every k from 2 to ``max_k`` (inclusive).

    Args:
        data: Feature matrix to cluster.
        max_k: Largest number of clusters to try.
    """
    ks = range(2, max_k + 1)

    def index_for(k):
        # Seeded fit, then score the resulting partition on the same data.
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(data)
        return davies_bouldin_score(data, labels)

    davies_bouldin_scores = [index_for(k) for k in ks]

    plt.plot(ks, davies_bouldin_scores, marker='o')
    plt.title('Davies-Bouldin Index for Optimal Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Davies-Bouldin Index')
    plt.grid(True)
    plt.show()

def plot_calinski_harabasz(data, max_k=10):
    """Plot the Calinski-Harabasz index of K-Means for every k from 2 to ``max_k`` (inclusive).

    Args:
        data: Feature matrix to cluster.
        max_k: Largest number of clusters to try.
    """
    ks = range(2, max_k + 1)

    def index_for(k):
        # Seeded fit, then score the resulting partition on the same data.
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(data)
        return calinski_harabasz_score(data, labels)

    calinski_harabasz_scores = [index_for(k) for k in ks]

    plt.plot(ks, calinski_harabasz_scores, marker='o')
    plt.title('Calinski-Harabasz Index for Optimal Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Calinski-Harabasz Index')
    plt.grid(True)
    plt.show()

# Run every cluster-count diagnostic on the full sentence embeddings
# (not the 2-D PCA projection), each with its default max_k.
plot_elbow_curve(embeddings)
plot_silhouette_scores(embeddings)
gap_statistic(embeddings)
plot_davies_bouldin(embeddings)
plot_calinski_harabasz(embeddings)

Clustering Customer Comments

Introduction

Load in the Comments

Preprocess the Comments

Load in the Transformer Model

Sentiment Extraction

Optimal Number of Clusters - Qualitative Analysis

Analysis on k = 2 Clusters

Cluster 1: Service Quality Focused

Cluster 2: Food Quality Focused

Analysis on k = 3 Clusters

Cluster 1: Service Quality Focused

Cluster 2: Overall Positive Experience

Cluster 3: Food Focused

Analysis on k = 4 Clusters

Cluster 1: Food Quality and Taste: Positive and Negative Feedback

Cluster 2: Service Experience: Praise and Criticism

Cluster 3: Overall Dining Experience: High Satisfaction with Food and Service

Cluster 4: Likelihood of Returning: Mixed Feelings About Future Visits

Analysis on k = 5 Clusters

Cluster 1: Mixed Service Experience

Cluster 2: Positive Experience with Food, Service, and Atmosphere

Cluster 3: Exceptional Food Quality and Consistency

Cluster 4: Mixed Food Experience

Cluster 5: Mixed Intentions About Returning

Qualitative Analysis Conclusion

Optimal Number of Clusters - Quantitative Analysis

1. Elbow Method

2. Silhouette Score

3. Gap Statistic

4. Davies-Bouldin Index

5. Calinski-Harabasz Index (Variance Ratio Criterion)