Aspect-based sentiment analysis
In this notebook, we will demonstrate how to use Curator to distill capabilities from a large language model into a much smaller 8B-parameter model.
We will use the Yelp restaurant reviews dataset to train an aspect-based sentiment analysis model: we generate a synthetic annotation dataset with Curator and fine-tune a model using Together's fine-tuning API.
Example input:
The food was good, but the service was slow.
Example output:
{
"food_sentiment": "Positive",
"service_sentiment": "Negative"
}
!pip install bespokelabs-curator datasets together
from bespokelabs import curator
from datasets import load_dataset
from together import Together
import os
import json
import getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
os.environ["TOGETHER_API_KEY"] = getpass.getpass("Enter your Together API key: ")
# We use the Curator Viewer to visualize the data quickly.
# You can comment this out if you don't want to use it.
os.environ["CURATOR_VIEWER"] = "1"
The data curation process is straightforward: we prompt the model to analyze each review and output the sentiment for each aspect.
Note that we are not using structured outputs here, because the same prompt/Curator block will also be used to evaluate the base model we fine-tune below (Llama-3.1-8B-Instruct), and many small models don't support structured outputs. Instead, we ask the model to emit JSON inside a fenced code block and parse it ourselves.
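For reference, if every model in the pipeline supported structured outputs, you could instead hand Curator a Pydantic schema. A minimal sketch, not used in this notebook, and assuming Curator's response_format hook:

from pydantic import BaseModel

# Hypothetical schema for structured outputs. We skip this here because the
# 8B student model may not support schema-constrained decoding.
class AspectSentiments(BaseModel):
    food_sentiment: str
    service_sentiment: str
    ambience_sentiment: str
    price_sentiment: str
    overall_sentiment: str

# With a supporting model, the curator class could declare:
# class StructuredSentimentCurator(curator.LLM):
#     response_format = AspectSentiments
#     ...  # prompt()/parse() then receive AspectSentiments objects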
PROMPT = """You are a sentiment analysis expert specializing in restaurant reviews. You need to analyze the sentiment of the given restaurant review.
Analyze the review for the following specific aspects:
1. Food: Quality, taste, presentation, menu variety, etc.
2. Service: Staff behavior, responsiveness, professionalism, etc.
3. Ambience: Atmosphere, decor, comfort, noise level, etc.
4. Price: Value for money, affordability, etc.
5. Overall: General impression of the restaurant experience
For each aspect, classify the sentiment as exactly one of the following:
- Positive: The review expresses satisfaction or praise
- Negative: The review expresses dissatisfaction or criticism
- Neutral: The review is balanced or doesn't mention the aspect
If an aspect is not mentioned in the review, classify it as Neutral.
Output the sentiment for each aspect in the following format:
```json
{{
"food_sentiment": "Positive",
"service_sentiment": "Negative",
"ambience_sentiment": "Neutral",
"price_sentiment": "Positive",
"overall_sentiment": "Negative"
}}```
"""
class AspectBasedSentimentCurator(curator.LLM):
    def prompt(self, input: dict):
        # We could also return a plain prompt string:
        # return f"{PROMPT}\nThe review is: {input['text']}"
        return [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": f"The review is: {input['text']}"},
        ]

    def parse(self, input: dict, raw_response: str) -> dict:
        # Pull the JSON payload out of the fenced ```json block.
        try:
            response = raw_response.split("```json")[1].split("```")[0]
            response = json.loads(response)
        except (IndexError, json.JSONDecodeError):
            response = {}
        return {
            **input,
            "food_sentiment": response.get("food_sentiment", "None"),
            "service_sentiment": response.get("service_sentiment", "None"),
            "ambience_sentiment": response.get("ambience_sentiment", "None"),
            "price_sentiment": response.get("price_sentiment", "None"),
            "overall_sentiment": response.get("overall_sentiment", "None"),
        }
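If you expect messier completions (extra text around the fence, or no fence at all), a slightly more defensive extraction helper is easy to write. This is a hypothetical sketch, not part of the original notebook; it reuses the json import above:

import re

def extract_json_block(raw: str) -> dict:
    # Accept a ```json fence, a bare ``` fence, or raw JSON with no fence.
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    candidate = match.group(1) if match else raw.strip()
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return {}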
We will run this curator on the Yelp restaurant reviews dataset to generate aspect-based sentiment annotations for each review.
source_dataset = load_dataset("bespokelabs/yelp_restaurant_reviews", split="train")
# We can easily visualize the data using the Curator Viewer.
from bespokelabs.curator.utils import push_to_viewer
url = push_to_viewer(source_dataset)
Curator Viewer: ✨
https://curator.bespokelabs.ai/datasets/249dcc5c831f4563b5e7565465252ed8
annotated_dataset = AspectBasedSentimentCurator(
    "gpt-4o",
    generation_params={
        "temperature": 0.0,
    },
)(source_dataset)
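Optionally, spot-check one annotated row before training; the sentiment keys come from the parse method above:

# Print one review and its generated labels.
example = annotated_dataset[0]
print(example["text"][:200])
print({k: v for k, v in example.items() if k.endswith("_sentiment")})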
We will create a train/test split and use the curated dataset to fine-tune a smaller model.
# Rename the generated columns to *_gt ("gt" stands for ground truth).
for aspect in ["food", "service", "ambience", "price", "overall"]:
    annotated_dataset = annotated_dataset.rename_column(
        f"{aspect}_sentiment", f"{aspect}_sentiment_gt"
    )
train_dataset = annotated_dataset.select(range(int(len(annotated_dataset) * 0.9)))
# Use the remaining 10% as the test split, disjoint from the train split.
test_dataset = annotated_dataset.select(range(int(len(annotated_dataset) * 0.9), len(annotated_dataset)))
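A quick sanity check that the two splits are disjoint and cover the full dataset:

# The train and test splits should partition the annotated dataset.
assert len(train_dataset) + len(test_dataset) == len(annotated_dataset)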
def evaluate_sentiment(dataset):
    """
    Evaluates sentiment analysis models by comparing model output with ground truth.
    """
    aspects = ["food_sentiment", "service_sentiment", "ambience_sentiment", "price_sentiment", "overall_sentiment"]
    # Calculate accuracy for each aspect
    aspect_accuracies = {}
    for aspect in aspects:
        predictions = dataset[aspect]
        ground_truth = dataset[f"{aspect}_gt"]
        correct = sum(p == g for p, g in zip(predictions, ground_truth))
        aspect_accuracies[aspect] = correct / len(dataset) if len(dataset) > 0 else 0
    # Calculate overall accuracy (average of all aspects)
    overall_accuracy = sum(aspect_accuracies.values()) / len(aspect_accuracies) if aspect_accuracies else 0
    return {"overall_accuracy": overall_accuracy, "aspect_accuracies": aspect_accuracies}
small_model_output = AspectBasedSentimentCurator(
    "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    generation_params={
        "temperature": 0.0,
    },
    backend_params={
        "max_tokens_per_minute": 100000000,
    },
    backend="litellm",
)(test_dataset)
base_eval = evaluate_sentiment(small_model_output)
print(json.dumps(base_eval, indent=4))
Output
{
"overall_accuracy": 0.8252465483234716,
"aspect_accuracies": {
"food_sentiment": 0.873767258382643,
"service_sentiment": 0.9211045364891519,
"ambience_sentiment": 0.6903353057199211,
"price_sentiment": 0.7830374753451677,
"overall_sentiment": 0.8579881656804734
}
}
Above, we can see that the overall accuracy is 82.5%, and some per-aspect accuracies are weak (ambience is below 70%).
We will therefore use the curated dataset to fine-tune the 8B-parameter model.
def _format_response(data_point):
    return f"""
```json
{{
    "food_sentiment": "{data_point['food_sentiment_gt']}",
    "service_sentiment": "{data_point['service_sentiment_gt']}",
    "ambience_sentiment": "{data_point['ambience_sentiment_gt']}",
    "price_sentiment": "{data_point['price_sentiment_gt']}",
    "overall_sentiment": "{data_point['overall_sentiment_gt']}"
}}
```
"""
finetuning_dataset = []
for data_point in train_dataset:
    finetuning_dataset.append({
        "messages": [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": f"The review is: {data_point['text']}"},
            {"role": "assistant", "content": _format_response(data_point)},
        ],
    })
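Optionally, print one training example to confirm the chat format before uploading:

# Each example is a chat transcript: system prompt, user review, assistant JSON labels.
print(json.dumps(finetuning_dataset[0], indent=2)[:600])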
# Write the training data to a local JSONL file.
with open("finetuning_dataset.jsonl", "w") as f:
    for data_point in finetuning_dataset:
        f.write(json.dumps(data_point) + "\n")

# Upload the file to Together.
client = Together()
file = client.files.upload("finetuning_dataset.jsonl")
fine_tune_response = client.fine_tuning.create(
    training_file=file.id,
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",
    n_epochs=3,
    suffix="-aspect-based-sentiment-analysis-lora",
    lora=True,
    lora_r=64,
    wandb_api_key=os.environ.get("WANDB_API_KEY", None),
)
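Instead of the CLI command below, you can also poll the job from Python. A minimal sketch, assuming the Together SDK's fine_tuning.retrieve call and its status field:

import time

# Poll until the fine-tuning job reaches a completed status.
job_id = fine_tune_response.id
while True:
    job = client.fine_tuning.retrieve(job_id)
    print(f"{job_id}: {job.status}")
    if "completed" in str(job.status).lower():
        break
    time.sleep(60)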
# Wait until job is completed
!together fine-tuning list-events ft-xyz # paste your job ID here
|    | Message                                           | Type                                         | Created At                 | Hash   |
+====+===================================================+==============================================+============================+========+
| 0 | Fine tune request created | FinetuneEventType.JOB_PENDING | 2025-04-02 04:48:37.014000 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 1 | Job started at Wed Apr 2 04:49:08 UTC 2025 | FinetuneEventType.JOB_START | 2025-04-02 04:49:08 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 2 | Model data downloaded for togethercomputer/Meta- | FinetuneEventType.MODEL_DOWNLOAD_COMPLETE | 2025-04-02 04:49:10 | |
| | Llama-3.1-8B-Instruct-Reference__TOG__FT at Wed | | | |
| | Apr 2 04:49:10 UTC 2025 | | | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 3 | Data downloaded for togethercomputer/Meta- | FinetuneEventType.TRAINING_DATA_DOWNLOADING | 2025-04-02 04:50:21 | |
| | Llama-3.1-8B-Instruct-Reference__TOG__FT at | | | |
| | $2025-04-02T04:50:21.782349 | | | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 4 | Training started for model togethercomputer/Meta- | FinetuneEventType.TRAINING_START | 2025-04-02 04:52:05 | |
| | Llama-3.1-8B-Instruct-Reference__TOG__FT | | | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 5 | Epoch completed, at step 31 | FinetuneEventType.EPOCH_COMPLETE | 2025-04-02 04:54:24 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 6 | Epoch completed, at step 62 | FinetuneEventType.EPOCH_COMPLETE | 2025-04-02 04:56:42 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 7 | Epoch completed, at step 93 | FinetuneEventType.EPOCH_COMPLETE | 2025-04-02 04:59:16 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 8 | Training completed for togethercomputer/Meta- | FinetuneEventType.TRAINING_COMPLETE | 2025-04-02 04:59:37 | |
| | Llama-3.1-8B-Instruct-Reference__TOG__FT at Wed | | | |
| | Apr 2 04:59:36 UTC 2025 | | | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 9 | Uploading output model | FinetuneEventType.MODEL_UPLOADING | 2025-04-02 05:00:20 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 10 | Compressing output model | FinetuneEventType.MODEL_COMPRESSING | 2025-04-02 05:00:39 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 11 | Model compression complete | FinetuneEventType.MODEL_COMPRESSION_COMPLETE | 2025-04-02 05:00:58 | |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
| 12 | Model upload complete                             | FinetuneEventType.MODEL_UPLOAD_COMPLETE      | 2025-04-02 05:03:00        |        |
+----+---------------------------------------------------+----------------------------------------------+----------------------------+--------+
# Run the fine-tuned model on the test dataset
ft_output = AspectBasedSentimentCurator(
    # Replace with the model ID of your fine-tuned model.
    # You can find it here: https://api.together.xyz/models
    "together_ai/mahesh_bespoke/Meta-Llama-3.1-8B-Instruct-Reference--aspect-based-sentiment-analysis-lora-xyz",
    generation_params={
        "temperature": 0.0,
    },
    backend_params={
        "max_tokens_per_minute": 100000000,
    },
    backend="litellm",  # same backend as the base-model run above
)(test_dataset)
ft_eval = evaluate_sentiment(ft_output)
print(json.dumps(ft_eval, indent=4))
Output
{
"overall_accuracy": 0.9222879684418146,
"aspect_accuracies": {
"food_sentiment": 0.9368836291913215,
"service_sentiment": 0.9428007889546351,
"ambience_sentiment": 0.8856015779092702,
"price_sentiment": 0.8777120315581854,
"overall_sentiment": 0.9684418145956607
}
}
# Compare the results
import pandas as pd
from IPython.display import display

base_model_results = base_eval
fine_tuned_results = ft_eval

# Create a comparison table
comparison_data = {
    "Metric": ["Overall Accuracy"] + [k.replace("_", " ").title() for k in base_model_results["aspect_accuracies"].keys()],
    "Base Model": [base_model_results["overall_accuracy"]] + list(base_model_results["aspect_accuracies"].values()),
    "Fine-tuned Model": [fine_tuned_results["overall_accuracy"]] + list(fine_tuned_results["aspect_accuracies"].values()),
}

# Create and display the DataFrame
comparison_df = pd.DataFrame(comparison_data)
pct_improvement = (comparison_df["Fine-tuned Model"] - comparison_df["Base Model"]) / comparison_df["Base Model"] * 100
comparison_df["Percentage improvement"] = pct_improvement.apply(lambda x: f"{x:.2f}%")
display(comparison_df.style.format({
    "Base Model": "{:.3f}",
    "Fine-tuned Model": "{:.3f}",
}).set_caption("Model Performance Comparison"))
Model Performance Comparison

|   | Metric             | Base Model | Fine-tuned Model | Percentage improvement |
|---|--------------------|------------|------------------|------------------------|
| 0 | Overall Accuracy   | 0.825      | 0.922            | 11.76%                 |
| 1 | Food Sentiment     | 0.874      | 0.937            | 7.22%                  |
| 2 | Service Sentiment  | 0.921      | 0.943            | 2.36%                  |
| 3 | Ambience Sentiment | 0.690      | 0.886            | 28.29%                 |
| 4 | Price Sentiment    | 0.783      | 0.878            | 12.09%                 |
| 5 | Overall Sentiment  | 0.858      | 0.968            | 12.87%                 |
We can see that the fine-tuned model achieves higher overall accuracy and better accuracy on every individual aspect. It is also roughly 13.9x cheaper than the teacher model ($0.18 per million tokens for the 8B model on together.ai vs. $2.50 for GPT-4o). As next steps, we could rerun with a larger dataset and better hyperparameter settings to fully match GPT-4o's performance.
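The cost ratio is simple arithmetic (prices as quoted above; check current rates before relying on them):

# Back-of-the-envelope cost comparison using per-million-token prices.
gpt4o_price = 2.50      # USD per million tokens (GPT-4o)
llama_8b_price = 0.18   # USD per million tokens (8B model on Together)
print(f"{gpt4o_price / llama_8b_price:.1f}x cheaper")  # ~13.9x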