# Save $$$ on LLM inference

## Using batch mode

Most providers offer discounted pricing for requests submitted through their batch APIs (OpenAI's Batch API, for example, is 50% cheaper than synchronous calls). Curator can route requests through batch mode with a single flag:
```python
from bespokelabs import curator
from datasets import load_dataset

dataset = load_dataset("allenai/WildChat", split="train")
dataset = dataset.select(range(3_000))  # Select a subset of 3,000 samples

class WildChatReannotator(curator.LLM):
    """A reannotator for the WildChat dataset."""

    def prompt(self, input: dict) -> str:
        """Extract the first message from a conversation to use as the prompt."""
        return input["conversation"][0]["content"]

    def parse(self, input: dict, response: str) -> dict:
        """Parse the model response along with the input into the desired output format."""
        instruction = input["conversation"][0]["content"]
        return {"instruction": instruction, "new_response": response}

# Initialize the reannotator with batch processing
reannotator = WildChatReannotator(
    model_name="gpt-4o-mini",
    batch=True,  # Enable batch processing
    backend_params={"batch_size": 1_000},  # Specify batch size
)

# Run the pipeline; .dataset exposes the resulting Hugging Face dataset
reannotated_dataset = reannotator(dataset).dataset
```
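
Batch jobs run asynchronously: after submission, the provider may take up to its batch window (24 hours for OpenAI) to complete the job, and Curator waits for the results before returning. The returned object wraps a regular Hugging Face dataset, so you can inspect and persist it with the usual `datasets` methods. A minimal sketch, assuming the run above has completed; the output path is an arbitrary example:

```python
# Inspect one reannotated example (columns come from parse() above)
row = reannotated_dataset[0]
print(row["instruction"][:200])
print(row["new_response"][:200])

# Persist the annotated dataset to disk; the path is illustrative
reannotated_dataset.save_to_disk("wildchat_reannotated")
```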