Evaluate standard tasks with my data
With Unitxt you can evaluate any model on your own data in just a few lines of code:
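Unitxt is available on PyPI; if you have not installed it yet:

pip install unitxt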
# Import required components
from unitxt import evaluate, create_dataset
from unitxt.blocks import Task, InputOutputTemplate
from unitxt.inference import HFAutoModelInferenceEngine

# Question-answer dataset
data = [
    {"question": "What is the capital of Texas?", "answer": "Austin"},
    {"question": "What is the color of the sky?", "answer": "Blue"},
]

# Define the task and evaluation metric
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)

# Create a template to format inputs and outputs
template = InputOutputTemplate(
    instruction="Answer the following question.",
    input_format="{question}",
    output_format="{answer}",
    postprocessors=["processors.lower_case"],
)

# Prepare the dataset
dataset = create_dataset(
    task=task,
    template=template,
    format="formats.chat_api",
    test_set=data,
    split="test",
)

# Set up the model (supports Hugging Face, WatsonX, OpenAI, etc.)
model = HFAutoModelInferenceEngine(
    model_name="Qwen/Qwen1.5-0.5B-Chat", max_new_tokens=32
)

# Generate predictions and evaluate
predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

# Print results
print("Global Results:\n", results.global_scores.summary)
print("Instance Results:\n", results.instance_scores.summary)