Evaluation
Datasets
Create and manage evaluation datasets and dataset runs across TypeScript, Python, and Java SDKs.
Datasets
Datasets store collections of input/expected-output pairs used to evaluate LLM pipelines. Each item has an input, an optional expectedOutput, and optional context and metadata.
Setup
import { AISDK } from '@browserstack/ai-sdk';
const testOps = new AISDK({
publicKey: process.env.AISDK_PUBLIC_KEY,
secretKey: process.env.AISDK_SECRET_KEY,
});
const datasets = testOps.datasets; // DatasetsClient
const datasetRuns = testOps.datasetRuns; // DatasetRuns
import os
from browserstack_ai_sdk import AISDK
client = AISDK(
public_key=os.environ["AISDK_PUBLIC_KEY"],
secret_key=os.environ["AISDK_SECRET_KEY"],
)
datasets = client.datasets; # datasets Client
datasetRuns = client.dataset_runs; # dataset_runs client
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.DatasetsClient;
import com.browserstack.aisdk.eval.model.*;
TestOps sdk = TestOps.fromEnv();
DatasetsClient datasets = sdk.datasets();
Create a Dataset
const dataset = await datasets.create(
'qa-golden-set',
'Golden set for QA evaluation',
{ owner: 'ml-team', version: '1.0' } // optional metadata
);
console.log(dataset.id); // dataset ID
console.log(dataset.name); // 'qa-golden-set'
dataset = client.datasets.create(
name="qa-dataset-v1",
description="Question-answer pairs for RAG evaluation",
metadata='{"domain": "customer-support"}',
)
print(dataset)
// Name only
DatasetResponse ds = datasets.create("my-dataset");
// With description
DatasetResponse ds = datasets.create("my-dataset", "Questions about Northern Lights");
// With metadata
DatasetResponse ds = datasets.create(
"my-dataset",
"Questions about Northern Lights",
Map.of("team", "nlp", "domain", "science")
);
System.out.println("Created: " + ds.getId() + " — " + ds.getName());
List Datasets
const result = await datasets.list(
1, // page (1-indexed)
20, // limit per page
'qa' // optional name filter
);
for (const ds of result.data) {
console.log(ds.id, ds.name);
}
result = client.datasets.list(page=1, limit=20)
for dataset in result["data"]:
    print(f"{dataset.get('name')}: {dataset.get('id')}")
Filter by name:
datasets = client.datasets.list(name="qa-dataset-v1")
// Default (page 1, limit 50)
ListDatasetsResponse list = datasets.list();
// With pagination
ListDatasetsResponse list = datasets.list(1, 20);
// Filtered by name
ListDatasetsResponse list = datasets.list(1, 20, "my-dataset");
list.getData().forEach(d -> System.out.println(d.getId() + " " + d.getName()));
Create Items
Add multiple items to a dataset at once. Each item has an input, optional expectedOutput, optional context, and optional metadata.
Expected CSV columns: input, expectedOutput, context, metadata, id, sourceTraceId, sourceObservationId, status.
await datasets.createItems({
datasetName: 'qa-golden-set',
items: [
{
input: { question: 'What is the capital of France?' },
expectedOutput: { answer: 'Paris' },
metadata: { difficulty: 'easy' },
},
{
input: { question: 'What is 2 + 2?' },
expectedOutput: { answer: '4' },
},
{
input: { question: 'Who wrote Hamlet?' },
expectedOutput: { answer: 'William Shakespeare' },
context: 'Classic English literature',
},
],
});
Import from CSV:
const result = await datasets.createItems({
datasetName: 'qa-golden-set',
fileUrl: '/path/to/dataset.csv',
});
console.log(`Imported ${result.itemCount} items`);
result = client.datasets.create_items(
dataset_name="qa-dataset-v1",
items=[
{
"input": {"question": "What is your return policy?"},
"expectedOutput": "Items can be returned within 30 days.",
"metadata": {"category": "returns"},
},
{
"input": {"question": "How do I track my order?"},
"expectedOutput": "Log in and visit the Orders page.",
},
],
)
print(f"Created {result['itemCount']} items")
Import from CSV:
result = client.datasets.create_items(
dataset_name="qa-golden-set",
file_url="/path/to/dataset.csv",
options={"batchSize": 50},
)
print(f"Imported {result['itemCount']} items")
Items are sent in batches of 100.
import com.browserstack.aisdk.eval.model.CreateDatasetItemRequest;
import java.util.List;
List<CreateDatasetItemRequest> items = List.of(
CreateDatasetItemRequest.builder()
.input("What causes Northern Lights?")
.expectedOutput("Solar wind particles interact with Earth's magnetic field...")
.context("Reference document: Aurora Borealis — NASA")
.build(),
CreateDatasetItemRequest.builder()
.input("How far is the Moon from Earth?")
.expectedOutput("Approximately 384,400 km on average.")
.build()
);
CreateDatasetItemsResponse result = datasets.createItems("my-dataset", items);
System.out.println("Added " + result.getItemCount() + " items");
Import from CSV:
CreateDatasetItemsResponse result = datasets.createItemsFromCsv(
"my-dataset",
"/path/to/test-cases.csv"
);
System.out.println("Imported " + result.getItemCount() + " items");
Dataset Runs
Dataset runs group execution results for a given dataset, linking traces to specific dataset items.
Create a Dataset Run
console.log('Creating dataset run...');
const datasetRun = await datasetRuns.create(
datasetName,
{ name: 'My Dataset Run', tag: tagName}
);
console.log(`Dataset run created with ID: ${datasetRun.id}`);
console.log(`Dataset run tag ID: ${datasetRun.tags[0].id}`);
run = client.dataset_runs.create(
dataset_name="qa-golden-set",
name="gpt-4o-run-1",
description="GPT-4o baseline evaluation",
tag="production-v1", # optional
)
print(f"Run ID: {run['id']}")
import com.browserstack.aisdk.eval.DatasetRunsClient;
DatasetRunsClient datasetRuns = sdk.datasetRuns();
DatasetRunResponse run = datasetRuns.create("my-dataset");
System.out.println("Run ID: " + run.getId());
Add Run Items
Add items to an existing dataset run.
await datasetRuns.createItems(
'qa-golden-set',
datasetRun.id,
[
{
input: { question: 'What is AI?' },
expectedOutput: { answer: 'AI is...' },
},
]
);
client.dataset_runs.create_items(
dataset_name="qa-golden-set",
dataset_run_id=run["id"],
items=[
{
"input": {"messages": [{"role": "user", "content": "What is machine learning?"}]},
"expectedOutput": {"should_contain": ["algorithm", "data", "patterns"]},
"traceId": "trace-id-abc",
"observationId": "gen-id-xyz", # optional
},
],
)
import com.browserstack.aisdk.eval.model.CreateDatasetRunItemRequest;
List<CreateDatasetRunItemRequest> runItems = items.stream()
.map(item -> CreateDatasetRunItemRequest.builder()
.datasetItemId(item.getId())
.output(generatedOutput)
.traceId(trace.getId())
.build())
.collect(Collectors.toList());
CreateDatasetRunItemsResponse resp = datasetRuns.createItems(run.getId(), runItems);
Complete Example
import { AISDK } from '@browserstack/ai-sdk';
import OpenAI from 'openai';
const testOps = new AISDK({
publicKey: process.env.AISDK_PUBLIC_KEY,
secretKey: process.env.AISDK_SECRET_KEY,
});
const openai = new OpenAI();
/**
 * End-to-end evaluation example: creates the 'trivia-set' dataset, starts a
 * dataset run, answers every question with gpt-4o while tracing the call, and
 * records one run item per question.
 *
 * Uses the module-level `testOps` (AISDK) and `openai` (OpenAI) clients.
 *
 * @returns {Promise<void>} Resolves once the SDK has been shut down and the
 *   run ID has been logged.
 */
async function runEvaluation() {
  const datasets = testOps.datasets;
  const datasetRuns = testOps.datasetRuns;

  // 1. Create dataset with test cases
  await datasets.create('trivia-set', 'Trivia QA dataset');
  await datasets.createItems({
    datasetName: 'trivia-set',
    items: [
      { input: { q: 'Capital of Japan?' }, expectedOutput: { a: 'Tokyo' } },
      { input: { q: 'Speed of light?' }, expectedOutput: { a: '299,792,458 m/s' } },
    ],
  });

  // 2. Create a run to track results
  const run = await datasetRuns.create('trivia-set', { name: `eval-${Date.now()}` });

  // 3. Run pipeline for each item and record a run item
  const items = [
    { id: 'item-1', q: 'Capital of Japan?', expected: 'Tokyo' },
    { id: 'item-2', q: 'Speed of light?', expected: '299,792,458 m/s' },
  ];

  try {
    for (const item of items) {
      // The ground truth is kept out of the trace input; only the question is traced.
      const trace = testOps.trace({
        name: 'trivia-answer',
        input: { id: item.id, q: item.q },
      });
      const generation = trace.generation({
        name: 'answer',
        model: 'gpt-4o',
        input: [{ role: 'user', content: item.q }],
      });

      const response = await openai.chat.completions.create({
        model: 'gpt-4o',
        messages: [{ role: 'user', content: item.q }],
      });
      const answer = response.choices[0].message.content;

      generation.end({ output: answer });
      trace.update({ output: answer });

      // expectedOutput carries the ground truth (not the model's answer) and
      // uses the same q/a keys as the dataset items created above.
      // NOTE(review): no trace/observation ID is passed here, so the run item is
      // not explicitly linked to `trace` — confirm whether the SDK links it
      // automatically or whether an ID field must be supplied.
      await datasetRuns.createItems('trivia-set', run.id, [
        { input: { q: item.q }, expectedOutput: { a: item.expected } },
      ]);
    }
  } finally {
    // Shut down even when a request above fails part-way through the loop.
    await testOps.shutdown();
  }

  console.log('Evaluation complete. Run ID:', run.id);
}
runEvaluation();
import os
import openai
from browserstack_ai_sdk import AISDK

# Complete example: create a dataset, start a dataset run, answer each question
# with gpt-4o-mini while tracing the call, and record one run item per question.

client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)
openai_client = openai.OpenAI()

# 1. Create dataset with test cases
client.datasets.create(name="trivia-set", description="Trivia QA dataset")
client.datasets.create_items(
    dataset_name="trivia-set",
    items=[
        {"input": {"q": "Capital of Japan?"}, "expectedOutput": {"a": "Tokyo"}},
        {"input": {"q": "Speed of light?"}, "expectedOutput": {"a": "299,792,458 m/s"}},
    ],
)

# 2. Create a run to track results
run = client.dataset_runs.create(
    dataset_name="trivia-set",
    name="gpt-4o-eval-run",
)

# 3. Run pipeline for each item and record a run item
items = [
    {"q": "Capital of Japan?", "expected": "Tokyo"},
    {"q": "Speed of light?", "expected": "299,792,458 m/s"},
]

try:
    for item in items:
        trace = client.trace(name="trivia-answer", input=item)
        generation = trace.start_generation(
            name="answer",
            model="gpt-4o-mini",
            prompt=[{"role": "user", "content": item["q"]}],
        )

        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": item["q"]}],
        )
        answer = response.choices[0].message.content

        generation.update(output={"response": answer})
        generation.end()
        trace.update(output={"answer": answer})

        # NOTE(review): no "traceId" is passed here, so the run item is not
        # explicitly linked to `trace` — confirm whether linkage is automatic.
        client.dataset_runs.create_items(
            dataset_name="trivia-set",
            dataset_run_id=run["id"],
            items=[{
                "input": {"q": item["q"]},
                "expectedOutput": {"a": item["expected"]},
            }],
        )
finally:
    # Flush even when a request above fails part-way through the loop.
    client.flush()

print(f"Evaluation complete. Run ID: {run['id']}")