Dataset Runs
Create dataset runs and link traces to dataset items from the dashboard or SDK.
Dataset Runs
Dataset runs group execution results for a given dataset, linking traces to specific dataset items for evaluation.
From the Dashboard
Open any dataset and select the Runs tab.
Runs Table
Each row shows one dataset run with these columns:
| Column | Description |
|---|---|
| Run Name | Run name (click to open run detail) |
| Version | Run version number |
| Status | Badge: COMPLETED, PROCESSING, FAILED, etc. |
| Error | Error message if the run failed |
| Created At | Timestamp |
| Count Run Items | Number of items in the run |
| Avg Latency | Average latency in seconds |
| Avg Total Cost | Average cost in USD |
| Scores | Evaluation scores (grouped by evaluator) |
| Tags | Tags attached to the run |
| Linked Experiments | Experiments referencing this run |
Create a Run
Click Create Dataset Run in the top-right of the Runs tab.
Choose how outputs will be generated:
- Prompt — select a prompt and version from your library. Model parameters (provider, model, temperature) are inherited from the prompt but can be edited.
- API Record — select a configured API endpoint that will be called for each dataset item.
Set Concurrency (1–10) — how many items to process in parallel.
Click Execute Dataset Run to start. The run appears in the table with a PROCESSING status and updates to COMPLETED when done.
Upload CSV to a Run
Click Upload Dataset Run in the top-right of the Runs tab.
Select or create a Dataset Run Tag — type a name to create a new tag.
Upload a CSV file. Preview the parsed data before confirming.
The items are added to a mutable dataset run under the selected tag.
Run Actions
Click the actions menu on any run row for:
- Export as CSV — download run results as CSV
- Export as JSON — download run results as JSON
- Delete — permanently remove the run
Bulk Actions
Select multiple runs using the checkboxes, then use the Actions dropdown:
- Compare — compare selected runs side by side (requires 2+ runs)
From the SDK
Create a Dataset Run
import { AISDK } from '@browserstack/ai-sdk';
// Authenticate the client with the key pair read from the environment.
const { AISDK_PUBLIC_KEY: publicKey, AISDK_SECRET_KEY: secretKey } = process.env;
const testOps = new AISDK({ publicKey, secretKey });

// Handle for the dataset-run operations exposed by the client.
const { datasetRuns } = testOps;

const datasetName = 'qa-golden-set';
const tagName = 'production-v1';

console.log('Creating dataset run...');

// Create the run under the named dataset; the tag is passed in the create options.
const runOptions = { name: 'My Dataset Run', tag: tagName };
const datasetRun = await datasetRuns.create(datasetName, runOptions);

console.log(`Dataset run created with ID: ${datasetRun.id}`);
console.log(`Dataset run tag ID: ${datasetRun.tags[0].id}`);
run = client.dataset_runs.create(
dataset_name="qa-golden-set",
name="gpt-4o-run-1",
description="GPT-4o baseline evaluation",
tag="production-v1", # optional
)
print(f"Run ID: {run['id']}")
print(f"Run tag id: {run['tags'][0]['id']}")
import com.browserstack.aisdk.eval.DatasetRunsClient;
DatasetRunsClient datasetRuns = sdk.datasetRuns();
DatasetRunResponse run = datasetRuns.create("qa-golden-set");
System.out.println("Run ID: " + run.getId());
Add Run Items
Add items to an existing dataset run by passing the run's ID.
await datasetRuns.createItems(
'qa-golden-set',
datasetRun.id,
[
{
input: { question: 'What is AI?' },
expectedOutput: { answer: 'AI is...' },
},
]
);
client.dataset_runs.create_items(
dataset_name="qa-golden-set",
dataset_run_id=run["id"],
items=[
{
"input": {"messages": [{"role": "user", "content": "What is machine learning?"}]},
"expectedOutput": {"should_contain": ["algorithm", "data", "patterns"]},
"traceId": "trace-id-abc",
"observationId": "gen-id-xyz", # optional
},
],
)
import com.browserstack.aisdk.eval.model.CreateDatasetRunItemRequest;
List<CreateDatasetRunItemRequest> runItems = items.stream()
.map(item -> CreateDatasetRunItemRequest.builder()
.datasetItemId(item.getId())
.output(generatedOutput)
.traceId(trace.getId())
.build())
.collect(Collectors.toList());
CreateDatasetRunItemsResponse resp = datasetRuns.createItems(run.getId(), runItems);
Complete Example
End-to-end example that creates a dataset, runs the pipeline for each item, and links traces to a dataset run.
import { AISDK } from '@browserstack/ai-sdk';
import OpenAI from 'openai';
// Platform client, authenticated with the key pair read from the environment.
const { AISDK_PUBLIC_KEY: publicKey, AISDK_SECRET_KEY: secretKey } = process.env;
const testOps = new AISDK({ publicKey, secretKey });

// OpenAI client constructed without explicit options.
const openai = new OpenAI();
/**
 * End-to-end evaluation: creates the `trivia-set` dataset, opens a dataset
 * run, answers every test case with the model, and records one run item per
 * test case that carries the expected answer and a link to the trace.
 *
 * Test cases are processed sequentially. Any error from the SDK or from the
 * OpenAI call propagates to the caller, but `testOps.shutdown()` is still
 * invoked first.
 *
 * @returns {Promise<void>} Resolves once every item is recorded and the
 *   client has been shut down.
 */
async function runEvaluation() {
  const { datasets, datasetRuns } = testOps;
  const datasetName = 'trivia-set';

  // Single source of truth for the questions and their expected answers, so
  // the dataset items and the run items cannot drift apart.
  const testCases = [
    { q: 'Capital of Japan?', expected: 'Tokyo' },
    { q: 'Speed of light?', expected: '299,792,458 m/s' },
  ];

  try {
    // 1. Create dataset with test cases
    await datasets.create(datasetName, 'Trivia QA dataset');
    await datasets.createItems({
      datasetName,
      items: testCases.map(({ q, expected }) => ({
        input: { q },
        expectedOutput: { a: expected },
      })),
    });

    // 2. Create a run to track results
    const run = await datasetRuns.create(datasetName, { name: `eval-${Date.now()}` });

    // 3. Run pipeline for each item and link traces
    for (const { q, expected } of testCases) {
      const trace = testOps.trace({ name: 'trivia-answer', input: { q } });
      const generation = trace.generation({
        name: 'answer',
        model: 'gpt-4o',
        input: [{ role: 'user', content: q }],
      });

      const response = await openai.chat.completions.create({
        model: 'gpt-4o',
        messages: [{ role: 'user', content: q }],
      });
      const answer = response.choices[0].message.content;

      generation.end({ output: answer });
      trace.update({ output: answer });

      await datasetRuns.createItems(datasetName, run.id, [
        {
          input: { q },
          // The expected answer, not the model's answer: the model output is
          // already stored on the trace.
          expectedOutput: { a: expected },
          // Links this run item to the trace produced above.
          // NOTE(review): assumes the trace handle exposes `id` — confirm
          // against the SDK.
          traceId: trace.id,
        },
      ]);
    }

    console.log('Evaluation complete. Run ID:', run.id);
  } finally {
    // Runs even when a step above throws, so the client is always shut down.
    await testOps.shutdown();
  }
}
runEvaluation();
import os
import openai
from browserstack_ai_sdk import AISDK
# Platform client, authenticated with the key pair read from the environment.
client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)
openai_client = openai.OpenAI()

try:
    # 1. Create dataset with test cases
    client.datasets.create(name="trivia-set", description="Trivia QA dataset")
    client.datasets.create_items(
        dataset_name="trivia-set",
        items=[
            {"input": {"q": "Capital of Japan?"}, "expectedOutput": {"a": "Tokyo"}},
            {"input": {"q": "Speed of light?"}, "expectedOutput": {"a": "299,792,458 m/s"}},
        ],
    )

    # 2. Create a run to track results
    run = client.dataset_runs.create(
        dataset_name="trivia-set",
        name="gpt-4o-eval-run",
    )

    # 3. Run pipeline for each item and link traces
    items = [
        {"q": "Capital of Japan?", "expected": "Tokyo"},
        {"q": "Speed of light?", "expected": "299,792,458 m/s"},
    ]

    for item in items:
        trace = client.trace(name="trivia-answer", input=item)
        generation = trace.start_generation(
            name="answer",
            model="gpt-4o-mini",
            prompt=[{"role": "user", "content": item["q"]}],
        )

        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": item["q"]}],
        )
        answer = response.choices[0].message.content

        generation.update(output={"response": answer})
        generation.end()
        trace.update(output={"answer": answer})

        client.dataset_runs.create_items(
            dataset_name="trivia-set",
            dataset_run_id=run["id"],
            items=[{
                "input": {"q": item["q"]},
                "expectedOutput": {"a": item["expected"]},
                # Links this run item to the trace produced above.
                # NOTE(review): assumes the trace handle exposes `.id` — confirm
                # against the SDK.
                "traceId": trace.id,
            }],
        )

    print(f"Evaluation complete. Run ID: {run['id']}")
finally:
    # Runs even when a step above raises, so flush() is always called.
    client.flush()