Dataset Runs

Dataset runs group execution results for a given dataset, linking traces to specific dataset items for evaluation.

From the Dashboard

Open any dataset and select the Runs tab.

Runs Table

Each row shows one dataset run with these columns:

Column	Description
Run Name	Run name (click to open run detail)
Version	Run version number
Status	Badge: `COMPLETED`, `PROCESSING`, `FAILED`, etc.
Error	Error message if the run failed
Created At	Timestamp
Count Run Items	Number of items in the run
Avg Latency	Average latency in seconds
Avg Total Cost	Average cost in USD
Scores	Evaluation scores (grouped by evaluator)
Tags	Tags attached to the run
Linked Experiments	Experiments referencing this run

Create a Run

Click Create Dataset Run in the top-right of the Runs tab.

Choose how outputs will be generated:

Prompt — select a prompt and version from your library. Model parameters (provider, model, temperature) are inherited from the prompt but can be edited.
API Record — select a configured API endpoint that will be called for each dataset item.

Set Concurrency (1–10) — how many items to process in parallel.

Click Execute Dataset Run to start. The run appears in the table with a PROCESSING status and updates to COMPLETED when done.

Upload CSV to a Run

Click Upload Dataset Run in the top-right of the Runs tab.

Select or create a Dataset Run Tag — type a name to create a new tag.

Upload a CSV file. Preview the parsed data before confirming.

The items are added to a mutable dataset run under the selected tag.

Run Actions

Click the actions menu on any run row for:

Export as CSV — download run results as CSV
Export as JSON — download run results as JSON
Delete — permanently remove the run

Bulk Actions

Select multiple runs using the checkboxes, then use the Actions dropdown:

Compare — compare selected runs side by side (requires 2+ runs)

From the SDK

Create a Dataset Run


import { AISDK } from '@browserstack/ai-sdk';
// Authenticate the SDK with the key pair read from the environment.
const { AISDK_PUBLIC_KEY, AISDK_SECRET_KEY } = process.env;
const testOps = new AISDK({
  publicKey: AISDK_PUBLIC_KEY,
  secretKey: AISDK_SECRET_KEY,
});

// Dataset-run operations are exposed on the `datasetRuns` property of the client.
const { datasetRuns } = testOps;

// The run is created against this dataset and labelled with this tag.
const datasetName = 'qa-golden-set';
const tagName = 'production-v1';

console.log('Creating dataset run...');
const runOptions = { name: 'My Dataset Run', tag: tagName };
const datasetRun = await datasetRuns.create(datasetName, runOptions);

// The response carries the run ID and the tags attached to the run.
console.log(`Dataset run created with ID: ${datasetRun.id}`);
console.log(`Dataset run tag ID: ${datasetRun.tags[0].id}`);

# Parameters for the new run on the "qa-golden-set" dataset.
run_params = {
    "dataset_name": "qa-golden-set",
    "name": "gpt-4o-run-1",
    "description": "GPT-4o baseline evaluation",
    "tag": "production-v1",  # optional
}
run = client.dataset_runs.create(**run_params)

# The response is indexed like a dict: the run ID plus the tags attached to the run.
run_id = run["id"]
print(f"Run ID: {run_id}")
first_tag_id = run["tags"][0]["id"]
print(f"Run tag id: {first_tag_id}")

import com.browserstack.aisdk.eval.DatasetRunsClient;

// Obtain the dataset-runs client from the SDK instance.
// NOTE(review): `sdk` is not defined in this snippet; presumably an already-configured SDK client. Confirm.
DatasetRunsClient datasetRuns = sdk.datasetRuns();

// Create a run for the "qa-golden-set" dataset and print the ID of the returned run.
// NOTE(review): `DatasetRunResponse` is not imported in this snippet; add its import before compiling.
DatasetRunResponse run = datasetRuns.create("qa-golden-set");
System.out.println("Run ID: " + run.getId());

Add Run Items

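Add items to an existing dataset run. Each item carries an input and an expected output, and can optionally reference the trace (`traceId`) and observation (`observationId`) that produced the result.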

// One run item: the input given to the pipeline and the output expected back.
const runItems = [
  {
    input: { question: 'What is AI?' },
    expectedOutput: { answer: 'AI is...' },
  },
];

// Attach the items to the run, addressed by dataset name and run ID.
await datasetRuns.createItems('qa-golden-set', datasetRun.id, runItems);

# One run item: the input, the expected output, and the IDs of the trace
# (and, optionally, the observation) the item should reference.
run_item = {
    "input": {"messages": [{"role": "user", "content": "What is machine learning?"}]},
    "expectedOutput": {"should_contain": ["algorithm", "data", "patterns"]},
    "traceId": "trace-id-abc",
    "observationId": "gen-id-xyz",  # optional
}

# Attach the item to the run, addressed by dataset name and run ID.
client.dataset_runs.create_items(
    dataset_name="qa-golden-set",
    dataset_run_id=run["id"],
    items=[run_item],
)

import com.browserstack.aisdk.eval.model.CreateDatasetRunItemRequest;

// Build one run-item request per dataset item, referencing the dataset item's ID,
// an output value, and a trace ID.
// NOTE(review): `items`, `generatedOutput`, and `trace` are not defined in this snippet;
// as written, every request receives the same output and the same trace ID. Confirm
// whether each item should get its own output and trace.
// Requires java.util.List and java.util.stream.Collectors to be imported.
List<CreateDatasetRunItemRequest> runItems = items.stream()
    .map(item -> CreateDatasetRunItemRequest.builder()
        .datasetItemId(item.getId())
        .output(generatedOutput)
        .traceId(trace.getId())
        .build())
    .collect(Collectors.toList());

// Submit all requests to the run identified by `run.getId()`.
CreateDatasetRunItemsResponse resp = datasetRuns.createItems(run.getId(), runItems);

Complete Example

End-to-end example that creates a dataset, runs the pipeline for each item, and links traces to a dataset run.

import { AISDK } from '@browserstack/ai-sdk';
import OpenAI from 'openai';

// Clients are created once and shared by every step of the evaluation.
const testOps = new AISDK({
  publicKey: process.env.AISDK_PUBLIC_KEY,
  secretKey: process.env.AISDK_SECRET_KEY,
});
const openai = new OpenAI();

// Model used for the completion request and recorded on the generation.
const MODEL = 'gpt-4o';

// Single source of truth for the test cases: each question with its expected answer.
const TEST_CASES = [
  { q: 'Capital of Japan?', expected: 'Tokyo' },
  { q: 'Speed of light?', expected: '299,792,458 m/s' },
];

/**
 * Creates the `trivia-set` dataset, answers every test case with the model,
 * records a trace per answer, and links each trace to a new dataset run.
 *
 * The SDK is shut down in a `finally` block so it runs even when a step fails.
 *
 * @returns {Promise<void>} Resolves after all run items are created and the SDK is shut down.
 * @throws Rethrows any error raised by the SDK or the OpenAI client.
 */
async function runEvaluation() {
  const { datasets, datasetRuns } = testOps;

  try {
    // 1. Create dataset with test cases
    await datasets.create('trivia-set', 'Trivia QA dataset');
    await datasets.createItems({
      datasetName: 'trivia-set',
      items: TEST_CASES.map(({ q, expected }) => ({
        input: { q },
        expectedOutput: { a: expected },
      })),
    });

    // 2. Create a run to track results
    const run = await datasetRuns.create('trivia-set', { name: `eval-${Date.now()}` });

    // 3. Run pipeline for each item and link traces
    for (const item of TEST_CASES) {
      const messages = [{ role: 'user', content: item.q }];

      const trace = testOps.trace({ name: 'trivia-answer', input: item });
      const generation = trace.generation({
        name: 'answer',
        model: MODEL,
        input: messages,
      });

      const response = await openai.chat.completions.create({
        model: MODEL,
        messages,
      });
      const answer = response.choices[0].message.content;

      generation.end({ output: answer });
      trace.update({ output: answer });

      await datasetRuns.createItems('trivia-set', run.id, [
        {
          input: { q: item.q },
          // The expected answer comes from the test case, never from the model,
          // otherwise every item would trivially match its own output.
          expectedOutput: { a: item.expected },
          // Links this run item to the trace recorded above.
          // NOTE(review): assumes the trace object exposes its ID as `trace.id`; confirm against the SDK.
          traceId: trace.id,
        },
      ]);
    }

    console.log('Evaluation complete. Run ID:', run.id);
  } finally {
    await testOps.shutdown();
  }
}

// Surface failures instead of leaving an unhandled promise rejection.
runEvaluation().catch((err) => {
  console.error('Evaluation failed:', err);
  process.exitCode = 1;
});

import os
import openai
from browserstack_ai_sdk import AISDK

# Clients are created once and shared by every step of the evaluation.
client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)
openai_client = openai.OpenAI()

# Model used for the completion request, recorded on the generation, and
# embedded in the run name so the run is labelled with the model actually used.
MODEL = "gpt-4o-mini"
DATASET_NAME = "trivia-set"

# Single source of truth for the test cases: each question with its expected answer.
items = [
    {"q": "Capital of Japan?", "expected": "Tokyo"},
    {"q": "Speed of light?", "expected": "299,792,458 m/s"},
]

try:
    # 1. Create dataset with test cases
    client.datasets.create(name=DATASET_NAME, description="Trivia QA dataset")
    client.datasets.create_items(
        dataset_name=DATASET_NAME,
        items=[
            {"input": {"q": item["q"]}, "expectedOutput": {"a": item["expected"]}}
            for item in items
        ],
    )

    # 2. Create a run to track results
    run = client.dataset_runs.create(
        dataset_name=DATASET_NAME,
        name=f"{MODEL}-eval-run",
    )

    # 3. Run pipeline for each item and link traces
    for item in items:
        messages = [{"role": "user", "content": item["q"]}]

        trace = client.trace(name="trivia-answer", input=item)
        generation = trace.start_generation(
            name="answer",
            model=MODEL,
            prompt=messages,
        )

        response = openai_client.chat.completions.create(
            model=MODEL,
            messages=messages,
        )

        answer = response.choices[0].message.content
        generation.update(output={"response": answer})
        generation.end()
        trace.update(output={"answer": answer})

        client.dataset_runs.create_items(
            dataset_name=DATASET_NAME,
            dataset_run_id=run["id"],
            items=[{
                "input": {"q": item["q"]},
                "expectedOutput": {"a": item["expected"]},
                # Links this run item to the trace recorded above.
                # NOTE(review): assumes the trace object exposes its ID as `trace.id`; confirm against the SDK.
                "traceId": trace.id,
            }],
        )

    print(f"Evaluation complete. Run ID: {run['id']}")
finally:
    # Flush in `finally` so the call is made even when a step above raises.
    client.flush()

Dataset Runs

On this page