BrowserStack AI Evals
Evaluation › Datasets

Dataset Runs

Create dataset runs and link traces to dataset items from the dashboard or SDK.

Dataset Runs

Dataset runs group execution results for a given dataset, linking traces to specific dataset items for evaluation.

From the Dashboard

Open any dataset and select the Runs tab.

Runs Table

Each row shows one dataset run with these columns:

| Column | Description |
| --- | --- |
| Run Name | Run name (click to open run detail) |
| Version | Run version number |
| Status | Badge: COMPLETED, PROCESSING, FAILED, etc. |
| Error | Error message if the run failed |
| Created At | Timestamp |
| Count Run Items | Number of items in the run |
| Avg Latency | Average latency in seconds |
| Avg Total Cost | Average cost in USD |
| Scores | Evaluation scores (grouped by evaluator) |
| Tags | Tags attached to the run |
| Linked Experiments | Experiments referencing this run |

Create a Run

Click Create Dataset Run in the top-right of the Runs tab.

Choose how outputs will be generated:

  • Prompt — select a prompt and version from your library. Model parameters (provider, model, temperature) are inherited from the prompt but can be edited.
  • API Record — select a configured API endpoint that will be called for each dataset item.

Set Concurrency (1–10) — how many items to process in parallel.

Click Execute Dataset Run to start. The run appears in the table with a PROCESSING status and updates to COMPLETED when done.

Upload CSV to a Run

Click Upload Dataset Run in the top-right of the Runs tab.

Select or create a Dataset Run Tag — type a name to create a new tag.

Upload a CSV file. Preview the parsed data before confirming.

The items are added to a mutable dataset run under the selected tag.

Run Actions

Click the actions menu on any run row for:

  • Export as CSV — download run results as CSV
  • Export as JSON — download run results as JSON
  • Delete — permanently remove the run

Bulk Actions

Select multiple runs using the checkboxes, then use the Actions dropdown:

  • Compare — compare selected runs side by side (requires 2+ runs)

From the SDK

Create a Dataset Run


import { AISDK } from '@browserstack/ai-sdk';
// Build the SDK client from credentials supplied through the environment.
const { AISDK_PUBLIC_KEY, AISDK_SECRET_KEY } = process.env;
const testOps = new AISDK({
  publicKey: AISDK_PUBLIC_KEY,
  secretKey: AISDK_SECRET_KEY,
});

// Dataset-run operations are exposed on the client's `datasetRuns` property.
const { datasetRuns } = testOps;

const datasetName = 'qa-golden-set';
const tagName = 'production-v1';
const runOptions = { name: 'My Dataset Run', tag: tagName };

console.log('Creating dataset run...');
const datasetRun = await datasetRuns.create(datasetName, runOptions);

// Report the new run's ID and the ID of the first tag on the returned run.
console.log(`Dataset run created with ID: ${datasetRun.id}`);
console.log(`Dataset run tag ID: ${datasetRun.tags[0].id}`);
# Python: create a run on the "qa-golden-set" dataset.
# NOTE(review): `client` is not defined in this snippet -- presumably an
# initialized AISDK client; confirm against the setup code.
run = client.dataset_runs.create(
    dataset_name="qa-golden-set",
    name="gpt-4o-run-1",
    description="GPT-4o baseline evaluation",
    tag="production-v1",  # optional
)

# The result is indexed like a dict: print the run ID and the ID of its first tag.
print(f"Run ID: {run['id']}")
print(f"Run tag id: {run['tags'][0]['id']}")
import com.browserstack.aisdk.eval.DatasetRunsClient;

// Java: obtain the dataset-runs client from the SDK.
// NOTE(review): `sdk` is not defined in this snippet -- presumably an
// initialized SDK instance; confirm against the setup code.
DatasetRunsClient datasetRuns = sdk.datasetRuns();

// Create a run on the "qa-golden-set" dataset and print the returned run's ID.
DatasetRunResponse run = datasetRuns.create("qa-golden-set");
System.out.println("Run ID: " + run.getId());

Add Run Items

Add items to an existing dataset run.

// JavaScript: add one item to a run, addressed by dataset name and run ID.
// `datasetRun` is the object returned by `datasetRuns.create(...)`.
// NOTE(review): the Python and Java snippets on this page attach a trace ID to
// each run item; this call does not -- confirm the JS field name if linking is needed.
await datasetRuns.createItems(
  'qa-golden-set',
  datasetRun.id,
  [
    {
      input: { question: 'What is AI?' },
      expectedOutput: { answer: 'AI is...' },
    },
  ]
);
# Python: add one item to a run, addressed by dataset name and run ID.
# `run` is the result of `client.dataset_runs.create(...)`.
# The item carries its input, the expected output, and the IDs of the trace
# (and optionally the observation) to associate with it.
client.dataset_runs.create_items(
    dataset_name="qa-golden-set",
    dataset_run_id=run["id"],
    items=[
        {
            "input": {"messages": [{"role": "user", "content": "What is machine learning?"}]},
            "expectedOutput": {"should_contain": ["algorithm", "data", "patterns"]},
            "traceId": "trace-id-abc",
            "observationId": "gen-id-xyz",  # optional
        },
    ],
)
import com.browserstack.aisdk.eval.model.CreateDatasetRunItemRequest;

// Java: build one run-item request per dataset item, carrying the dataset
// item ID, an output, and a trace ID.
// NOTE(review): `items`, `generatedOutput`, and `trace` are not defined in this
// snippet; as written the same output and trace are reused for every item --
// presumably they should be produced per item. Confirm against real usage.
List<CreateDatasetRunItemRequest> runItems = items.stream()
    .map(item -> CreateDatasetRunItemRequest.builder()
        .datasetItemId(item.getId())
        .output(generatedOutput)
        .traceId(trace.getId())
        .build())
    .collect(Collectors.toList());

// Submit all run items for the run in a single call.
CreateDatasetRunItemsResponse resp = datasetRuns.createItems(run.getId(), runItems);

Complete Example

End-to-end example that creates a dataset, runs the pipeline for each item, and links traces to a dataset run.

import { AISDK } from '@browserstack/ai-sdk';
import OpenAI from 'openai';

// SDK client, authenticated with key material read from the environment.
const testOps = new AISDK({
  publicKey: process.env.AISDK_PUBLIC_KEY,
  secretKey: process.env.AISDK_SECRET_KEY,
});
// OpenAI client constructed with no explicit options.
const openai = new OpenAI();

/**
 * End-to-end evaluation: seeds the `trivia-set` dataset, creates a dataset
 * run, then for each question records a trace and generation around an OpenAI
 * chat completion and adds a run item holding the reference answer.
 *
 * Uses the module-level `testOps` (AISDK client) and `openai` client.
 *
 * @returns {Promise<void>} Settles after `testOps.shutdown()` has been awaited.
 * @throws Re-throws any error raised by the SDK or OpenAI calls.
 */
async function runEvaluation() {
  const { datasets, datasetRuns } = testOps;

  try {
    // 1. Create dataset with test cases
    await datasets.create('trivia-set', 'Trivia QA dataset');
    await datasets.createItems({
      datasetName: 'trivia-set',
      items: [
        { input: { q: 'Capital of Japan?' }, expectedOutput: { a: 'Tokyo' } },
        { input: { q: 'Speed of light?' }, expectedOutput: { a: '299,792,458 m/s' } },
      ],
    });

    // 2. Create a run to track results
    const run = await datasetRuns.create(
      'trivia-set',
      { name: `eval-${Date.now()}` }
    );

    // 3. Run pipeline for each item and link traces
    const items = [
      { id: 'item-1', q: 'Capital of Japan?', expected: 'Tokyo' },
      { id: 'item-2', q: 'Speed of light?', expected: '299,792,458 m/s' },
    ];

    for (const { id, q, expected } of items) {
      // Keep the reference answer out of the trace input; only the question is sent.
      const trace = testOps.trace({ name: 'trivia-answer', input: { id, q } });
      const generation = trace.generation({
        name: 'answer',
        model: 'gpt-4o',
        input: [{ role: 'user', content: q }],
      });

      const response = await openai.chat.completions.create({
        model: 'gpt-4o',
        messages: [{ role: 'user', content: q }],
      });
      const answer = response.choices[0].message.content;

      generation.end({ output: answer });
      trace.update({ output: answer });

      // expectedOutput is the reference answer, not the model's response:
      // storing the response here would make every item trivially "match".
      // NOTE(review): the Python and Java snippets on this page attach a trace
      // ID to run items; confirm the JS field name before adding it here.
      await datasetRuns.createItems(
        'trivia-set',
        run.id,
        [
          {
            input: { question: q },
            expectedOutput: { answer: expected },
          },
        ]
      );
    }

    console.log('Evaluation complete. Run ID:', run.id);
  } finally {
    // Always shut the client down, even when a step above throws.
    await testOps.shutdown();
  }
}

// Surface failures instead of leaving an unhandled promise rejection.
runEvaluation().catch((err) => {
  console.error('Evaluation failed:', err);
  process.exitCode = 1;
});
import os
import openai
from browserstack_ai_sdk import AISDK

# SDK client, authenticated with key material read from the environment.
client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)
openai_client = openai.OpenAI()

# Single source of truth for the model, so the run name, the logged
# generation, and the actual completion call cannot drift apart.
MODEL = "gpt-4o-mini"

# 1. Create dataset with test cases
client.datasets.create(name="trivia-set", description="Trivia QA dataset")
client.datasets.create_items(
    dataset_name="trivia-set",
    items=[
        {"input": {"q": "Capital of Japan?"}, "expectedOutput": {"a": "Tokyo"}},
        {"input": {"q": "Speed of light?"}, "expectedOutput": {"a": "299,792,458 m/s"}},
    ],
)

# 2. Create a run to track results
run = client.dataset_runs.create(
    dataset_name="trivia-set",
    name=f"{MODEL}-eval-run",
)

# 3. Run pipeline for each item and link traces
items = [
    {"q": "Capital of Japan?", "expected": "Tokyo"},
    {"q": "Speed of light?", "expected": "299,792,458 m/s"},
]

try:
    for item in items:
        trace = client.trace(name="trivia-answer", input=item)
        generation = trace.start_generation(
            name="answer",
            model=MODEL,
            prompt=[{"role": "user", "content": item["q"]}],
        )

        response = openai_client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": item["q"]}],
        )

        answer = response.choices[0].message.content
        generation.update(output={"response": answer})
        generation.end()
        trace.update(output={"answer": answer})

        # The run item stores the question and the reference answer.
        client.dataset_runs.create_items(
            dataset_name="trivia-set",
            dataset_run_id=run["id"],
            items=[{
                "input": {"q": item["q"]},
                "expectedOutput": {"a": item["expected"]},
            }],
        )
finally:
    # Call flush even when an item fails part-way through the loop.
    client.flush()

print(f"Evaluation complete. Run ID: {run['id']}")