BrowserStack AI Evals
Evaluation / Experiments

Experiment Runs

Start experiment runs, poll for completion, and view results using the TypeScript, Python, and Java SDKs.

Experiment Runs

Create a Run

// TypeScript: start a run for an existing experiment.
const experimentId = 'experiment-id-abc';
const llmColumnName = 'output';
const concurrency = 5; // allowed range: 1-100
const runConfig = { temperature: 0.0 }; // optional

const run = await experimentRuns.create(experimentId, llmColumnName, concurrency, runConfig);

// A freshly created run reports 'PENDING'.
console.log(run.id, run.status);
# Python: start a run for an existing experiment.
# NOTE(review): the end-to-end Python example on this page passes a dict to
# experiment_runs.create instead of a positional ID — confirm which call form
# the SDK accepts.
experiment_id = "cmnr8ovww005mif07uuxoafrq"
run = client.experiment_runs.create(experiment_id)

print(run["id"], run["status"])
import com.browserstack.aisdk.eval.model.CreateExperimentRunRequest;
import com.browserstack.aisdk.eval.model.ExperimentRunResponse;

// Java: start a run for an existing experiment.
CreateExperimentRunRequest request = CreateExperimentRunRequest.builder()
    .experimentId(experiment.getId())
    .build();
ExperimentRunResponse run = experimentRuns.create(request);

System.out.println("Run ID: " + run.getId());
System.out.println("Status: " + run.getStatus());

Poll for Completion

// TypeScript: wait for a run to finish, then branch on its final status.
const TIMEOUT_MS = 120_000;     // timeout in ms
const POLL_INTERVAL_MS = 5_000; // poll interval in ms

const result = await experimentRuns.subscribe(run.id, TIMEOUT_MS, POLL_INTERVAL_MS);

if (result.finalStatus !== 'COMPLETED') {
  console.error('Run failed or timed out:', result.finalStatus);
} else {
  console.log('Run completed:', result.experimentRunData);
}
// Java: wait for the run to finish; the callback prints the status of each
// update it receives, and the returned response carries the final status.
// Default: polls every 5s, times out after 5 minutes
ExperimentRunResponse finalRun = experimentRuns.subscribe(
    run.getId(),
    update -> System.out.println("Status: " + update.getStatus())
);

System.out.println("Final status: " + finalRun.getStatus());

With a custom timeout:

// Java: same wait, with an explicit timeout passed as the third argument.
// The value is presumably in milliseconds, matching the TypeScript SDK — confirm.
long tenMinutes = 10 * 60 * 1_000L;

ExperimentRunResponse finalRun = experimentRuns.subscribe(
    run.getId(),
    // Print "[status] progress" for each update received.
    update -> System.out.println("[" + update.getStatus() + "] " + update.getProgress()),
    tenMinutes
);

Without a callback:

// Java: wait for completion without a progress callback.
ExperimentRunResponse finalRun = experimentRuns.subscribe(run.getId());

Get a Run

// TypeScript: fetch a single run by its ID.
const run = await experimentRuns.find('run-id-abc');
console.log(run.status);
# Python: fetch a single run by its ID.
run = client.experiment_runs.find("run-id")
// Java: fetch a single run by its ID.
// NOTE(review): the TypeScript and Python snippets call `find` here — confirm
// that the Java method is really named `get`.
ExperimentRunResponse run = experimentRuns.get("run_abc123");

List Runs

// TypeScript: list runs, optionally filtered by experiment, one page at a time.
const experimentIdFilter = 'experiment-id-abc'; // optional filter
const page = 1;
const limit = 20;

const result = await experimentRuns.list(experimentIdFilter, page, limit);

for (const experimentRun of result.experimentRuns) {
  const { id, status, createdAt } = experimentRun;
  console.log(id, status, createdAt);
}
# Python: list the runs that belong to one experiment.
runs = client.experiment_runs.list(experiment_id="experiment-id")
// Java: list experiment runs. Each variant uses its own variable name so the
// snippet compiles when pasted as a whole.

// Runs for a specific experiment
ListExperimentRunsResponse runsForExperiment = experimentRuns.list("exp_abc123");

// With pagination (arguments: experiment ID, page, limit)
ListExperimentRunsResponse pagedRuns = experimentRuns.list("exp_abc123", 1, 20);

// All runs (no experiment filter)
ListExperimentRunsResponse allRuns = experimentRuns.list(null, 1, 50);

Update a Run

import { ExperimentRunStatus } from '@browserstack/ai-sdk';

// TypeScript: update a run's status, result, and metadata.
const changes = {
  status: ExperimentRunStatus.COMPLETED,
  result: { score: 0.87 },
  metadata: { note: 'Manual override' },
};

await experimentRuns.update('run-id-abc', changes);

Delete a Run

// TypeScript: delete a run by its ID.
await experimentRuns.delete('run-id-abc');

Run Status Values

/**
 * Status values a run can report. Each member's value is the string of the
 * same name, so a status can also be compared against the plain string.
 */
enum ExperimentRunStatus {
  PENDING = 'PENDING',
  RUNNING = 'RUNNING',
  COMPLETED = 'COMPLETED',
  FAILED = 'FAILED',
  CANCELLED = 'CANCELLED',
}

Complete End-to-End Example

import { AISDK, ExperimentRunStatus } from '@browserstack/ai-sdk';

// TypeScript: build the SDK client from credentials held in environment variables.
// NOTE(review): these two variables are read without a non-null assertion,
// unlike PROMPT_ID / DATASET_ID / EVALUATOR_LIST_ID further down — confirm the
// constructor accepts `undefined` under strict type checking.
const testOps = new AISDK({
  publicKey: process.env.AISDK_PUBLIC_KEY,
  secretKey: process.env.AISDK_SECRET_KEY,
});

/**
 * Creates an experiment, starts a run, and waits for it to finish.
 *
 * The SDK client is shut down in a `finally` block so it is released even when
 * a step throws or the run does not complete. A run that does not complete sets
 * `process.exitCode` to 1 instead of calling `process.exit(1)`, because
 * `process.exit` would end the process before the shutdown could run.
 */
async function runExperiment() {
  const experiments = testOps.experiments;
  const experimentRuns = testOps.experimentRuns;

  try {
    // 1. Create the experiment
    const experiment = await experiments.create({
      name: `accuracy-test-${Date.now()}`,
      description: 'Automated accuracy evaluation',
      promptId: process.env.PROMPT_ID!,
      datasetId: process.env.DATASET_ID!,
      evaluatorListId: process.env.EVALUATOR_LIST_ID!,
      concurrency: 5,
    });

    console.log('Created experiment:', experiment.id);

    // 2. Start a run
    const run = await experimentRuns.create(experiment.id, 'output', 5);
    console.log('Started run:', run.id, 'status:', run.status);

    // 3. Wait for completion (300 s timeout, 10 s poll interval)
    const result = await experimentRuns.subscribe(run.id, 300_000, 10_000);

    if (result.finalStatus === 'COMPLETED') {
      console.log('Experiment completed successfully!');
      console.log('Run details:', result.experimentRunData);
    } else {
      console.error('Experiment did not complete:', result.finalStatus);
      // Signal failure without terminating before the finally block runs.
      process.exitCode = 1;
    }
  } finally {
    await testOps.shutdown();
  }
}

// Surface any thrown error instead of leaving an unhandled promise rejection.
runExperiment().catch((error) => {
  console.error('Experiment run failed:', error);
  process.exitCode = 1;
});
import os
from browserstack_ai_sdk import AISDK

# Python: end-to-end example — create a dataset, an experiment, then a run.
client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)

# 1. Create dataset (see Datasets page)
dataset_name = "support-qa"
qa_items = [
    {"input": {"q": "How do I reset my password?"}, "expectedOutput": "Go to Settings > Security > Reset."},
    {"input": {"q": "What are your business hours?"}, "expectedOutput": "We are open 9am–6pm EST."},
]
client.datasets.create(name=dataset_name)
client.datasets.create_items(dataset_name=dataset_name, items=qa_items)

# 2. Create experiment
# The "<...>" values are placeholders to replace with real IDs.
experiment_config = {
    "name": "support-qa-eval",
    "evaluatorListId": "<evaluator-list-id>",
    "datasetId": "<dataset-id>",
    "promptId": "<prompt-id>",
}
experiment = client.experiments.create(experiment_config)

# 3. Create a run
# NOTE(review): the "Create a Run" Python snippet on this page passes the
# experiment ID positionally rather than a dict — confirm which form is correct.
run_request = {
    "experimentId": experiment["id"],
    "name": "initial-run",
}
run = client.experiment_runs.create(run_request)

print(f"Experiment run created: {run['id']}")
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.model.*;
import java.util.List;

/**
 * End-to-end example: creates a dataset with one item, creates an experiment,
 * starts a run, and waits for the run to finish.
 */
public class ExperimentExample {
    /**
     * Runs the example against an SDK client configured via {@code TestOps.fromEnv()}.
     *
     * <p>The client is shut down in a {@code finally} block so it is released even
     * if one of the SDK calls throws.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException propagated from the SDK calls below
     */
    public static void main(String[] args) throws InterruptedException {
        TestOps sdk = TestOps.fromEnv();
        try {
            // 1. Create a dataset
            sdk.datasets().create("aurora-qa", "Q&A about Aurora Borealis");
            sdk.datasets().createItems("aurora-qa", List.of(
                CreateDatasetItemRequest.builder()
                    .input("What causes Northern Lights?")
                    .expectedOutput("Solar wind particles colliding with atmospheric gases.")
                    .build()
            ));

            // 2. Create an experiment
            // NOTE(review): datasetId is given the dataset *name* used above —
            // confirm the API accepts a name here rather than a generated ID.
            ExperimentResponse experiment = sdk.experiments().create(
                CreateExperimentRequest.builder()
                    .name("aurora-qa-v1")
                    .evaluatorListId("evl_abc123")
                    .promptId("prm_xyz789")
                    .datasetId("aurora-qa")
                    .build()
            );

            // 3. Start a run
            ExperimentRunResponse run = sdk.experimentRuns().create(
                CreateExperimentRunRequest.builder()
                    .experimentId(experiment.getId())
                    .build()
            );

            // 4. Wait for completion, printing "[status] runId" for each update
            ExperimentRunResponse result = sdk.experimentRuns().subscribe(
                run.getId(),
                r -> System.out.printf("[%s] %s%n", r.getStatus(), r.getId())
            );

            System.out.println("Done: " + result.getStatus());
        } finally {
            sdk.shutdown();
        }
    }
}