BrowserStack AI Evals
Evaluation

Experiments

Create and run experiments with the TypeScript, Python, and Java SDKs.

Experiments

Experiments let you systematically evaluate a prompt + dataset + evaluator combination. Each run executes every dataset item through the prompt and scores the output.

Setup

import { AISDK } from '@browserstack/ai-sdk';

// Credentials come from the environment so they never appear in source.
const { AISDK_PUBLIC_KEY: publicKey, AISDK_SECRET_KEY: secretKey } = process.env;
const testOps = new AISDK({ publicKey, secretKey });

// Client handles reused by the TypeScript snippets that follow:
// experiments -> Experiments, experimentRuns -> ExperimentRuns.
const { experiments, experimentRuns } = testOps;
import os
from browserstack_ai_sdk import AISDK

# Read both credentials from the environment up front; os.environ[...] raises
# KeyError when a variable is missing.
public_key = os.environ["AISDK_PUBLIC_KEY"]
secret_key = os.environ["AISDK_SECRET_KEY"]

client = AISDK(public_key=public_key, secret_key=secret_key)
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.ExperimentsClient;
import com.browserstack.aisdk.eval.ExperimentRunsClient;

// Build the SDK entry point.
// NOTE(review): fromEnv() presumably reads the same AISDK_PUBLIC_KEY /
// AISDK_SECRET_KEY variables the other SDKs use — confirm against the SDK docs.
TestOps sdk = TestOps.fromEnv();
// Client handles reused by the Java snippets that follow.
ExperimentsClient experiments = sdk.experiments();
ExperimentRunsClient experimentRuns = sdk.experimentRuns();

Create an Experiment

An experiment requires a name, an evaluatorListId, and either a datasetRunTagId on its own or both a promptId and a datasetId.

With a prompt and dataset

import { CreateExperimentWithPromptRequest } from '@browserstack/ai-sdk';

// Prompt + dataset variant: promptId and datasetId are supplied together.
const request = {
  name: 'qa-accuracy-test',
  description: 'Evaluate QA accuracy on golden set',
  promptId: 'prompt-id-abc',
  datasetId: 'dataset-id-xyz',
  evaluatorListId: 'eval-list-id-123',
  concurrency: 5,
} satisfies CreateExperimentWithPromptRequest;

const experiment = await experiments.create(request);

// The returned id is what later calls use to refer to this experiment.
console.log(experiment.id);
# Prompt + dataset variant: datasetId and promptId are supplied together.
payload = {
    "name": "rag-eval-v2",
    "evaluatorListId": "eval-list-id",
    "datasetId": "dataset-id",
    "promptId": "prompt-id",
}
experiment = client.experiments.create(payload)
import com.browserstack.aisdk.eval.model.CreateExperimentRequest;
import com.browserstack.aisdk.eval.model.ExperimentResponse;

// Prompt + dataset variant: promptId and datasetId are supplied together.
CreateExperimentRequest request = CreateExperimentRequest.builder()
    .name("gpt-4o-faithfulness-v2")
    .evaluatorListId("evl_abc123")
    .promptId("prm_xyz789")
    .datasetId("ds_def456")
    .concurrency(5)
    .build();

ExperimentResponse experiment = experiments.create(request);

System.out.println("Experiment ID: " + experiment.getId());

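You must provide either datasetRunTagId alone, or both promptId and datasetId together. Combining datasetRunTagId with promptId or datasetId, or supplying only one of promptId and datasetId, is invalid; the Java SDK throws IllegalArgumentException in that case.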

With a dataset run tag

import { CreateExperimentWithTagRequest } from '@browserstack/ai-sdk';

// Tag variant: datasetRunTagId is used on its own, without promptId/datasetId.
const request = {
  name: 'tag-based-experiment',
  datasetRunTagId: 'tag-id-abc',
  evaluatorListId: 'eval-list-id-123',
  concurrency: 3,
} satisfies CreateExperimentWithTagRequest;

const experiment = await experiments.create(request);
# Tag variant: datasetRunTagId is used on its own, without promptId/datasetId.
payload = {
    "name": "rag-eval-v1",
    "evaluatorListId": "eval-list-id",
    "datasetRunTagId": "tag-id-from-dataset-run",
}
experiment = client.experiments.create(payload)
print(experiment)

List and Get Experiments

// Paging arguments are positional: limit first, then page.
const limit = 20; // allowed range: 1-100
const page = 1;
const result = await experiments.list(limit, page);

for (const { id, name, createdAt } of result.experiments) {
  console.log(id, name, createdAt);
}

// Look up a single experiment by its id.
const experimentId = 'experiment-id-abc';
const experiment = await experiments.find(experimentId);
console.log(experiment.name, experiment.status);
# Fetch one page of experiments.
page, limit = 1, 50
experiments = client.experiments.list(page=page, limit=limit)

# Look up a single experiment by its ID.
experiment_id = "experiment-id"
experiment = client.experiments.get(experiment_id)
print(experiment)
// Fetch one page of experiments and print each entry's id and name.
// NOTE(review): the arguments presumably mirror the TypeScript call (limit, then page) — confirm.
ListExperimentsResponse list = experiments.list(20, 1);
list.getData().forEach(e -> System.out.println(e.getId() + " " + e.getName()));

// Look up a single experiment by its id and print its status.
ExperimentResponse experiment = experiments.find("exp_abc123");
System.out.println("Status: " + experiment.getStatus());

Create and Monitor Runs

Create a Run

// create() takes positional arguments; naming them keeps the call readable.
const experimentId = 'experiment-id-abc';
const llmColumnName = 'output';
const concurrency = 5; // allowed range: 1-100
const runConfig = { temperature: 0.0 }; // optional

const run = await experimentRuns.create(
  experimentId,
  llmColumnName,
  concurrency,
  runConfig
);

console.log(run.id, run.status); // 'PENDING'
# A run references its experiment by ID and carries its own name.
payload = {
    "experimentId": "experiment-id",
    "name": "run-2026-04-01",
}
run = client.experiment_runs.create(payload)
print(run)
import com.browserstack.aisdk.eval.model.CreateExperimentRunRequest;
import com.browserstack.aisdk.eval.model.ExperimentRunResponse;

// A run is started for an existing experiment, identified by its id.
CreateExperimentRunRequest request = CreateExperimentRunRequest.builder()
    .experimentId(experiment.getId())
    .build();

ExperimentRunResponse run = experimentRuns.create(request);

System.out.println("Run ID: " + run.getId());

Poll for Completion

// Both durations are in milliseconds.
const timeoutMs = 120_000;
const pollIntervalMs = 5_000;

const result = await experimentRuns.subscribe(run.id, timeoutMs, pollIntervalMs);

// Any final status other than 'COMPLETED' is reported as a failure or timeout.
if (result.finalStatus !== 'COMPLETED') {
  console.error('Run failed or timed out:', result.finalStatus);
} else {
  console.log('Run completed:', result.experimentRunData);
}
// Default: polls every 5s, times out after 5 minutes
// The second argument is a callback that receives a run update and logs its status.
// NOTE(review): presumably invoked once per poll — confirm against the SDK docs.
ExperimentRunResponse finalRun = experimentRuns.subscribe(
    run.getId(),
    update -> System.out.println("Status: " + update.getStatus())
);

// subscribe() returns the run as it was when waiting ended.
System.out.println("Final status: " + finalRun.getStatus());

With a custom timeout (Java):

// 10 minutes x 60 seconds x 1,000 = 600,000; the L suffix keeps the arithmetic in long.
// NOTE(review): the timeout is presumably in milliseconds, as in the TypeScript SDK — confirm.
long tenMinutes = 10 * 60 * 1_000L;

// Same call as the default form, with the timeout passed as a third argument.
ExperimentRunResponse finalRun = experimentRuns.subscribe(
    run.getId(),
    update -> System.out.println("[" + update.getStatus() + "] " + update.getProgress()),
    tenMinutes
);

Run Status Values

/**
 * Status values reported for an experiment run.
 * Each member's string value is identical to its name.
 */
enum ExperimentRunStatus {
  PENDING    = 'PENDING',    // status of a freshly created run in the "Create a Run" example
  RUNNING    = 'RUNNING',
  COMPLETED  = 'COMPLETED',  // the value the polling examples treat as success
  FAILED     = 'FAILED',
  CANCELLED  = 'CANCELLED',
}

Complete End-to-End Example

import { AISDK, ExperimentRunStatus } from '@browserstack/ai-sdk';

const testOps = new AISDK({
  publicKey: process.env.AISDK_PUBLIC_KEY,
  secretKey: process.env.AISDK_SECRET_KEY,
});

/**
 * Creates an experiment, starts a run for it, and waits for the run to finish.
 *
 * Reads PROMPT_ID, DATASET_ID and EVALUATOR_LIST_ID from the environment.
 * If the run does not end in 'COMPLETED', the process exit code is set to 1.
 * The SDK is always shut down before this function settles, including when
 * one of the SDK calls throws.
 */
async function runExperiment(): Promise<void> {
  const { experiments, experimentRuns } = testOps;

  try {
    // 1. Create the experiment
    const experiment = await experiments.create({
      name: `accuracy-test-${Date.now()}`,
      description: 'Automated accuracy evaluation',
      promptId: process.env.PROMPT_ID!,
      datasetId: process.env.DATASET_ID!,
      evaluatorListId: process.env.EVALUATOR_LIST_ID!,
      concurrency: 5,
    });

    console.log('Created experiment:', experiment.id);

    // 2. Start a run
    const run = await experimentRuns.create(experiment.id, 'output', 5);
    console.log('Started run:', run.id, 'status:', run.status);

    // 3. Wait for completion (5 minute timeout, polling every 10 seconds)
    const result = await experimentRuns.subscribe(run.id, 300_000, 10_000);

    if (result.finalStatus === 'COMPLETED') {
      console.log('Experiment completed successfully!');
      console.log('Run details:', result.experimentRunData);
    } else {
      console.error('Experiment did not complete:', result.finalStatus);
      // Set the exit code rather than calling process.exit(): exiting here
      // would terminate the process before the shutdown in `finally` runs.
      process.exitCode = 1;
    }
  } finally {
    // Runs on success, on a non-completed run, and when any call above throws.
    await testOps.shutdown();
  }
}

// Never leave the promise floating: report failures and signal them via the exit code.
runExperiment().catch((error) => {
  console.error('Experiment run failed:', error);
  process.exitCode = 1;
});
import os
from browserstack_ai_sdk import AISDK

# Credentials are taken from the environment.
public_key = os.environ["AISDK_PUBLIC_KEY"]
secret_key = os.environ["AISDK_SECRET_KEY"]
client = AISDK(public_key=public_key, secret_key=secret_key)

# Step 1: describe and create the experiment (prompt + dataset variant).
experiment_payload = {
    "name": "support-qa-eval",
    "evaluatorListId": "<evaluator-list-id>",
    "datasetId": "<dataset-id>",
    "promptId": "<prompt-id>",
}
experiment = client.experiments.create(experiment_payload)

# Step 2: start a run for the experiment that was just created.
run_payload = {
    "experimentId": experiment["id"],
    "name": "initial-run",
}
run = client.experiment_runs.create(run_payload)

run_id = run["id"]
print(f"Experiment run created: {run_id}")
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.model.*;
import java.util.List;

/**
 * End-to-end example: create an experiment, start a run for it, and wait
 * for the run to finish, printing each status update along the way.
 */
public class ExperimentExample {
    /**
     * Runs the example.
     *
     * @param args unused
     * @throws InterruptedException declared by the original example; presumably
     *         raised when waiting for the run is interrupted — confirm against the SDK
     */
    public static void main(String[] args) throws InterruptedException {
        TestOps sdk = TestOps.fromEnv();

        try {
            // 1. Create an experiment
            ExperimentResponse experiment = sdk.experiments().create(
                CreateExperimentRequest.builder()
                    .name("aurora-qa-v1")
                    .evaluatorListId("evl_abc123")
                    .promptId("prm_xyz789")
                    .datasetId("aurora-qa")
                    .build()
            );

            // 2. Start a run
            ExperimentRunResponse run = sdk.experimentRuns().create(
                CreateExperimentRunRequest.builder()
                    .experimentId(experiment.getId())
                    .build()
            );

            // 3. Wait for completion, logging each update as "[status] runId"
            ExperimentRunResponse result = sdk.experimentRuns().subscribe(
                run.getId(),
                r -> System.out.printf("[%s] %s%n", r.getStatus(), r.getId())
            );

            System.out.println("Done: " + result.getStatus());
        } finally {
            // Shut the SDK down even when one of the calls above throws.
            sdk.shutdown();
        }
    }
}