Evaluation
Experiments
Create and run experiments across TypeScript, Python, and Java SDKs.
Experiments
Experiments let you systematically evaluate a prompt + dataset + evaluator combination. Each run executes every dataset item through the prompt and scores the output.
Setup
import { AISDK } from '@browserstack/ai-sdk';
const testOps = new AISDK({
publicKey: process.env.AISDK_PUBLIC_KEY,
secretKey: process.env.AISDK_SECRET_KEY,
});
const experiments = testOps.experiments; // Experiments
const experimentRuns = testOps.experimentRuns; // ExperimentRuns
import os
from browserstack_ai_sdk import AISDK
client = AISDK(
public_key=os.environ["AISDK_PUBLIC_KEY"],
secret_key=os.environ["AISDK_SECRET_KEY"],
)
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.ExperimentsClient;
import com.browserstack.aisdk.eval.ExperimentRunsClient;
TestOps sdk = TestOps.fromEnv();
ExperimentsClient experiments = sdk.experiments();
ExperimentRunsClient experimentRuns = sdk.experimentRuns();
Create an Experiment
An experiment requires a name, an evaluator list, and either a datasetRunTagId or both promptId + datasetId.
With a prompt and dataset
import { CreateExperimentWithPromptRequest } from '@browserstack/ai-sdk';
const experiment = await experiments.create({
name: 'qa-accuracy-test',
description: 'Evaluate QA accuracy on golden set',
promptId: 'prompt-id-abc',
datasetId: 'dataset-id-xyz',
evaluatorListId: 'eval-list-id-123',
concurrency: 5,
} satisfies CreateExperimentWithPromptRequest);
console.log(experiment.id);
experiment = client.experiments.create({
"name": "rag-eval-v2",
"evaluatorListId": "eval-list-id",
"datasetId": "dataset-id",
"promptId": "prompt-id",
})
import com.browserstack.aisdk.eval.model.CreateExperimentRequest;
import com.browserstack.aisdk.eval.model.ExperimentResponse;
ExperimentResponse experiment = experiments.create(
CreateExperimentRequest.builder()
.name("gpt-4o-faithfulness-v2")
.evaluatorListId("evl_abc123")
.promptId("prm_xyz789")
.datasetId("ds_def456")
.concurrency(5)
.build()
);
System.out.println("Experiment ID: " + experiment.getId());
You must provide either datasetRunTagId alone, or both promptId and datasetId together. Mixing them throws IllegalArgumentException.
With a dataset run tag
import { CreateExperimentWithTagRequest } from '@browserstack/ai-sdk';
const experiment = await experiments.create({
name: 'tag-based-experiment',
datasetRunTagId: 'tag-id-abc',
evaluatorListId: 'eval-list-id-123',
concurrency: 3,
} satisfies CreateExperimentWithTagRequest);
experiment = client.experiments.create({
"name": "rag-eval-v1",
"evaluatorListId": "eval-list-id",
"datasetRunTagId": "tag-id-from-dataset-run",
})
print(experiment)
List and Get Experiments
const result = await experiments.list(
20, // limit (1-100)
1 // page
);
for (const exp of result.experiments) {
console.log(exp.id, exp.name, exp.createdAt);
}
const experiment = await experiments.find('experiment-id-abc');
console.log(experiment.name, experiment.status);
experiments = client.experiments.list(page=1, limit=50)
experiment = client.experiments.get("experiment-id")
print(experiment)
ListExperimentsResponse list = experiments.list(20, 1);
list.getData().forEach(e -> System.out.println(e.getId() + " " + e.getName()));
ExperimentResponse experiment = experiments.find("exp_abc123");
System.out.println("Status: " + experiment.getStatus());
Create and Monitor Runs
Create a Run
const run = await experimentRuns.create(
'experiment-id-abc', // experimentId
'output', // llmColumnName
5, // concurrency (1-100)
{ temperature: 0.0 } // optional runConfig
);
console.log(run.id, run.status); // 'PENDING'
run = client.experiment_runs.create({
"experimentId": "experiment-id",
"name": "run-2026-04-01",
})
print(run)
import com.browserstack.aisdk.eval.model.CreateExperimentRunRequest;
import com.browserstack.aisdk.eval.model.ExperimentRunResponse;
ExperimentRunResponse run = experimentRuns.create(
CreateExperimentRunRequest.builder()
.experimentId(experiment.getId())
.build()
);
System.out.println("Run ID: " + run.getId());
Poll for Completion
const result = await experimentRuns.subscribe(
run.id,
120_000, // timeout in ms
5_000 // poll interval in ms
);
if (result.finalStatus === 'COMPLETED') {
console.log('Run completed:', result.experimentRunData);
} else {
console.error('Run failed or timed out:', result.finalStatus);
}
// Default: polls every 5s, times out after 5 minutes
ExperimentRunResponse finalRun = experimentRuns.subscribe(
run.getId(),
update -> System.out.println("Status: " + update.getStatus())
);
System.out.println("Final status: " + finalRun.getStatus());
With a custom timeout:
long tenMinutes = 10 * 60 * 1_000L;
ExperimentRunResponse finalRun = experimentRuns.subscribe(
run.getId(),
update -> System.out.println("[" + update.getStatus() + "] " + update.getProgress()),
tenMinutes
);
Run Status Values
enum ExperimentRunStatus {
PENDING = 'PENDING',
RUNNING = 'RUNNING',
COMPLETED = 'COMPLETED',
FAILED = 'FAILED',
CANCELLED = 'CANCELLED',
}
Complete End-to-End Example
import { AISDK, ExperimentRunStatus } from '@browserstack/ai-sdk';
const testOps = new AISDK({
publicKey: process.env.AISDK_PUBLIC_KEY,
secretKey: process.env.AISDK_SECRET_KEY,
});
/**
 * End-to-end example: creates an experiment, starts a run for it, and waits
 * for that run to finish.
 *
 * Reads PROMPT_ID, DATASET_ID and EVALUATOR_LIST_ID from the environment.
 * If the run does not end in 'COMPLETED', the process exit code is set to 1.
 * The SDK client is shut down in every case, including when the run fails or
 * one of the SDK calls throws.
 */
async function runExperiment() {
  const experiments = testOps.experiments;
  const experimentRuns = testOps.experimentRuns;

  try {
    // 1. Create the experiment
    const experiment = await experiments.create({
      name: `accuracy-test-${Date.now()}`,
      description: 'Automated accuracy evaluation',
      promptId: process.env.PROMPT_ID!,
      datasetId: process.env.DATASET_ID!,
      evaluatorListId: process.env.EVALUATOR_LIST_ID!,
      concurrency: 5,
    });
    console.log('Created experiment:', experiment.id);

    // 2. Start a run (experimentId, llmColumnName, concurrency)
    const run = await experimentRuns.create(experiment.id, 'output', 5);
    console.log('Started run:', run.id, 'status:', run.status);

    // 3. Wait for completion (timeout in ms, poll interval in ms)
    const result = await experimentRuns.subscribe(run.id, 300_000, 10_000);
    if (result.finalStatus === 'COMPLETED') {
      console.log('Experiment completed successfully!');
      console.log('Run details:', result.experimentRunData);
    } else {
      console.error('Experiment did not complete:', result.finalStatus);
      // Set the exit code instead of calling process.exit(1) so that the
      // shutdown in `finally` still runs before the process ends.
      process.exitCode = 1;
    }
  } finally {
    await testOps.shutdown();
  }
}
runExperiment();
import os
from browserstack_ai_sdk import AISDK
client = AISDK(
public_key=os.environ["AISDK_PUBLIC_KEY"],
secret_key=os.environ["AISDK_SECRET_KEY"],
)
# 1. Create experiment
experiment = client.experiments.create({
"name": "support-qa-eval",
"evaluatorListId": "<evaluator-list-id>",
"datasetId": "<dataset-id>",
"promptId": "<prompt-id>",
})
# 2. Create a run
run = client.experiment_runs.create({
"experimentId": experiment["id"],
"name": "initial-run",
})
print(f"Experiment run created: {run['id']}")
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.model.*;
import java.util.List;
/**
 * End-to-end example: creates an experiment, starts a run for it, waits for
 * the run to finish, and prints the final status.
 */
public class ExperimentExample {
    /**
     * Runs the example using an SDK client configured from the environment.
     * The client is shut down even when one of the SDK calls throws.
     *
     * @param args unused
     * @throws InterruptedException declared by the original example
     *         (presumably raised while waiting on the run; confirm against the SDK)
     */
    public static void main(String[] args) throws InterruptedException {
        TestOps sdk = TestOps.fromEnv();
        try {
            // 1. Create an experiment
            ExperimentResponse experiment = sdk.experiments().create(
                CreateExperimentRequest.builder()
                    .name("aurora-qa-v1")
                    .evaluatorListId("evl_abc123")
                    .promptId("prm_xyz789")
                    .datasetId("aurora-qa")
                    .build()
            );

            // 2. Start a run
            ExperimentRunResponse run = sdk.experimentRuns().create(
                CreateExperimentRunRequest.builder()
                    .experimentId(experiment.getId())
                    .build()
            );

            // 3. Wait for completion; the callback prints each status update
            ExperimentRunResponse result = sdk.experimentRuns().subscribe(
                run.getId(),
                r -> System.out.printf("[%s] %s%n", r.getStatus(), r.getId())
            );
            System.out.println("Done: " + result.getStatus());
        } finally {
            // Release the client even if create() or subscribe() failed.
            sdk.shutdown();
        }
    }
}