BrowserStack AI Evals
Evaluation

Datasets

Create and manage evaluation datasets and dataset runs across TypeScript, Python, and Java SDKs.

Datasets

Datasets store collections of input/expected-output pairs used to evaluate LLM pipelines. Each item has an input, an optional expectedOutput, and optional context and metadata.

Setup

import { AISDK } from '@browserstack/ai-sdk';

const testOps = new AISDK({
  publicKey: process.env.AISDK_PUBLIC_KEY,
  secretKey: process.env.AISDK_SECRET_KEY,
});

const datasets = testOps.datasets;       // DatasetsClient
const datasetRuns = testOps.datasetRuns; // DatasetRuns
import os
from browserstack_ai_sdk import AISDK

client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)

# Convenience aliases for the two clients used throughout this page.
datasets = client.datasets          # datasets client
dataset_runs = client.dataset_runs  # dataset runs client
import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.DatasetsClient;
import com.browserstack.aisdk.eval.model.*;

TestOps sdk = TestOps.fromEnv();
DatasetsClient datasets = sdk.datasets();

Create a Dataset

const dataset = await datasets.create(
  'qa-golden-set',
  'Golden set for QA evaluation',
  { owner: 'ml-team', version: '1.0' } // optional metadata
);

console.log(dataset.id);   // dataset ID
console.log(dataset.name); // 'qa-golden-set'
dataset = client.datasets.create(
    name="qa-dataset-v1",
    description="Question-answer pairs for RAG evaluation",
    metadata='{"domain": "customer-support"}',
)
print(dataset)
// Name only
DatasetResponse ds = datasets.create("my-dataset");

// With description
DatasetResponse ds = datasets.create("my-dataset", "Questions about Northern Lights");

// With metadata
DatasetResponse ds = datasets.create(
    "my-dataset",
    "Questions about Northern Lights",
    Map.of("team", "nlp", "domain", "science")
);

System.out.println("Created: " + ds.getId() + " — " + ds.getName());

List Datasets

const result = await datasets.list(
  1,           // page (1-indexed)
  20,          // limit per page
  'qa'         // optional name filter
);

for (const ds of result.data) {
  console.log(ds.id, ds.name);
}
result = client.datasets.list(page=1, limit=20)

for dataset in result["data"]:
    print(f"{dataset.get('name')}: {dataset.get('id')}")

Filter by name:

# Bind the response to `result` so the `datasets` client alias from Setup is not overwritten.
result = client.datasets.list(name="qa-dataset-v1")
// Default (page 1, limit 50)
ListDatasetsResponse list = datasets.list();

// With pagination
ListDatasetsResponse list = datasets.list(1, 20);

// Filtered by name
ListDatasetsResponse list = datasets.list(1, 20, "my-dataset");

list.getData().forEach(d -> System.out.println(d.getId() + " " + d.getName()));

Create Items

Add multiple items to a dataset at once. Each item has an input, optional expectedOutput, optional context, and optional metadata.

When importing from a CSV file (see the "Import from CSV" examples below), the expected columns are: input, expectedOutput, context, metadata, id, sourceTraceId, sourceObservationId, status.

await datasets.createItems({
  datasetName: 'qa-golden-set',
  items: [
    {
      input: { question: 'What is the capital of France?' },
      expectedOutput: { answer: 'Paris' },
      metadata: { difficulty: 'easy' },
    },
    {
      input: { question: 'What is 2 + 2?' },
      expectedOutput: { answer: '4' },
    },
    {
      input: { question: 'Who wrote Hamlet?' },
      expectedOutput: { answer: 'William Shakespeare' },
      context: 'Classic English literature',
    },
  ],
});

Import from CSV:

const result = await datasets.createItems({
  datasetName: 'qa-golden-set',
  fileUrl: '/path/to/dataset.csv',
});
console.log(`Imported ${result.itemCount} items`);
result = client.datasets.create_items(
    dataset_name="qa-dataset-v1",
    items=[
        {
            "input": {"question": "What is your return policy?"},
            "expectedOutput": "Items can be returned within 30 days.",
            "metadata": {"category": "returns"},
        },
        {
            "input": {"question": "How do I track my order?"},
            "expectedOutput": "Log in and visit the Orders page.",
        },
    ],
)
print(f"Created {result['itemCount']} items")

Import from CSV:

result = client.datasets.create_items(
    dataset_name="qa-golden-set",
    file_url="/path/to/dataset.csv",
    options={"batchSize": 50},
)
print(f"Imported {result['itemCount']} items")

Unless a different batchSize is passed in options (as in the example above), items are sent in batches of 100.

import com.browserstack.aisdk.eval.model.CreateDatasetItemRequest;
import java.util.List;

List<CreateDatasetItemRequest> items = List.of(
    CreateDatasetItemRequest.builder()
        .input("What causes Northern Lights?")
        .expectedOutput("Solar wind particles interact with Earth's magnetic field...")
        .context("Reference document: Aurora Borealis — NASA")
        .build(),

    CreateDatasetItemRequest.builder()
        .input("How far is the Moon from Earth?")
        .expectedOutput("Approximately 384,400 km on average.")
        .build()
);

CreateDatasetItemsResponse result = datasets.createItems("my-dataset", items);
System.out.println("Added " + result.getItemCount() + " items");

Import from CSV:

CreateDatasetItemsResponse result = datasets.createItemsFromCsv(
    "my-dataset",
    "/path/to/test-cases.csv"
);
System.out.println("Imported " + result.getItemCount() + " items");

Dataset Runs

Dataset runs group execution results for a given dataset, linking traces to specific dataset items.

Create a Dataset Run

// Dataset to run against and the tag to attach to the run.
const datasetName = 'qa-golden-set';
const tagName = 'production-v1';

console.log('Creating dataset run...');
const datasetRun = await datasetRuns.create(datasetName, {
  name: 'My Dataset Run',
  tag: tagName,
});

console.log(`Dataset run created with ID: ${datasetRun.id}`);
console.log(`Dataset run tag ID: ${datasetRun.tags[0].id}`);
run = client.dataset_runs.create(
    dataset_name="qa-golden-set",
    name="gpt-4o-run-1",
    description="GPT-4o baseline evaluation",
    tag="production-v1",  # optional
)

print(f"Run ID: {run['id']}")
import com.browserstack.aisdk.eval.DatasetRunsClient;

DatasetRunsClient datasetRuns = sdk.datasetRuns();

DatasetRunResponse run = datasetRuns.create("my-dataset");
System.out.println("Run ID: " + run.getId());

Add Run Items

Add items to an existing dataset run.

await datasetRuns.createItems(
  'qa-golden-set',
  datasetRun.id,
  [
    {
      input: { question: 'What is AI?' },
      expectedOutput: { answer: 'AI is...' },
    },
  ]
);
client.dataset_runs.create_items(
    dataset_name="qa-golden-set",
    dataset_run_id=run["id"],
    items=[
        {
            "input": {"messages": [{"role": "user", "content": "What is machine learning?"}]},
            "expectedOutput": {"should_contain": ["algorithm", "data", "patterns"]},
            "traceId": "trace-id-abc",
            "observationId": "gen-id-xyz",  # optional
        },
    ],
)
import com.browserstack.aisdk.eval.model.CreateDatasetRunItemRequest;
import java.util.List;
import java.util.stream.Collectors;

// Build one run item per dataset item.
// `items`, `generatedOutput`, and `trace` are placeholders for values produced by your own pipeline.
List<CreateDatasetRunItemRequest> runItems = items.stream()
    .map(item -> CreateDatasetRunItemRequest.builder()
        .datasetItemId(item.getId())
        .output(generatedOutput)
        .traceId(trace.getId())
        .build())
    .collect(Collectors.toList());

CreateDatasetRunItemsResponse resp = datasetRuns.createItems(run.getId(), runItems);

Complete Example

import { AISDK } from '@browserstack/ai-sdk';
import OpenAI from 'openai';

const testOps = new AISDK({
  publicKey: process.env.AISDK_PUBLIC_KEY,
  secretKey: process.env.AISDK_SECRET_KEY,
});
const openai = new OpenAI();

/**
 * End-to-end evaluation example: creates the `trivia-set` dataset, opens a
 * dataset run, answers each question with the OpenAI chat completions API,
 * and records one run item per question.
 *
 * Uses the module-level `testOps` and `openai` clients created above.
 *
 * @returns {Promise<void>} Resolves after all run items have been created and
 *   `testOps.shutdown()` has completed.
 */
async function runEvaluation() {
  const datasets = testOps.datasets;
  const datasetRuns = testOps.datasetRuns;
  const datasetName = 'trivia-set';

  // Single source of truth for the test cases, so the dataset items and the
  // run items below cannot drift apart.
  const items = [
    { q: 'Capital of Japan?', expected: 'Tokyo' },
    { q: 'Speed of light?', expected: '299,792,458 m/s' },
  ];

  try {
    // 1. Create dataset with test cases
    await datasets.create(datasetName, 'Trivia QA dataset');
    await datasets.createItems({
      datasetName,
      items: items.map(({ q, expected }) => ({
        input: { q },
        expectedOutput: { a: expected },
      })),
    });

    // 2. Create a run to track results
    const run = await datasetRuns.create(datasetName, {
      name: `eval-${Date.now()}`,
    });

    // 3. Run pipeline for each item and record a run item
    for (const { q, expected } of items) {
      // Trace only the question; the expected answer is not model input.
      const trace = testOps.trace({ name: 'trivia-answer', input: { q } });
      const generation = trace.generation({
        name: 'answer',
        model: 'gpt-4o',
        input: [{ role: 'user', content: q }],
      });

      const response = await openai.chat.completions.create({
        model: 'gpt-4o',
        messages: [{ role: 'user', content: q }],
      });
      const answer = response.choices[0].message.content;

      generation.end({ output: answer });
      trace.update({ output: answer });

      // expectedOutput is the known-good answer, not the model's response,
      // and the keys match the dataset items created in step 1.
      // NOTE(review): to link this run item to the trace, pass the trace ID
      // here as well (the Python run-item example uses "traceId") — confirm
      // the field name for this SDK.
      await datasetRuns.createItems(datasetName, run.id, [
        {
          input: { q },
          expectedOutput: { a: expected },
        },
      ]);
    }

    console.log('Evaluation complete. Run ID:', run.id);
  } finally {
    // Shut the SDK down even when a step above throws.
    await testOps.shutdown();
  }
}

// Surface failures instead of leaving the promise rejection unhandled.
runEvaluation().catch((err) => {
  console.error('Evaluation failed:', err);
  process.exitCode = 1;
});
import os
import openai
from browserstack_ai_sdk import AISDK

client = AISDK(
    public_key=os.environ["AISDK_PUBLIC_KEY"],
    secret_key=os.environ["AISDK_SECRET_KEY"],
)
openai_client = openai.OpenAI()

# 1. Create dataset with test cases
client.datasets.create(name="trivia-set", description="Trivia QA dataset")
client.datasets.create_items(
    dataset_name="trivia-set",
    items=[
        {"input": {"q": "Capital of Japan?"}, "expectedOutput": {"a": "Tokyo"}},
        {"input": {"q": "Speed of light?"}, "expectedOutput": {"a": "299,792,458 m/s"}},
    ],
)

# 2. Create a run to track results
run = client.dataset_runs.create(
    dataset_name="trivia-set",
    name="gpt-4o-mini-eval-run",  # named after the model called in step 3
)

# 3. Run pipeline for each item and record a run item
items = [
    {"q": "Capital of Japan?", "expected": "Tokyo"},
    {"q": "Speed of light?", "expected": "299,792,458 m/s"},
]

try:
    for item in items:
        question = {"q": item["q"]}

        # Trace only the question; the expected answer is not model input.
        trace = client.trace(name="trivia-answer", input=question)
        generation = trace.start_generation(
            name="answer",
            model="gpt-4o-mini",
            prompt=[{"role": "user", "content": item["q"]}],
        )

        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": item["q"]}],
        )

        answer = response.choices[0].message.content
        generation.update(output={"response": answer})
        generation.end()
        trace.update(output={"answer": answer})

        # NOTE(review): to link this run item to the trace, also pass "traceId"
        # (as in the run-item example earlier on this page) — confirm how the
        # trace ID is exposed on `trace`.
        client.dataset_runs.create_items(
            dataset_name="trivia-set",
            dataset_run_id=run["id"],
            items=[{
                "input": question,
                "expectedOutput": {"a": item["expected"]},
            }],
        )
finally:
    # Flush even when an item fails part-way through the loop.
    client.flush()

print(f"Evaluation complete. Run ID: {run['id']}")