BrowserStack AI Evals
Evaluation › Evaluators

Inline Evaluation

Run evaluations directly against LLM outputs without creating an experiment, using the TypeScript and Java SDKs.

Inline Evaluation

Run evaluations directly against LLM outputs without creating an experiment — useful for spot-checking or custom pipelines.

Evaluate

// TypeScript SDK: score a single input/output pair inline — no experiment is created.
const result = await evals.evaluate({
  // One entry per metric to run against `data`.
  evaluators: [
    { metricName: 'faithfulness', displayName: 'Faithfulness', params: { threshold: 0.8 } },
    { metricName: 'answer_relevance', displayName: 'Answer Relevance' },
  ],
  provider: 'openai',
  model: 'gpt-4o',
  // The record under evaluation: the prompt, the LLM's answer, a reference answer, and supporting context.
  data: {
    input: 'What is the boiling point of water?',
    output: 'Water boils at 100°C (212°F) at sea level.',
    expectedOutput: '100 degrees Celsius',
    context: ['Water boiling point reference: 100°C at 1 atm pressure.'],
  },
  concurrency: 2,
});

// Handle the failure case first; otherwise print one line per evaluator result.
if (!result.success) {
  console.error('Evaluation failed:', result.error);
} else {
  for (const { evaluator, score, reasoning } of result.results) {
    console.log(evaluator, '→ score:', score, '| reasoning:', reasoning);
  }
}
import com.browserstack.aisdk.eval.EvaluationExecutionClient;
import com.browserstack.aisdk.eval.model.*;
import java.util.List;
import java.util.Map;

// Java SDK: obtain the evaluation client from an existing `sdk` instance.
EvaluationExecutionClient evalExec = sdk.evaluationExecution();

// One EvaluatorSpec per metric to run.
EvaluationExecutionRequest.EvaluatorSpec faithfulness =
    EvaluationExecutionRequest.EvaluatorSpec.builder()
        .metricName("faithfulness")
        .build();

// NOTE(review): the TypeScript samples name this metric 'answer_relevance' —
// confirm which identifier the service actually expects.
EvaluationExecutionRequest.EvaluatorSpec answerRelevancy =
    EvaluationExecutionRequest.EvaluatorSpec.builder()
        .metricName("answer-relevancy")
        .build();

// Assemble the request: the metrics plus the single record to evaluate.
EvaluationExecutionRequest request = EvaluationExecutionRequest.builder()
    .evaluators(List.of(faithfulness, answerRelevancy))
    .data(Map.of(
        "input",          "What causes Northern Lights?",
        "output",         "Solar wind particles collide with atmospheric gases.",
        "expectedOutput", "Solar wind particles interact with Earth's magnetic field.",
        "context",        "Aurora Borealis occur when charged particles from the Sun..."
    ))
    .build();

EvaluationExecutionResponse result = evalExec.evaluate(request);

// Print the job id, then each metric's score formatted to two decimals.
System.out.println("Job ID: " + result.getJobId());
result.getResults().forEach(metricResult ->
    System.out.printf("  %s: %.2f%n", metricResult.getMetricName(), metricResult.getScore())
);

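evaluate() does not return results until the evaluation has finished — the Java call blocks, and the TypeScript promise resolves only once results are available — which can take up to 5 minutes. For batch evaluation, use experiments instead.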

Complete Example

import { AISDK } from '@browserstack/ai-sdk';
import OpenAI from 'openai';

// Read the SDK credentials from the environment rather than hard-coding them.
const { AISDK_PUBLIC_KEY: publicKey, AISDK_SECRET_KEY: secretKey } = process.env;

// Shared clients used by the example below.
const testOps = new AISDK({ publicKey, secretKey });
const openai = new OpenAI();

/**
 * Asks OpenAI to answer `question` using `context`, then runs the
 * `faithfulness` and `answer_relevance` evaluators inline on that answer.
 *
 * Prints the answer, followed by one line per evaluator result, or the
 * reported error when the evaluation does not succeed. The SDK client is shut
 * down before the function settles, whether it completes or throws.
 *
 * @param question - The user question sent to the model and used as the evaluation input.
 * @param context - Reference text placed in the system prompt and passed to the evaluators.
 */
async function evaluateAnswer(question: string, context: string): Promise<void> {
  const evals = testOps.evals;

  try {
    const response = await openai.chat.completions.create({
      model: 'gpt-4o',
      messages: [
        { role: 'system', content: `Answer based on: ${context}` },
        { role: 'user', content: question },
      ],
    });

    // `content` can be null; fall back to an empty string so `output` is always a string.
    const answer = response.choices[0].message.content ?? '';

    const result = await evals.evaluate({
      evaluators: [
        { metricName: 'faithfulness' },
        { metricName: 'answer_relevance' },
      ],
      provider: 'openai',
      model: 'gpt-4o',
      data: {
        input: question,
        output: answer,
        context: [context],
      },
      concurrency: 2,
    });

    console.log('Answer:', answer);

    // Only read `results` after confirming success; on failure report the error instead.
    if (!result.success) {
      console.error('Evaluation failed:', result.error);
      return;
    }

    for (const r of result.results) {
      console.log(`  ${r.evaluator}: ${r.score} (${r.reasoning})`);
    }
  } finally {
    // Runs on every path (success, failed evaluation, or thrown error).
    await testOps.shutdown();
  }
}

// Run the example. Handle rejection explicitly so a failure is logged and
// reflected in the exit code instead of surfacing as an unhandled rejection.
evaluateAnswer(
  'What is the boiling point of water?',
  'Water boils at 100°C (212°F) at standard atmospheric pressure.'
).catch((error: unknown) => {
  console.error('evaluateAnswer failed:', error);
  process.exitCode = 1;
});