Evaluation
Evaluators
Create evaluator lists and run inline evaluations across TypeScript, Python, and Java SDKs.
Evaluators
Evaluator lists group one or more evaluation metrics that can be attached to experiments or run inline against LLM outputs.
Setup
import { AISDK } from '@browserstack/ai-sdk';
const testOps = new AISDK({
publicKey: process.env.AISDK_PUBLIC_KEY,
secretKey: process.env.AISDK_SECRET_KEY,
});
const evalsList = testOps.evalsList; // EvaluatorLists
const evals = testOps.evals; // EvaluationExecution

import os
from browserstack_ai_sdk import AISDK
client = AISDK(
public_key=os.environ["AISDK_PUBLIC_KEY"],
secret_key=os.environ["AISDK_SECRET_KEY"],
)

import com.browserstack.aisdk.TestOps;
import com.browserstack.aisdk.eval.EvaluatorListsClient;
import com.browserstack.aisdk.eval.model.*;
TestOps sdk = TestOps.fromEnv();
EvaluatorListsClient evaluatorLists = sdk.evaluatorLists();

Create an Evaluator List
import { EvaluatorParamDataType } from '@browserstack/ai-sdk';
const list = await evalsList.create({
name: 'rag-quality-metrics',
description: 'Evaluators for RAG pipeline quality',
evaluators: [
{
evaluatorId: 'faithfulness-evaluator-id',
params: [
{ key: 'threshold', value: '0.8', dataType: EvaluatorParamDataType.FLOAT },
],
},
{
evaluatorId: 'relevance-evaluator-id',
params: [
{ key: 'model', value: 'gpt-4o', dataType: EvaluatorParamDataType.STRING },
],
},
],
});
console.log(list.id, list.name);

Parameter data types:
enum EvaluatorParamDataType {
STRING = 'string',
INTEGER = 'integer',
FLOAT = 'float',
BOOLEAN = 'boolean',
STRING_ARRAY = 'string[]',
INTEGER_ARRAY = 'integer[]',
FLOAT_ARRAY = 'float[]',
BOOLEAN_ARRAY = 'boolean[]',
OBJECT = 'object',
}

result = client.evaluator_lists.create({
"name": "qa-evaluators",
"evaluators": [
{
"evaluatorId": "correctness-evaluator-id",
"params": [
{"key": "threshold", "dataType": "float"},
{"key": "strict", "dataType": "boolean"},
],
},
{
"evaluatorId": "faithfulness-evaluator-id",
"params": [
{"key": "context_key", "dataType": "string"},
],
},
],
})
print(result)

Supported dataType values: string, integer, float, boolean, string[], integer[], float[], boolean[], list, dict
import com.browserstack.aisdk.eval.model.CreateEvaluatorListRequest;
import com.browserstack.aisdk.eval.model.CreateEvaluatorListRequest.EvaluatorConfig;
import java.util.List;
import java.util.Map;
CreateEvaluatorListRequest request = CreateEvaluatorListRequest.builder()
.name("rag-quality-suite")
.evaluators(List.of(
EvaluatorConfig.builder()
.evaluatorId("faithfulness")
.build(),
EvaluatorConfig.builder()
.evaluatorId("answer-relevancy")
.build(),
EvaluatorConfig.builder()
.evaluatorId("llm-judge")
.params(List.of(
Map.of(
"key", "rubric",
"value", "Score 1 if the answer is factually correct, 0 otherwise.",
"dataType", "STRING"
)
))
.build()
))
.build();
EvaluatorListResponse result = evaluatorLists.create(request);
System.out.println("Created evaluator list: " + result.getId());

List and Get Evaluator Lists
const result = await evalsList.list(
20, // limit (1-100)
1, // page
{ column: 'createdAt', order: 'DESC' } // optional sort
);
for (const list of result.evaluators) {
console.log(list.id, list.name, list.evaluatorConfigs.length, 'evaluators');
}
const list = await evalsList.get('evaluator-list-id-abc');
console.log(list.name);

result = client.evaluator_lists.list(page=1, limit=50)
for ev_list in result.get("data", []):
print(ev_list["name"], ev_list["id"])
ev_list = client.evaluator_lists.get("evaluator-list-id")
print(ev_list)

ListEvaluatorListsResponse list = evaluatorLists.list(20, 1);
list.getData().forEach(el ->
System.out.println(el.getId() + " — " + el.getName())
);
EvaluatorListResponse el = evaluatorLists.find("evl_abc123");
System.out.println("Name: " + el.getName());

Inline Evaluation
Run evaluations directly against LLM outputs without creating an experiment — useful for spot-checking or custom pipelines.
const result = await evals.evaluate({
evaluators: [
{
metricName: 'faithfulness',
displayName: 'Faithfulness',
params: { threshold: 0.8 },
},
{
metricName: 'answer_relevance',
displayName: 'Answer Relevance',
},
],
provider: 'openai',
model: 'gpt-4o',
data: {
input: 'What is the boiling point of water?',
output: 'Water boils at 100°C (212°F) at sea level.',
expectedOutput: '100 degrees Celsius',
context: ['Water boiling point reference: 100°C at 1 atm pressure.'],
},
concurrency: 2,
});
if (result.success) {
for (const r of result.results) {
console.log(r.evaluator, '→ score:', r.score, '| reasoning:', r.reasoning);
}
} else {
console.error('Evaluation failed:', result.error);
}

import com.browserstack.aisdk.eval.EvaluationExecutionClient;
import com.browserstack.aisdk.eval.model.*;
import java.util.List;
import java.util.Map;
EvaluationExecutionClient evalExec = sdk.evaluationExecution();
EvaluationExecutionRequest request = EvaluationExecutionRequest.builder()
.evaluators(List.of(
EvaluationExecutionRequest.EvaluatorSpec.builder()
.metricName("faithfulness")
.build(),
EvaluationExecutionRequest.EvaluatorSpec.builder()
.metricName("answer-relevancy")
.build()
))
.data(Map.of(
"input", "What causes Northern Lights?",
"output", "Solar wind particles collide with atmospheric gases.",
"expectedOutput", "Solar wind particles interact with Earth's magnetic field.",
"context", "Aurora Borealis occur when charged particles from the Sun..."
))
.build();
EvaluationExecutionResponse result = evalExec.evaluate(request);
System.out.println("Job ID: " + result.getJobId());
result.getResults().forEach(r ->
System.out.printf(" %s: %.2f%n", r.getMetricName(), r.getScore())
);

evaluate() is synchronous and blocks until results are returned, which can take up to 5 minutes. For batch evaluation, use experiments instead.