Model Experiments Kit
Compare model performance across tasks with controlled evaluation frameworks.
Configuration
Model Experiments Dashboard
Models
0/4
Tasks
0
Leader
-
#1 GPT-4o (OpenAI)
Acc: -- | $0.025/1k
#2 Claude 3.5 Sonnet (Anthropic)
Acc: -- | $0.02/1k
#3 Llama 3.1 405B (Meta)
Acc: -- | $0.01/1k
#4 Gemini 1.5 Pro (Google)
Acc: -- | $0.015/1k
Integration Code
import { createModelExperiment } from 'agent-tools-kit/experimentation'
// Example usage: configure a model experiment, run a comparison across
// three models, then print the ranking and the best-value entry.

// Options passed verbatim to createModelExperiment; their exact semantics
// are defined by the kit, not by this snippet.
const experimentOptions = {
  mode: 'head-to-head',
  blindEval: true,
  metrics: ['accuracy', 'latency', 'cost'],
  tasksPerModel: 24,
}

// Models handed to compare(), each identified by name and provider.
const candidates = [
  { name: 'gpt-4o', provider: 'openai' },
  { name: 'claude-3.5-sonnet', provider: 'anthropic' },
  { name: 'llama-3.1-405b', provider: 'meta' },
]

const exp = createModelExperiment(experimentOptions)

// Top-level await: this snippet has to run as an ES module.
const results = await exp.compare(candidates)

console.log('Rankings:')
// NOTE(review): presumably results.ranked is ordered best-first, so that
// position + 1 is the rank — confirm against the kit's documentation.
results.ranked.forEach(({ model, accuracy, latency }, position) => {
  console.log(` #${position + 1} ${model}: ${accuracy}% (${latency}ms)`)
})

// Best cost/performance tradeoff
console.log('Best value:', results.bestValue)