Skip to content

Add basic evals setup #48

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Runs the eval suite (`pnpm eval`) on every push.
name: Evals
on:
  push:

jobs:
  test:
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        node-version: [22]
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        uses: pnpm/action-setup@v4
        with:
          version: 10.8.0
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'pnpm'
      - name: Create .dev.vars file
        # Hand the secret to the shell through an environment variable instead
        # of expanding `${{ secrets.* }}` inside the script text: a value that
        # contains quotes, `$` or backticks would otherwise be interpreted by
        # the shell.
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          printf 'OPENAI_API_KEY=%s\n' "$OPENAI_API_KEY" > ./apps/sandbox-container/.dev.vars
      - name: Install dependencies
        run: pnpm install
      - name: Run evals
        run: pnpm eval
3 changes: 3 additions & 0 deletions apps/sandbox-container/evals/env.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Ambient augmentation of the 'cloudflare:test' module: makes its
// `ProvidedEnv` interface inherit every member of the `Env` type.
// NOTE(review): `Env` is not declared in this file — presumably the worker's
// generated/global Env type; confirm it is in scope for the evals tsconfig.
declare module 'cloudflare:test' {
// Intentionally empty body: all members come from `Env`.
interface ProvidedEnv extends Env {}
}
25 changes: 25 additions & 0 deletions apps/sandbox-container/evals/initialize.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { MCPClientManager } from 'agents/mcp/client'
import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
import { describeEval } from 'vitest-evals'

import { checkFactuality } from '@repo/eval-tools/src/scorers'
import { eachModel } from '@repo/eval-tools/src/test-models'

import { runTask } from './utils'

// For each model handed over by `eachModel`, register one eval that asks the
// model to create and ping a container, then scores the transcript returned by
// `runTask` with the factuality scorer.
eachModel('$modelName', ({ model }) => {
	// Produces the eval cases; a fresh array is built on every call.
	const loadCases = async () => [
		{
			input: 'create and ping a container',
			expected:
				'The container_initialize tool was called and then the container_ping tool was called',
		},
	]

	// Runs the prompt against the model and resolves to the transcript string.
	const executeTask = (input: string) => runTask(model, input)

	describeEval('Runs container initialize', {
		data: loadCases,
		task: executeTask,
		scorers: [checkFactuality],
		threshold: 1,
	})
})
58 changes: 58 additions & 0 deletions apps/sandbox-container/evals/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
import { MCPClientManager } from 'agents/mcp/client'
import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'

import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'

/**
 * Runs `input` as a prompt against `model`, exposing every tool listed by the
 * MCP server at `http://localhost:8787/sse`, and returns a textual transcript
 * of the response messages for a scorer to inspect.
 *
 * The transcript is a concatenation of `<message_content>` elements: one per
 * text part and one per tool call (with the tool name and JSON-encoded
 * arguments). Other part types are omitted.
 *
 * @param model - Language model used for generation.
 * @param input - User prompt passed to the model.
 * @returns The concatenated transcript; an empty string if the response
 *   contains no text or tool-call parts.
 */
export async function runTask(model: LanguageModelV1, input: string) {
	const clientManager = new MCPClientManager('test-client', '0.0.0')
	await clientManager.connect('http://localhost:8787/sse')

	// Wrap each MCP tool so the model can invoke it through the client manager.
	const toolSet: ToolSet = {}
	for (const mcpTool of clientManager.listTools()) {
		toolSet[mcpTool.name] = tool({
			parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
			description: mcpTool.description,
			execute: async (args, opts) => {
				const res = await clientManager.callTool(mcpTool, args, { signal: opts.abortSignal })
				console.log(res.toolResult)
				return res.content
			},
		})
	}

	const res = streamText({
		model,
		system:
			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
		tools: toolSet,
		prompt: input,
		maxRetries: 1,
		maxSteps: 10,
	})

	// Drain the stream so the run finishes before `res.response` is awaited;
	// the individual parts are not needed here.
	for await (const _part of res.fullStream) {
		// intentionally empty
	}

	// convert into an LLM readable result so our factuality checker can validate tool calls
	let messagesWithTools = ''
	const messages = (await res.response).messages
	for (const message of messages) {
		console.log(message.content)
		// `content` may be a plain string; iterating a string directly would walk
		// it character by character and emit one element per character, so treat
		// it as a single text part instead.
		const parts = typeof message.content === 'string' ? [message.content] : message.content
		for (const messagePart of parts) {
			if (typeof messagePart === 'string') {
				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
			} else if (messagePart.type === 'tool-call') {
				// Attribute values are quoted consistently with the plain-string branch.
				messagesWithTools += [
					`<message_content type="${messagePart.type}">`,
					`<tool_name>${messagePart.toolName}</tool_name>`,
					`<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>`,
					`</message_content>`,
				].join('\n')
			} else if (messagePart.type === 'text') {
				messagesWithTools += `<message_content type="${messagePart.type}">${messagePart.text}</message_content>`
			}
		}
	}

	return messagesWithTools
}
17 changes: 12 additions & 5 deletions apps/sandbox-container/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,35 @@
"start:container": "tsx container/index.ts",
"postinstall": "mkdir -p workdir",
"test": "vitest",
"types": "wrangler types"
"types": "wrangler types",
"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
"eval": "concurrently \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
},
"dependencies": {
"@cloudflare/workers-oauth-provider": "0.0.2",
"@cloudflare/workers-types": "^4.20250320.0",
"@hono/node-server": "^1.13.8",
"@hono/zod-validator": "^0.4.3",
"@modelcontextprotocol/sdk": "^1.7.0",
"@modelcontextprotocol/sdk": "^1.9.0",
"@n8n/json-schema-to-zod": "^1.1.0",
"@repo/eval-tools": "workspace:*",
"@repo/mcp-common": "workspace:*",
"@types/node": "^22.13.10",
"agents": "^0.0.42",
"agents": "^0.0.60",
"cron-schedule": "^5.0.4",
"esbuild": "^0.25.1",
"hono": "^4.7.5",
"mime": "^4.0.6",
"octokit": "^4.1.2",
"partyserver": "^0.0.65",
"simple-git-hooks": "^2.12.1",
"tsx": "^4.19.3",
"vitest-evals": "^0.1.4",
"workers-mcp": "0.1.0-3",
"zod": "^3.24.2",
"@repo/mcp-common": "workspace:*"
"zod": "^3.24.2"
},
"devDependencies": {
"ai": "^4.3.6",
"concurrently": "^9.1.2",
"wrangler": "^4.9.1"
}
Expand Down
2 changes: 2 additions & 0 deletions apps/sandbox-container/server/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ export type Env = {
CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
ENVIRONMENT: 'dev' | 'prod'
CLOUDFLARE_CLIENT_ID: string
CLOUDFLARE_CLIENT_SECRET: string
}

// Context from the auth process, encrypted & stored in the auth token
Expand Down
17 changes: 3 additions & 14 deletions apps/sandbox-container/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,16 +1,5 @@
{
"compilerOptions": {
"target": "ESNext",
"lib": ["ESNext", "DOM"],
"jsx": "react-jsx",
"module": "ESNext",
"moduleResolution": "bundler",
"types": ["./worker-configuration.d.ts", "@cloudflare/workers-types/2023-07-01"],
"noEmit": true,
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"strict": true,
"skipLibCheck": true
},
"include": ["server/**.ts", "shared/**.ts"]
"extends": "@repo/typescript-config/workers.json",
"include": ["*/**.ts", "./vitest.config.evals.ts"],
"exclude": ["container/**.ts"]
}
13 changes: 13 additions & 0 deletions apps/sandbox-container/vitest.config.evals.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'

// Glob selecting the eval files: `*.eval.` followed by a js/ts extension,
// optionally prefixed with `c`/`m` and optionally suffixed with `x`.
const evalFileGlobs = ['**/*.eval.?(c|m)[jt]s?(x)']

// Options for the workers pool: isolated storage enabled, wrangler config
// read from `./wrangler.jsonc`.
const workersPoolOptions = {
	isolatedStorage: true,
	wrangler: { configPath: './wrangler.jsonc' },
}

// Vitest configuration used when running the evals.
export default defineWorkersConfig({
	test: {
		include: evalFileGlobs,
		poolOptions: { workers: workersPoolOptions },
	},
})
Loading
Loading