cloudflare · deloreyj · Apr 17, 2025 · Apr 16, 2025 · Apr 16, 2025 · Apr 16, 2025
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
@@ -0,0 +1,28 @@
+name: Evals
+on:
+  push:
+
+# The job only reads the repository, so keep the GITHUB_TOKEN read-only.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-24.04
+    strategy:
+      matrix:
+        node-version: [22]
+    steps:
+      - uses: actions/checkout@v4
+      # pnpm is installed before setup-node because the `cache: 'pnpm'` option below relies on it.
+      - name: Install pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 10.8.0
+      - name: Use Node.js ${{ matrix.node-version }}
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: 'pnpm'
+      # Hand the secret to the shell through the environment instead of interpolating it into
+      # the script text, so quotes or `$` in the value cannot alter the command that runs.
+      - name: Create .dev.vars file
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          printf 'OPENAI_API_KEY=%s\n' "$OPENAI_API_KEY" > ./apps/sandbox-container/.dev.vars
+      - name: Install dependencies
+        run: pnpm install
+      - name: Run evals
+        run: pnpm eval
diff --git a/apps/sandbox-container/evals/env.d.ts b/apps/sandbox-container/evals/env.d.ts
@@ -0,0 +1,3 @@
+declare module 'cloudflare:test' { // type declarations for the `cloudflare:test` module specifier
+	interface ProvidedEnv extends Env {} // ProvidedEnv takes on every member of the global `Env` type (not imported here; presumably the one generated by `wrangler types` — confirm)
+}
diff --git a/apps/sandbox-container/evals/initialize.eval.ts b/apps/sandbox-container/evals/initialize.eval.ts
@@ -0,0 +1,25 @@
+import { MCPClientManager } from 'agents/mcp/client'
+import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { runTask } from './utils'
+
+// Registers the "Runs container initialize" eval inside the callback given to eachModel,
+// which supplies the model under test.
+eachModel('$modelName', ({ model }) => {
+	const expected =
+		'The container_initialize tool was called and then the container_ping tool was called'
+
+	describeEval('Runs container initialize', {
+		// One scenario: the prompt plus the tool-call sequence the scorer should find.
+		data: async () => [{ input: 'create and ping a container', expected }],
+		// runTask's transcript of the run is the output that gets scored.
+		task: (input) => runTask(model, input),
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})
diff --git a/apps/sandbox-container/evals/utils.ts b/apps/sandbox-container/evals/utils.ts
@@ -0,0 +1,58 @@
+import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
+import { MCPClientManager } from 'agents/mcp/client'
+import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
+
+import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
+
+/**
+ * Runs `input` against `model` with the tools of the MCP server at
+ * http://localhost:8787/sse and returns a transcript of the run.
+ *
+ * @param model - language model handed to `streamText`
+ * @param input - user prompt for the run
+ * @returns the response messages' text and tool calls, concatenated as
+ *   `<message_content>` blocks so the factuality scorer can read them
+ */
+export async function runTask(model: LanguageModelV1, input: string): Promise<string> {
+	// NOTE(review): the connection is never closed here — confirm whether MCPClientManager
+	// offers a close method that should run once the transcript has been built.
+	const clientManager = new MCPClientManager('test-client', '0.0.0')
+	await clientManager.connect('http://localhost:8787/sse')
+
+	// Wrap every tool the server lists as an AI SDK tool that forwards the call over MCP.
+	const toolSet: ToolSet = {}
+	for (const mcpTool of clientManager.listTools()) {
+		toolSet[mcpTool.name] = tool({
+			parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
+			description: mcpTool.description,
+			execute: async (args, opts) => {
+				// NOTE(review): `args` is passed as the second positional argument — confirm
+				// against MCPClientManager.callTool's signature that tool arguments belong there.
+				const res = await clientManager.callTool(mcpTool, args, { signal: opts.abortSignal })
+				return res.content
+			},
+		})
+	}
+
+	const res = streamText({
+		model,
+		system:
+			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+		tools: toolSet,
+		prompt: input,
+		maxRetries: 1,
+		maxSteps: 10,
+	})
+
+	// Read the stream to the end before awaiting `res.response`; the parts themselves are not needed.
+	for await (const _part of res.fullStream) {
+		// intentionally empty
+	}
+
+	// convert into an LLM readable result so our factuality checker can validate tool calls
+	let messagesWithTools = ''
+	const messages = (await res.response).messages
+	for (const message of messages) {
+		// A plain-string content is a single text part; iterating the string itself would
+		// emit one block per character.
+		const parts = typeof message.content === 'string' ? [message.content] : message.content
+		for (const messagePart of parts) {
+			if (typeof messagePart === 'string') {
+				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
+			} else if (messagePart.type === 'tool-call') {
+				messagesWithTools += `<message_content type="${messagePart.type}">
+    <tool_name>${messagePart.toolName}</tool_name>
+    <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
+</message_content>`
+			} else if (messagePart.type === 'text') {
+				messagesWithTools += `<message_content type="${messagePart.type}">${messagePart.text}</message_content>`
+			}
+			// every other part type is left out of the transcript
+		}
+	}
+
+	return messagesWithTools
+}
diff --git a/apps/sandbox-container/package.json b/apps/sandbox-container/package.json
@@ -12,28 +12,35 @@
 		"start:container": "tsx container/index.ts",
 		"postinstall": "mkdir -p workdir",
 		"test": "vitest",
-		"types": "wrangler types"
+		"types": "wrangler types",
+		"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
+		"eval": "concurrently --kill-others --success first \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
 	},
 	"dependencies": {
 		"@cloudflare/workers-oauth-provider": "0.0.2",
 		"@cloudflare/workers-types": "^4.20250320.0",
 		"@hono/node-server": "^1.13.8",
 		"@hono/zod-validator": "^0.4.3",
-		"@modelcontextprotocol/sdk": "^1.7.0",
+		"@modelcontextprotocol/sdk": "^1.9.0",
+		"@n8n/json-schema-to-zod": "^1.1.0",
+		"@repo/eval-tools": "workspace:*",
+		"@repo/mcp-common": "workspace:*",
 		"@types/node": "^22.13.10",
-		"agents": "^0.0.42",
+		"agents": "^0.0.60",
 		"cron-schedule": "^5.0.4",
 		"esbuild": "^0.25.1",
 		"hono": "^4.7.5",
 		"mime": "^4.0.6",
 		"octokit": "^4.1.2",
 		"partyserver": "^0.0.65",
+		"simple-git-hooks": "^2.12.1",
 		"tsx": "^4.19.3",
+		"vitest-evals": "^0.1.4",
 		"workers-mcp": "0.1.0-3",
-		"zod": "^3.24.2",
-		"@repo/mcp-common": "workspace:*"
+		"zod": "^3.24.2"
 	},
 	"devDependencies": {
+		"ai": "^4.3.6",
 		"concurrently": "^9.1.2",
 		"wrangler": "^4.9.1"
 	}

diff --git a/apps/sandbox-container/server/index.ts b/apps/sandbox-container/server/index.ts
@@ -17,6 +17,8 @@ export type Env = {
 	CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
 	CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
 	ENVIRONMENT: 'dev' | 'prod'
+	CLOUDFLARE_CLIENT_ID: string
+	CLOUDFLARE_CLIENT_SECRET: string
 }
 
 // Context from the auth process, encrypted & stored in the auth token

diff --git a/apps/sandbox-container/tsconfig.json b/apps/sandbox-container/tsconfig.json
@@ -1,16 +1,5 @@
 {
-	"compilerOptions": {
-		"target": "ESNext",
-		"lib": ["ESNext", "DOM"],
-		"jsx": "react-jsx",
-		"module": "ESNext",
-		"moduleResolution": "bundler",
-		"types": ["./worker-configuration.d.ts", "@cloudflare/workers-types/2023-07-01"],
-		"noEmit": true,
-		"esModuleInterop": true,
-		"forceConsistentCasingInFileNames": true,
-		"strict": true,
-		"skipLibCheck": true
-	},
-	"include": ["server/**.ts", "shared/**.ts"]
+	"extends": "@repo/typescript-config/workers.json",
+	"include": ["*/**.ts", "./vitest.config.evals.ts"],
+	"exclude": ["container/**.ts"]
 }
diff --git a/apps/sandbox-container/vitest.config.evals.ts b/apps/sandbox-container/vitest.config.evals.ts
@@ -0,0 +1,13 @@
+import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
+
+// Options for the Workers test pool: isolated storage, wrangler config at ./wrangler.jsonc.
+const workers = {
+	isolatedStorage: true,
+	wrangler: { configPath: './wrangler.jsonc' },
+}
+
+// Eval-only Vitest config: the include glob limits the run to files named `*.eval.*`.
+export default defineWorkersConfig({
+	test: {
+		include: ['**/*.eval.?(c|m)[jt]s?(x)'],
+		poolOptions: { workers },
+	},
+})