cloudflare · deloreyj · Apr 30, 2025 · Apr 29, 2025 · Apr 30, 2025 · Apr 30, 2025
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
@@ -22,9 +22,12 @@ jobs:
       - name: Create .dev.vars file
         run: |
           echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/sandbox-container/.dev.vars
+          echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/workers-bindings/.dev.vars
+          echo "DEV_CLOUDFLARE_API_TOKEN=${{ secrets.DEV_CLOUDFLARE_API_TOKEN }}" >> ./apps/workers-bindings/.dev.vars
       - name: Verify .dev.vars file
         run: |
           du -h ./apps/sandbox-container/.dev.vars
+          du -h ./apps/workers-bindings/.dev.vars
       - name: Install dependencies
         run: pnpm install
       - name: Run evals

diff --git a/apps/demo-day/package.json b/apps/demo-day/package.json
@@ -14,6 +14,7 @@
 		"@modelcontextprotocol/sdk": "1.10.2",
 		"@repo/mcp-common": "workspace:*",
 		"@repo/mcp-observability": "workspace:*",
+		"@types/node": "22.14.1",
 		"agents": "0.0.67",
 		"zod": "3.24.2"
 	},

diff --git a/apps/sandbox-container/package.json b/apps/sandbox-container/package.json
@@ -26,6 +26,7 @@
 		"@repo/eval-tools": "workspace:*",
 		"@repo/mcp-common": "workspace:*",
 		"@repo/mcp-observability": "workspace:*",
+		"@types/node": "22.14.1",
 		"agents": "0.0.67",
 		"cron-schedule": "5.0.4",
 		"esbuild": "0.25.1",
@@ -40,7 +41,7 @@
 		"@cloudflare/vitest-pool-workers": "0.8.14",
 		"@types/mock-fs": "4.13.4",
 		"@types/node": "22.14.1",
-		"ai": "4.3.6",
+		"ai": "4.3.10",
 		"concurrently": "9.1.2",
 		"mock-fs": "5.5.0",
 		"start-server-and-test": "2.0.11",

diff --git a/apps/sandbox-container/server/index.ts b/apps/sandbox-container/server/index.ts
@@ -78,6 +78,4 @@ export default {
 			clientRegistrationEndpoint: '/register',
 		}).fetch(req, env, ctx)
 	},
-} /*
-
-*/
+}
diff --git a/apps/workers-bindings/evals/accounts.eval.ts b/apps/workers-bindings/evals/accounts.eval.ts
@@ -0,0 +1,46 @@
+import { expect } from 'vitest'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils' // MCP client setup and task runner shared by the evals
+
+// Define a mock account ID for testing
+const MOCK_ACCOUNT_ID = 'mock-account-12345'
+
+// Account tool evals: for each model, check that the prompt triggers the matching account tool.
+eachModel('$modelName', ({ model }) => {
+	describeEval('Account Tool Evaluations', {
+		data: async () => {
+			const listAccountsCase = {
+				input: 'List all my Cloudflare accounts.',
+				expected: 'The accounts_list tool should be called to retrieve the list of accounts.',
+			}
+			const setActiveAccountCase = {
+				input: `Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}.`,
+				expected: `The set_active_account tool should be called with the account ID ${MOCK_ACCOUNT_ID}.`,
+			}
+			return [listAccountsCase, setActiveAccountCase]
+		},
+		task: async (input: string) => {
+			const mcpClient = await initializeClient()
+			const outcome = await runTask(mcpClient, model, input)
+
+			// Finds the first recorded call to the named tool, if any.
+			const callTo = (name: string) => outcome.toolCalls.find(({ toolName }) => toolName === name)
+
+			const wantsList = input.includes('List all my Cloudflare accounts')
+			const wantsSetActive = input.includes(`Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}`)
+
+			if (wantsList) {
+				expect(callTo('accounts_list'), 'Tool accounts_list was not called').toBeDefined()
+			} else if (wantsSetActive) {
+				const setActiveCall = callTo('set_active_account')
+				expect(setActiveCall, 'Tool set_active_account was not called').toBeDefined()
+
+				// The call must carry the mock account ID; extra arguments are tolerated.
+				expect(setActiveCall?.args, 'Arguments for set_active_account did not match').toEqual(
+					expect.objectContaining({ activeAccountIdParam: MOCK_ACCOUNT_ID })
+				)
+			}
+
+			// The flattened transcript is what the factuality scorer grades.
+			return outcome.promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000, // 60 seconds
+	})
+})
diff --git a/apps/workers-bindings/evals/hyperdrive.eval.ts b/apps/workers-bindings/evals/hyperdrive.eval.ts
@@ -0,0 +1,40 @@
+import { expect } from 'vitest'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils' // MCP client setup and task runner shared by the evals
+
+const HYPERDRIVE_NAME = 'neon-test-hyperdrive'
+const HYPERDRIVE_DATABASE = 'neondb'
+const HYPERDRIVE_HOST = 'ep-late-cell-a4fm3g5p-pooler.us-east-1.aws.neon.tech'
+const HYPERDRIVE_PORT = 5432
+const HYPERDRIVE_USER = 'neondb_owner'
+const HYPERDRIVE_PASSWORD = 'my-test-password'
+
+// Hyperdrive tool evals: for each model, check that a "create configuration" prompt calls the create tool.
+eachModel('$modelName', ({ model }) => {
+	// Single source of truth for the tool name used in the expectation text, the lookup and the failure message.
+	// NOTE(review): the text previously said `hyperdrive_configs_create` while the lookup matched
+	// `hyperdrive_config_create`; the looked-up name is used everywhere now — confirm against the tool registration.
+	const CREATE_TOOL_NAME = 'hyperdrive_config_create'
+
+	describeEval('Hyperdrive Tool Evaluations', {
+		data: async () => [
+			{
+				input: `Create a new Hyperdrive configuration with the name "${HYPERDRIVE_NAME}" and the database "${HYPERDRIVE_DATABASE}" and the host "${HYPERDRIVE_HOST}" and the port "${HYPERDRIVE_PORT}" and the user "${HYPERDRIVE_USER}" and the password "${HYPERDRIVE_PASSWORD}".`,
+				expected: `The ${CREATE_TOOL_NAME} tool should be called to create a new hyperdrive configuration.`,
+			},
+		],
+		task: async (input: string) => {
+			const client = await initializeClient()
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
+
+			if (input.includes(`Create a new Hyperdrive configuration`)) {
+				const toolCall = toolCalls.find((call) => call.toolName === CREATE_TOOL_NAME)
+				expect(toolCall, `Tool ${CREATE_TOOL_NAME} was not called`).toBeDefined()
+			}
+
+			// The flattened transcript is what the factuality scorer grades.
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000, // 60 seconds
+	})
+})
diff --git a/apps/workers-bindings/evals/kv_namespaces.eval.ts b/apps/workers-bindings/evals/kv_namespaces.eval.ts
@@ -0,0 +1,52 @@
+import { expect } from 'vitest'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils' // MCP client setup and task runner shared by the evals
+
+// KV namespace tool evals: for each model, run create/list/rename/get/delete prompts against the server.
+eachModel('$modelName', ({ model }) => {
+	describeEval('KV Namespaces Tool Evaluations', {
+		data: async () => [
+			{
+				input: 'Create a new Cloudflare KV Namespace called "my-test-namespace".',
+				// NOTE(review): the other single-namespace tools below are named `kv_namespace_*`;
+				// confirm `kv_namespaces_create` is the registered tool name.
+				expected: 'The kv_namespaces_create tool should be called to create a new kv namespace.',
+			},
+			{
+				input: 'List all my Cloudflare KV Namespaces.',
+				expected:
+					'The kv_namespaces_list tool should be called to retrieve the list of kv namespaces. There should be at least one kv namespace in the list.',
+			},
+			{
+				input:
+					'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
+				expected: 'The kv_namespace_update tool should be called to rename the kv namespace.',
+			},
+			{
+				input: 'Get details of my Cloudflare KV Namespace called "my-new-test-namespace".',
+				expected:
+					'The kv_namespace_get tool should be called to retrieve the details of the kv namespace.',
+			},
+			{
+				input: 'Look up the id of my only KV namespace and delete it.',
+				expected: 'The kv_namespace_delete tool should be called to delete the kv namespace.',
+			},
+		],
+		task: async (input: string) => {
+			const client = await initializeClient()
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
+
+			// Only the list prompt has a hard assertion; the other prompts are graded by the scorer alone.
+			if (input.includes('List all my Cloudflare KV Namespaces')) {
+				const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespaces_list')
+				expect(toolCall, 'Tool kv_namespaces_list was not called').toBeDefined()
+			}
+
+			// The flattened transcript is what the factuality scorer grades.
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000, // 60 seconds
+	})
+})
diff --git a/apps/workers-bindings/test/types.d.ts → apps/workers-bindings/evals/types.d.ts b/apps/workers-bindings/test/types.d.ts → apps/workers-bindings/evals/types.d.ts
diff --git a/apps/workers-bindings/evals/utils.ts b/apps/workers-bindings/evals/utils.ts
@@ -0,0 +1,88 @@
+import { MCPClientManager } from 'agents/mcp/client'
+import { jsonSchema, streamText, tool } from 'ai'
+import { z } from 'zod'
+
+import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
+
+/**
+ * Creates an MCP client manager identified as 'test-client' (version '0.0.0') and connects it
+ * to the SSE endpoint at http://localhost:8977/sse.
+ *
+ * @returns the client manager once `connect` has resolved
+ */
+export async function initializeClient(): Promise<MCPClientManager> {
+	const manager = new MCPClientManager('test-client', '0.0.0')
+	await manager.connect('http://localhost:8977/sse')
+	return manager
+}
+
+/**
+ * Runs one prompt against `model`, offering it every tool listed by `clientManager`, and
+ * flattens the resulting response messages into markup for the factuality scorer.
+ *
+ * @param clientManager - MCP client manager whose tools are exposed to the model
+ * @param model - language model under evaluation
+ * @param input - user prompt
+ * @returns `promptOutput`: the response messages rendered as `<message_content>` elements;
+ *   `fullResult`: the `streamText` result; `toolCalls`: every tool-call part found in the
+ *   response messages, in order of appearance
+ */
+export async function runTask(
+	clientManager: MCPClientManager,
+	model: LanguageModelV1,
+	input: string
+): Promise<{
+	promptOutput: string
+	fullResult: StreamTextResult<ToolSet, never>
+	toolCalls: ToolCallPart[]
+}> {
+	const toolSet: ToolSet = Object.fromEntries(
+		clientManager.listTools().map((mcpTool) => {
+			// Default `properties` on a copy so the definitions held by the client manager are not mutated.
+			const inputSchema = {
+				...mcpTool.inputSchema,
+				properties: mcpTool.inputSchema.properties ?? {},
+			}
+
+			const wrapped = tool({
+				parameters: jsonSchema(inputSchema as any),
+				description: mcpTool.description,
+				execute: async (args: any, opts) => {
+					try {
+						const res = await clientManager.callTool(
+							{
+								...mcpTool,
+								arguments: { ...args },
+							},
+							z.any() as any,
+							{ signal: opts.abortSignal }
+						)
+						return res.content
+					} catch (e: unknown) {
+						console.log('Error calling tool')
+						console.log(e)
+						// Best effort: report the failure as the tool result instead of throwing.
+						// An Error instance JSON-serializes to `{}`, so pass its message explicitly.
+						return { error: e instanceof Error ? e.message : String(e) }
+					}
+				},
+			})
+
+			return [mcpTool.name, wrapped] as const
+		})
+	)
+
+	const res = streamText({
+		model,
+		system:
+			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+		tools: toolSet,
+		prompt: input,
+		maxRetries: 1,
+		maxSteps: 10,
+	})
+
+	// Drain the stream; the body is intentionally empty because only consumption matters here.
+	// NOTE(review): presumably required so `res.response` below can settle — confirm against the AI SDK docs.
+	for await (const _part of res.fullStream) {
+	}
+
+	// convert into an LLM readable result so our factuality checker can validate tool calls
+	let messagesWithTools = ''
+	const toolCalls: ToolCallPart[] = []
+	const response = await res.response
+
+	for (const message of response.messages) {
+		// String content is emitted once as a whole; iterating it would yield one element per character.
+		if (typeof message.content === 'string') {
+			messagesWithTools += `<message_content type="text">${message.content}</message_content>`
+			continue
+		}
+
+		for (const messagePart of message.content) {
+			if (messagePart.type === 'tool-call') {
+				messagesWithTools += `<message_content type=${messagePart.type}>
+    <tool_name>${messagePart.toolName}</tool_name>
+    <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
+</message_content>`
+				toolCalls.push(messagePart)
+			} else if (messagePart.type === 'text') {
+				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
+			}
+		}
+	}
+
+	return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
+}
diff --git a/apps/workers-bindings/package.json b/apps/workers-bindings/package.json
@@ -8,6 +8,9 @@
 		"deploy": "wrangler deploy",
 		"deploy:staging": "wrangler deploy --env staging",
 		"deploy:production": "wrangler deploy --env production",
+		"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
+		"eval:server": "wrangler dev --var ENVIRONMENT:test --var DEV_DISABLE_OAUTH:true --var DEV_CLOUDFLARE_EMAIL:[email protected] --inspector-port 9230",
+		"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
 		"dev": "wrangler dev",
 		"start": "wrangler dev",
 		"types": "wrangler types --include-env=false",
@@ -25,10 +28,15 @@
 		"@cloudflare/workers-oauth-provider": "0.0.5",
 		"@modelcontextprotocol/sdk": "1.10.2",
 		"@n8n/json-schema-to-zod": "1.1.0",
+		"@repo/eval-tools": "workspace:*",
 		"@repo/mcp-common": "workspace:*",
 		"@repo/mcp-observability": "workspace:*",
 		"agents": "0.0.67",
+		"ai": "4.3.10",
+		"concurrently": "9.1.2",
 		"hono": "4.7.6",
+		"start-server-and-test": "2.0.11",
+		"vitest-evals": "0.1.4",
 		"zod": "3.24.2"
 	}
 }
diff --git a/apps/workers-bindings/src/context.ts b/apps/workers-bindings/src/context.ts
@@ -3,7 +3,7 @@ import type { WorkersBindingsMCP } from './index'
 
 export interface Env {
 	OAUTH_KV: KVNamespace
-	ENVIRONMENT: 'development' | 'staging' | 'production'
+	ENVIRONMENT: 'development' | 'staging' | 'production' | 'test'
 	MCP_SERVER_NAME: string
 	MCP_SERVER_VERSION: string
 	CLOUDFLARE_CLIENT_ID: string
@@ -14,4 +14,7 @@ export interface Env {
 	DEV_DISABLE_OAUTH: string
 	DEV_CLOUDFLARE_API_TOKEN: string
 	DEV_CLOUDFLARE_EMAIL: string
+	CLOUDFLARE_API_TOKEN: string
+	OPENAI_API_KEY: string
+	AI: Ai
 }
diff --git a/apps/workers-bindings/src/index.ts b/apps/workers-bindings/src/index.ts
@@ -1,17 +1,19 @@
 import OAuthProvider from '@cloudflare/workers-oauth-provider'
 import { McpAgent } from 'agents/mcp'
 
+import { createApiHandler } from '@repo/mcp-common/src/api-handler'
 import {
 	createAuthHandlers,
-	getUserAndAccounts,
 	handleTokenExchangeCallback,
 } from '@repo/mcp-common/src/cloudflare-oauth-handler'
+import { handleDevMode } from '@repo/mcp-common/src/dev-mode'
 import { getUserDetails, UserDetails } from '@repo/mcp-common/src/durable-objects/user_details'
 import { getEnv } from '@repo/mcp-common/src/env'
 import { RequiredScopes } from '@repo/mcp-common/src/scopes'
 import { CloudflareMCPServer } from '@repo/mcp-common/src/server'
 import { registerAccountTools } from '@repo/mcp-common/src/tools/account'
 import { registerD1Tools } from '@repo/mcp-common/src/tools/d1'
+import { registerHyperdriveTools } from '@repo/mcp-common/src/tools/hyperdrive'
 import { registerKVTools } from '@repo/mcp-common/src/tools/kv_namespace'
 import { registerR2BucketTools } from '@repo/mcp-common/src/tools/r2_bucket'
 import { registerWorkersTools } from '@repo/mcp-common/src/tools/worker'
@@ -72,6 +74,7 @@ export class WorkersBindingsMCP extends McpAgent<Env, WorkersBindingsMCPState, P
 		registerWorkersTools(this)
 		registerR2BucketTools(this)
 		registerD1Tools(this)
+		registerHyperdriveTools(this)
 	}
 
 	async getActiveAccountId() {
@@ -104,31 +107,18 @@ const BindingsScopes = {
 	'd1:write': 'Create, read, and write to D1 databases',
 } as const
 
-// TODO: Move this in to mcp-common
-async function handleDevMode(req: Request, env: Env, ctx: ExecutionContext) {
-	const { user, accounts } = await getUserAndAccounts(env.DEV_CLOUDFLARE_API_TOKEN, {
-		'X-Auth-Email': env.DEV_CLOUDFLARE_EMAIL,
-		'X-Auth-Key': env.DEV_CLOUDFLARE_API_TOKEN,
-	})
-	ctx.props = {
-		accessToken: env.DEV_CLOUDFLARE_API_TOKEN,
-		user,
-		accounts,
-	} as Props
-	return WorkersBindingsMCP.mount('/sse').fetch(req, env, ctx)
-}
-
 export default {
 	fetch: async (req: Request, env: Env, ctx: ExecutionContext) => {
-		if (env.ENVIRONMENT === 'development' && env.DEV_DISABLE_OAUTH === 'true') {
-			return await handleDevMode(req, env, ctx)
+		if (
+			(env.ENVIRONMENT === 'development' || env.ENVIRONMENT === 'test') &&
+			env.DEV_DISABLE_OAUTH === 'true'
+		) {
+			return await handleDevMode(WorkersBindingsMCP, req, env, ctx)
 		}
 
 		return new OAuthProvider({
-			apiHandlers: {
-				'/mcp': WorkersBindingsMCP.serve('/mcp'),
-				'/sse': WorkersBindingsMCP.serveSSE('/sse'),
-			},
+			apiRoute: ['/mcp', '/sse'],
+			apiHandler: createApiHandler(WorkersBindingsMCP),
 			// @ts-ignore
 			defaultHandler: createAuthHandlers({ scopes: BindingsScopes, metrics }),
 			authorizeEndpoint: '/oauth/authorize',

diff --git a/apps/workers-bindings/test/index.test.ts b/apps/workers-bindings/test/index.test.ts
-Original file line number
+Diff line change
@@ Expand Up / @@ -78,6 +78,4 @@ export default { @@
     			clientRegistrationEndpoint: '/register',
     		}).fetch(req, env, ctx)
     	},
-    } /*
-    */
+    }