Skip to content

Add container tool evals for file write, delete, and container exec #66

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions apps/sandbox-container/evals/exec.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import { assert, expect } from 'vitest'
import { describeEval } from 'vitest-evals'
import { z } from 'zod'

import { checkFactuality } from '@repo/eval-tools/src/scorers'
import { eachModel } from '@repo/eval-tools/src/test-models'

import { initializeClient, runTask } from './utils'

// Eval: for each model under test, ask it to create and run a hello-world
// Python script, then check the tool calls that were recorded for the run.
eachModel('$modelName', ({ model }) => {
  describeEval('Runs a python file in a container', {
    data: async () => [
      {
        input: 'Create a hello world python script and run it',
        // The tool names in this description must match the names asserted in
        // `task` (`container_file_write`, `container_exec`); the scorer compares
        // this text against the transcript returned by the task.
        expected:
          'The container_file_write tool was called, containing a file ending in .py. ' +
          'Then the container_exec tool was called with python or python3 as one of the arguments',
      },
    ],
    task: async (input) => {
      const client = await initializeClient()
      const { promptOutput, toolCalls } = await runTask(client, model, input)

      // At least one container_exec call must mention python in its arguments.
      expect(toolCalls).toEqual(
        expect.arrayContaining([
          expect.objectContaining({
            type: 'tool-call',
            toolName: 'container_exec',
            args: {
              args: expect.objectContaining({
                args: expect.stringContaining('python'),
              }),
            },
          }),
        ])
      )

      // At least one container_file_write call must target a path containing ".py".
      expect(toolCalls).toEqual(
        expect.arrayContaining([
          expect.objectContaining({
            type: 'tool-call',
            toolName: 'container_file_write',
            args: {
              args: expect.objectContaining({
                path: expect.stringContaining('.py'),
              }),
            },
          }),
        ])
      )

      // The transcript is what the scorer evaluates against `expected`.
      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
  })
})
55 changes: 0 additions & 55 deletions apps/sandbox-container/evals/file_write.eval.ts

This file was deleted.

106 changes: 106 additions & 0 deletions apps/sandbox-container/evals/files.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import { assert, expect } from 'vitest'
import { describeEval } from 'vitest-evals'
import { z } from 'zod'

import { checkFactuality } from '@repo/eval-tools/src/scorers'
import { eachModel } from '@repo/eval-tools/src/test-models'

import { initializeClient, runTask } from './utils'

// Evals for the container file tools: for each model under test, check that a
// file can be written (and read back), and that a written file is then deleted.
eachModel('$modelName', ({ model }) => {
  describeEval('Runs container file write', {
    data: async () => [
      {
        input: 'write a file named test.txt containing the text "asdf"',
        expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
      },
    ],
    task: async (input) => {
      const client = await initializeClient()
      const { promptOutput } = await runTask(client, model, input)

      // Look up the read tool so the file contents can be checked directly,
      // independently of what the model reported.
      const fileRead = client.listTools().find((tool) => tool.name === 'container_file_read')
      assert(fileRead !== undefined, 'container_file_read was not found in client.listTools()')

      const result = await client.callTool(
        {
          ...fileRead,
          arguments: {
            args: { path: 'file://test.txt' },
          },
        },
        z.any() as any,
        {}
      )

      // The read must return exactly one text resource holding the requested content.
      expect(result.content).toStrictEqual([
        {
          type: 'resource',
          resource: {
            uri: 'file://test.txt',
            mimeType: 'text/plain',
            text: 'asdf',
          },
        },
      ])

      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
  })

  describeEval('Runs container file delete', {
    data: async () => [
      {
        input: 'write a file named test.txt, then delete it',
        expected:
          'The container_file_write tool was called and then the container_file_delete tool was called with the same parameters',
      },
    ],
    task: async (input) => {
      const client = await initializeClient()
      const { promptOutput, toolCalls } = await runTask(client, model, input)

      // Use the path from the first recorded write call as the reference path.
      const writeCall = toolCalls.find((call) => call.toolName === 'container_file_write')
      const toolArgs = writeCall?.args as { args: { path: string } } | undefined
      assert(
        toolArgs !== undefined,
        'no container_file_write tool call with arguments was recorded'
      )

      // Both the write and the delete call must reference that same path.
      for (const toolName of ['container_file_write', 'container_file_delete']) {
        expect(toolCalls).toEqual(
          expect.arrayContaining([
            expect.objectContaining({
              type: 'tool-call',
              toolName,
              args: {
                args: expect.objectContaining({
                  path: toolArgs.args.path,
                }),
              },
            }),
          ])
        )
      }

      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
  })
})
3 changes: 2 additions & 1 deletion apps/sandbox-container/evals/initialize.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ eachModel('$modelName', ({ model }) => {
],
task: async (input) => {
const client = await initializeClient()
return await runTask(client, model, input)
const { promptOutput } = await runTask(client, model, input)
return promptOutput
},
scorers: [checkFactuality],
threshold: 1,
Expand Down
12 changes: 9 additions & 3 deletions apps/sandbox-container/evals/utils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
import { MCPClientManager } from 'agents/mcp/client'
import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
import { LanguageModelV1, streamText, StreamTextResult, tool, ToolCallPart, ToolSet } from 'ai'
import { z } from 'zod'

import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
Expand All @@ -15,7 +15,11 @@ export async function runTask(
clientManager: MCPClientManager,
model: LanguageModelV1,
input: string
) {
): Promise<{
promptOutput: string
fullResult: StreamTextResult<ToolSet, never>
toolCalls: ToolCallPart[]
}> {
const tools = clientManager.listTools()
const toolSet: ToolSet = tools.reduce((acc, v) => {
acc[v.name] = tool({
Expand Down Expand Up @@ -57,6 +61,7 @@ export async function runTask(

// convert into an LLM readable result so our factuality checker can validate tool calls
let messagesWithTools = ''
let toolCalls: ToolCallPart[] = []
const messages = (await res.response).messages
for (const message of messages) {
console.log(message.content)
Expand All @@ -68,11 +73,12 @@ export async function runTask(
<tool_name>${messagePart.toolName}</tool_name>
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
</message_content>`
toolCalls.push(messagePart)
} else if (messagePart.type === 'text') {
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
}
}
}

return messagesWithTools
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
}
Loading