Human seeded evals #1

Open · wants to merge 4 commits into base: main

3 changes: 3 additions & 0 deletions .gitignore
@@ -5,7 +5,10 @@ build/
dist/
wheels/
*.egg-info
.DS_Store

# Virtual environments
.venv
*.svg
scratch/
.self-improving-agent/
32 changes: 32 additions & 0 deletions human-seeded-evals/README.md
@@ -0,0 +1,32 @@
# Human Seeded Evals Demo

Like evals ... but without all the hard work.

Panacea or pipe dream?

## Usage

Run the frontend:

```bash
cd frontend
npm run dev
```

Run the backend:

```bash
uv run uvicorn app.main:app --port 5000
```

Run the eval generator:

```bash
uv run auto_evals/eval_prompt_generator.py
```

Run the live evals agent:

```bash
uv run auto_evals/auto_evals.py
```
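
With the backend running you can hit the endpoint directly. A minimal sketch, assuming the server is on port 5000 as above and `httpx` is installed (response keys follow the serialization aliases in `app/models.py`):

```python
# Sketch: call the running backend (assumes http://localhost:5000 and httpx installed).
import httpx

response = httpx.post(
    'http://localhost:5000/api/timerange',
    json={'prompt': 'yesterday afternoon'},  # 'now' defaults to the server's current time
    timeout=30,
)
response.raise_for_status()
print(response.json())  # e.g. {'startTimestamp': ..., 'endTimestamp': ..., 'explanation': ...} or {'error': ...}
```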
Empty file.
35 changes: 35 additions & 0 deletions human-seeded-evals/app/agent.py
@@ -0,0 +1,35 @@
from __future__ import annotations as _annotations

from dataclasses import dataclass
from datetime import datetime

from pydantic_ai import Agent, RunContext

from .models import TimeRangeInputs, TimeRangeResponse


@dataclass
class TimeRangeDeps:
now: datetime


instructions = "Convert the user's request into a structured time range."
time_range_agent = Agent[TimeRangeDeps, TimeRangeResponse](
'anthropic:claude-sonnet-4-0',
output_type=TimeRangeResponse, # type: ignore # we can't yet annotate something as receiving a TypeForm
deps_type=TimeRangeDeps,
instructions=instructions,
retries=1,
)


@time_range_agent.instructions
def inject_current_time(ctx: RunContext[TimeRangeDeps]) -> str:
"""Add the user's current time and timezone in the format 'Friday, November 22, 2024 11:15:14 PST' to context."""
return f"The user's current time is {ctx.deps.now:%A, %B %d, %Y %H:%M:%S %Z}."


async def infer_time_range(inputs: TimeRangeInputs) -> TimeRangeResponse:
"""Infer a time range from a user prompt."""
result = await time_range_agent.run(inputs.prompt, deps=TimeRangeDeps(now=inputs.now))
return result.output
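
For reference, a minimal sketch of driving the agent without the HTTP layer (assumes `ANTHROPIC_API_KEY` is set and the working directory is `human-seeded-evals/` so the `app` package imports resolve):

```python
# Sketch: run the agent directly, bypassing FastAPI (assumes ANTHROPIC_API_KEY is set).
import asyncio
from datetime import datetime

from app.agent import infer_time_range
from app.models import TimeRangeInputs


async def demo() -> None:
    inputs = TimeRangeInputs(prompt='last Tuesday morning', now=datetime.now().astimezone())
    result = await infer_time_range(inputs)
    print(result)  # TimeRangeBuilderSuccess or TimeRangeBuilderError


asyncio.run(demo())
```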
16 changes: 16 additions & 0 deletions human-seeded-evals/app/main.py
@@ -0,0 +1,16 @@
import logfire
from fastapi import FastAPI

from .agent import infer_time_range
from .models import TimeRangeInputs, TimeRangeResponse

logfire.configure(environment='dev')
logfire.instrument_pydantic_ai()

app = FastAPI()
logfire.instrument_fastapi(app)


@app.post('/api/timerange')
async def convert_time_range(time_range_inputs: TimeRangeInputs) -> TimeRangeResponse:
return await infer_time_range(time_range_inputs)
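
A quick way to poke the endpoint in-process is FastAPI's `TestClient`; a sketch (this still calls the real Anthropic model, so `ANTHROPIC_API_KEY` must be set):

```python
# Sketch: exercise the endpoint in-process (still calls the real model; needs ANTHROPIC_API_KEY).
from fastapi.testclient import TestClient

from app.main import app  # assumes the working directory is human-seeded-evals/

client = TestClient(app)
response = client.post('/api/timerange', json={'prompt': 'the last two hours'})
print(response.status_code, response.json())
```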
39 changes: 39 additions & 0 deletions human-seeded-evals/app/models.py
@@ -0,0 +1,39 @@
from __future__ import annotations as _annotations

from datetime import datetime

from pydantic import AwareDatetime, BaseModel, Field


class TimeRangeBuilderSuccess(BaseModel, use_attribute_docstrings=True):
"""Response when a time range could be successfully generated."""

start_timestamp: AwareDatetime = Field(serialization_alias='startTimestamp')
"""A datetime in ISO format with timezone offset when the interval starts."""

end_timestamp: AwareDatetime = Field(serialization_alias='endTimestamp')
"""A datetime in ISO format with timezone offset when the interval ends."""

explanation: str | None
"""
A brief explanation of the time range that was selected.
For example, if a user only mentions a specific point in time, you might explain that you selected a 10 minute
window around that time.
"""


class TimeRangeBuilderError(BaseModel):
"""Response when a time range cannot not be generated."""

error: str


TimeRangeResponse = TimeRangeBuilderSuccess | TimeRangeBuilderError


class TimeRangeInputs(BaseModel):
"""The inputs for the time range inference agent."""

prompt: str
now: AwareDatetime = Field(default_factory=lambda: datetime.now().astimezone())
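
The camelCase keys only appear when dumping by alias; a small sketch with made-up timestamps:

```python
# Sketch: serialization aliases give camelCase keys when dumping by alias (made-up values).
from datetime import datetime, timezone

from app.models import TimeRangeBuilderSuccess  # assumes the working directory is human-seeded-evals/

success = TimeRangeBuilderSuccess(
    start_timestamp=datetime(2025, 7, 1, 9, 55, tzinfo=timezone.utc),
    end_timestamp=datetime(2025, 7, 1, 10, 5, tzinfo=timezone.utc),
    explanation='10 minute window around 10:00 UTC',
)
print(success.model_dump(mode='json', by_alias=True))
# e.g. {'startTimestamp': '2025-07-01T09:55:00Z', 'endTimestamp': '2025-07-01T10:05:00Z', 'explanation': '...'}
```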
108 changes: 108 additions & 0 deletions human-seeded-evals/auto_evals/auto_evals.py
@@ -0,0 +1,108 @@
import asyncio
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Literal

import logfire
from logfire.experimental import annotations
from logfire.experimental.query_client import AsyncLogfireQueryClient
from pydantic import BaseModel, TypeAdapter
from pydantic_ai import Agent, format_as_xml

read_token = os.environ['LOGFIRE_READ_TOKEN']
logfire.configure(environment='evals')
logfire.instrument_pydantic_ai()


class EvalFeedback(BaseModel, use_attribute_docstrings=True):
reaction: Literal['positive', 'negative']
comment: str | None = None
"""Very concise comment for the evaluation"""


prompt_path = Path(__file__).parent / 'eval_agent_prompt.txt'
evals_agent = Agent(
'anthropic:claude-sonnet-4-0',
instructions=prompt_path.read_text(),
output_type=EvalFeedback,
)
# recent time_range_agent runs: timestamp, trace/span IDs, user prompt and final output
runs_query = """
select
created_at,
trace_id,
span_id,
attributes->'all_messages_events'->1->>'content' as prompt,
attributes->'final_result' as output
from records
where otel_scope_name = 'pydantic-ai' and message = 'time_range_agent run'
"""

# traceparents of spans that already have a feedback annotation attached
with_annotations_query = """
select
'00-' || trace_id || '-' || parent_span_id || '-01' as trace_parent
from records
where kind='annotation'
"""


class RunData(BaseModel):
created_at: datetime
trace_id: str
span_id: str
prompt: str
output: Any

@property
def trace_parent(self) -> str:
return f'00-{self.trace_id}-{self.span_id}-01'


run_data_list_schema = TypeAdapter(list[RunData])


async def apply_feedback(run: RunData):
if run.output is None:
return
r = await evals_agent.run(
format_as_xml({'run_timestamp': run.created_at, 'prompt': run.prompt, 'output': run.output})
)
print(f'Adding feedback to {run.trace_parent}: {r.output}')
annotations.record_feedback(
run.trace_parent,
'AI Annotation',
value=r.output.reaction,
comment=r.output.comment,
extra={'path': ''},
)


async def main():
min_timestamp = datetime.now(tz=timezone.utc) - timedelta(minutes=30)
async with AsyncLogfireQueryClient(read_token) as client:
while True:
response = await client.query_json_rows(runs_query, min_timestamp=min_timestamp)
runs = run_data_list_schema.validate_python(response['rows'])
if runs:
response = await client.query_json_rows(with_annotations_query, min_timestamp=min_timestamp)
annotated_spans: set[str] = {r['trace_parent'] for r in response['rows']}
runs = [run for run in runs if run.trace_parent not in annotated_spans]
if runs:
print('')
logfire.info('found {runs} new runs to evaluate', runs=len(runs))
min_timestamp = min(runs, key=lambda run: run.created_at).created_at.astimezone(timezone.utc)
await asyncio.gather(*[apply_feedback(run) for run in runs])
await asyncio.sleep(2)
continue

min_timestamp = datetime.now(tz=timezone.utc) - timedelta(minutes=1)
print('.', end='', flush=True)

await asyncio.sleep(2)


if __name__ == '__main__':
try:
asyncio.run(main())
except KeyboardInterrupt:
print('stopping')
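
Runs and annotations are matched on a W3C-style traceparent string; an illustration with made-up IDs:

```python
# Illustration only (made-up IDs): the traceparent key used to match runs to annotations.
trace_id = '0af7651916cd43dd8448eb211c80319c'
span_id = 'b7ad6b7169203331'
trace_parent = f'00-{trace_id}-{span_id}-01'  # same format as RunData.trace_parent

annotated_spans = {trace_parent}  # spans that already carry an annotation (from with_annotations_query)
print(trace_parent in annotated_spans)  # True, so this run would be skipped on the next poll
```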
25 changes: 25 additions & 0 deletions human-seeded-evals/auto_evals/eval_agent_prompt.txt
@@ -0,0 +1,25 @@
You are an evaluation agent responsible for assessing the performance of the time_range_agent. The time_range_agent converts user requests into structured time ranges with start and end timestamps.

Your task is to evaluate whether the time_range_agent correctly interprets temporal requests and generates appropriate time ranges according to these criteria:

1. **Temporal Interpretation**: The agent should correctly identify the time period referenced by the user (e.g., "yesterday", "last Monday", "4pm", etc.)

2. **Past Time Constraint**: Time ranges must be in the past relative to the provided context timestamp. Future time requests should return an error.

3. **Timezone Handling**: The agent should handle timezone specifications correctly (e.g., "ET", "BST") and default to an appropriate timezone when not specified.

4. **Range Generation**:
- For full day requests (e.g., "Monday", "yesterday"): Generate ranges from 00:00:00 to 23:59:59
- For specific time points (e.g., "4pm", "9am"): Generate a 10-minute window around the specified time
- The explanation should clearly describe the selected time range

5. **Error Handling**: When a valid time range cannot be generated (e.g., future dates), the agent should return an error response.

Evaluate each agent output by checking:
- Is the interpreted time period correct given the user's request and context timestamp?
- Are the start and end timestamps properly formatted with timezone information?
- Is the time range in the past relative to the context timestamp?
- Is the explanation clear and accurate?
- Are errors properly returned when appropriate?

Provide concise, specific feedback identifying what the agent did correctly or incorrectly. Focus on the most important issues that would impact the usability of the time range.
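
For context, verdicts produced against this prompt are constrained to the `EvalFeedback` model in `auto_evals.py`; a hypothetical example of one, re-declaring the model shape for illustration:

```python
# Hypothetical verdict, re-declaring the EvalFeedback shape from auto_evals.py for illustration.
from typing import Literal

from pydantic import BaseModel


class EvalFeedback(BaseModel):
    reaction: Literal['positive', 'negative']
    comment: str | None = None


feedback = EvalFeedback(
    reaction='negative',
    comment='Returned a future range for "tomorrow 4pm"; an error response was expected.',
)
print(feedback.model_dump())  # {'reaction': 'negative', 'comment': '...'}
```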
123 changes: 123 additions & 0 deletions human-seeded-evals/auto_evals/eval_prompt_generator.py
@@ -0,0 +1,123 @@
import asyncio
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Literal

import logfire
from logfire.experimental.query_client import AsyncLogfireQueryClient
from pydantic import BaseModel, TypeAdapter
from pydantic_ai import Agent, format_as_xml

# make the sibling `app` package importable when this file is run as a script
sys.path.append(str(Path(__file__).parent.parent))

from app import agent

read_token = os.environ['LOGFIRE_READ_TOKEN']
logfire.configure(environment='evals')
logfire.instrument_pydantic_ai()

auto_annotation_agent = Agent(
'anthropic:claude-opus-4-0',
instructions="""
Your task is to build a system prompt for an agent (the evals agent) which will evaluate the performance of another
agent and provide feedback on its performance.
You should return the system prompt for the evals agent ONLY.
""",
)


class RunFeedback(BaseModel):
reaction: Literal['positive', 'negative'] | None
comment: str | None


class AgentRunSummary(BaseModel):
prompt: str
context: Any
output: Any
feedback: RunFeedback | None = None


count_runs_query = "select count(*) from records where message = 'time_range_agent run'"
runs_query = """
select
trace_id,
span_id,
'run timestamp: ' || created_at as context,
attributes->'all_messages_events'->1->>'content' as prompt,
attributes->'final_result' as output
from records
where message = 'time_range_agent run'
"""
# human feedback annotations (reaction + comment) previously recorded against agent run spans
feedback_query = """
select
trace_id,
parent_span_id,
attributes->>'Annotation' as reaction,
attributes->>'logfire.feedback.comment' as comment
from records
where kind='annotation' and attributes->>'logfire.feedback.name'='Annotation'
"""
min_count = 1


async def get_runs() -> None | list[AgentRunSummary]:
min_timestamp = datetime(2025, 7, 2)
async with AsyncLogfireQueryClient(read_token) as client:
c = await client.query_json(sql=count_runs_query, min_timestamp=min_timestamp)
count = c['columns'][0]['values'][0]
if count < min_count:
print(f'Insufficient runs ({count})')
return

r = await client.query_json_rows(sql=feedback_query, min_timestamp=min_timestamp)
feedback_lookup: dict[str, Any] = {
f'{row["trace_id"]}-{row["parent_span_id"]}': RunFeedback(**row) for row in r['rows']
}

r = await client.query_json_rows(sql=runs_query, min_timestamp=min_timestamp)
runs: list[AgentRunSummary] = []
with_feedback = 0
for row in r['rows']:
key = f'{row["trace_id"]}-{row["span_id"]}'
if feedback := feedback_lookup.get(key):
row['feedback'] = feedback
with_feedback += 1
runs.append(AgentRunSummary(**row))

logfire.info(f'Found {len(runs)} runs, {with_feedback} with feedback')
return runs


async def generate_evals_prompt(
name: str, instructions: str, output_type: type[Any] | None, runs: list[AgentRunSummary]
) -> str:
data: dict[str, Any] = {'agent_name': name, 'agent_instructions': instructions}
if output_type is not None:
data['output_schema'] = json.dumps(TypeAdapter(output_type).json_schema(), indent=2)
data['agent_runs'] = [run.model_dump(exclude_none=True) for run in runs]
prompt = format_as_xml(data, include_root_tag=False)
r = await auto_annotation_agent.run(prompt)
return r.output


async def main():
runs = await get_runs()
if runs:
prompt = await generate_evals_prompt(
'time_range_agent',
agent.instructions,
agent.TimeRangeResponse, # type: ignore
runs,
)
prompt_path = Path(__file__).parent / 'eval_agent_prompt.txt'
prompt_path.write_text(prompt)
print(f'prompt written to {prompt_path}')


if __name__ == '__main__':
asyncio.run(main())
Empty file.