Human seeded evals #1

Open · wants to merge 4 commits into base: main

3 changes: 3 additions & 0 deletions .gitignore
@@ -5,7 +5,10 @@ build/
dist/
wheels/
*.egg-info
.DS_Store

# Virtual environments
.venv
*.svg
scratch/
.self-improving-agent/
32 changes: 32 additions & 0 deletions human-seeded-evals/README.md
@@ -0,0 +1,32 @@
# Human Seeded Evals Demo

Like evals ... but without all the hard work.

Panacea or pipe dream?

## Usage

Run the frontend:

```bash
cd frontend
npm run dev
```

Run the backend:

```bash
uv run uvicorn app.main:app --port 5000
```

Run the eval generator:

```bash
uv run auto_evals/eval_prompt_generator.py
```

Run the live evals agent:

```bash
uv run auto_evals/auto_evals.py
```
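
With the backend running you can hit the endpoint directly. A minimal sketch, assuming the server is on port 5000 as above and `httpx` is installed (response keys follow the serialization aliases in `app/models.py`):

```python
# Sketch: call the running backend (assumes http://localhost:5000 and httpx installed).
import httpx

response = httpx.post(
    'http://localhost:5000/api/timerange',
    json={'prompt': 'yesterday afternoon'},  # 'now' defaults to the server's current time
    timeout=30,
)
response.raise_for_status()
print(response.json())  # e.g. {'startTimestamp': ..., 'endTimestamp': ..., 'explanation': ...} or {'error': ...}
```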
Empty file.
35 changes: 35 additions & 0 deletions human-seeded-evals/app/agent.py
@@ -0,0 +1,35 @@
from __future__ import annotations as _annotations

from dataclasses import dataclass
from datetime import datetime

from pydantic_ai import Agent, RunContext

from .models import TimeRangeInputs, TimeRangeResponse


@dataclass
class TimeRangeDeps:
now: datetime


instructions = "Convert the user's request into a structured time range."
time_range_agent = Agent[TimeRangeDeps, TimeRangeResponse](
'anthropic:claude-sonnet-4-0',
output_type=TimeRangeResponse, # type: ignore # we can't yet annotate something as receiving a TypeForm
deps_type=TimeRangeDeps,
instructions=instructions,
retries=1,
)


@time_range_agent.instructions
def inject_current_time(ctx: RunContext[TimeRangeDeps]) -> str:
"""Add the user's current time and timezone in the format 'Friday, November 22, 2024 11:15:14 PST' to context."""
return f"The user's current time is {ctx.deps.now:%A, %B %d, %Y %H:%M:%S %Z}."


async def infer_time_range(inputs: TimeRangeInputs) -> TimeRangeResponse:
"""Infer a time range from a user prompt."""
result = await time_range_agent.run(inputs.prompt, deps=TimeRangeDeps(now=inputs.now))
return result.output
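
For reference, a minimal sketch of driving the agent without the HTTP layer (assumes `ANTHROPIC_API_KEY` is set and the working directory is `human-seeded-evals/` so the `app` package imports resolve):

```python
# Sketch: run the agent directly, bypassing FastAPI (assumes ANTHROPIC_API_KEY is set).
import asyncio
from datetime import datetime

from app.agent import infer_time_range
from app.models import TimeRangeInputs


async def demo() -> None:
    inputs = TimeRangeInputs(prompt='last Tuesday morning', now=datetime.now().astimezone())
    result = await infer_time_range(inputs)
    print(result)  # TimeRangeBuilderSuccess or TimeRangeBuilderError


asyncio.run(demo())
```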
16 changes: 16 additions & 0 deletions human-seeded-evals/app/main.py
@@ -0,0 +1,16 @@
import logfire
from fastapi import FastAPI

from .agent import infer_time_range
from .models import TimeRangeInputs, TimeRangeResponse

logfire.configure(environment='dev')
logfire.instrument_pydantic_ai()

app = FastAPI()
logfire.instrument_fastapi(app)


@app.post('/api/timerange')
async def convert_time_range(time_range_inputs: TimeRangeInputs) -> TimeRangeResponse:
return await infer_time_range(time_range_inputs)
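
A quick way to poke the endpoint in-process is FastAPI's `TestClient`; a sketch (this still calls the real Anthropic model, so `ANTHROPIC_API_KEY` must be set):

```python
# Sketch: exercise the endpoint in-process (still calls the real model; needs ANTHROPIC_API_KEY).
from fastapi.testclient import TestClient

from app.main import app  # assumes the working directory is human-seeded-evals/

client = TestClient(app)
response = client.post('/api/timerange', json={'prompt': 'the last two hours'})
print(response.status_code, response.json())
```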
39 changes: 39 additions & 0 deletions human-seeded-evals/app/models.py
@@ -0,0 +1,39 @@
from __future__ import annotations as _annotations

from datetime import datetime

from pydantic import AwareDatetime, BaseModel, Field


class TimeRangeBuilderSuccess(BaseModel, use_attribute_docstrings=True):
"""Response when a time range could be successfully generated."""

start_timestamp: AwareDatetime = Field(serialization_alias='startTimestamp')
"""A datetime in ISO format with timezone offset when the interval starts."""

end_timestamp: AwareDatetime = Field(serialization_alias='endTimestamp')
"""A datetime in ISO format with timezone offset when the interval ends."""

explanation: str | None
"""
A brief explanation of the time range that was selected.
For example, if a user only mentions a specific point in time, you might explain that you selected a 10 minute
window around that time.
"""


class TimeRangeBuilderError(BaseModel):
"""Response when a time range cannot not be generated."""

error: str


TimeRangeResponse = TimeRangeBuilderSuccess | TimeRangeBuilderError


class TimeRangeInputs(BaseModel):
"""The inputs for the time range inference agent."""

prompt: str
now: AwareDatetime = Field(default_factory=lambda: datetime.now().astimezone())
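
The camelCase keys only appear when dumping by alias; a small sketch with made-up timestamps:

```python
# Sketch: serialization aliases give camelCase keys when dumping by alias (made-up values).
from datetime import datetime, timezone

from app.models import TimeRangeBuilderSuccess  # assumes the working directory is human-seeded-evals/

success = TimeRangeBuilderSuccess(
    start_timestamp=datetime(2025, 7, 1, 9, 55, tzinfo=timezone.utc),
    end_timestamp=datetime(2025, 7, 1, 10, 5, tzinfo=timezone.utc),
    explanation='10 minute window around 10:00 UTC',
)
print(success.model_dump(mode='json', by_alias=True))
# e.g. {'startTimestamp': '2025-07-01T09:55:00Z', 'endTimestamp': '2025-07-01T10:05:00Z', 'explanation': '...'}
```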
108 changes: 108 additions & 0 deletions human-seeded-evals/auto_evals/auto_evals.py
@@ -0,0 +1,108 @@
import asyncio
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Literal

import logfire
from logfire.experimental import annotations
from logfire.experimental.query_client import AsyncLogfireQueryClient
from pydantic import BaseModel, TypeAdapter
from pydantic_ai import Agent, format_as_xml

read_token = os.environ['LOGFIRE_READ_TOKEN']
logfire.configure(environment='evals')
logfire.instrument_pydantic_ai()


class EvalFeedback(BaseModel, use_attribute_docstrings=True):
reaction: Literal['positive', 'negative']
comment: str | None = None
"""Very concise comment for the evaluation"""


prompt_path = Path(__file__).parent / 'eval_agent_prompt.txt'
evals_agent = Agent(
'anthropic:claude-sonnet-4-0',
instructions=prompt_path.read_text(),
output_type=EvalFeedback,
)
# recent time_range_agent runs: timestamp, trace/span IDs, user prompt and final output
runs_query = """
select
created_at,
trace_id,
span_id,
attributes->'all_messages_events'->1->>'content' as prompt,
attributes->'final_result' as output
from records
where otel_scope_name = 'pydantic-ai' and message = 'time_range_agent run'
"""

# traceparents of spans that already have a feedback annotation attached
with_annotations_query = """
select
'00-' || trace_id || '-' || parent_span_id || '-01' as trace_parent
from records
where kind='annotation'
"""


class RunData(BaseModel):
created_at: datetime
trace_id: str
span_id: str
prompt: str
output: Any

@property
def trace_parent(self) -> str:
return f'00-{self.trace_id}-{self.span_id}-01'


run_data_list_schema = TypeAdapter(list[RunData])


async def apply_feedback(run: RunData):
if run.output is None:
return
r = await evals_agent.run(
format_as_xml({'run_timestamp': run.created_at, 'prompt': run.prompt, 'output': run.output})
)
print(f'Adding feedback to {run.trace_parent}: {r.output}')
annotations.record_feedback(
run.trace_parent,
'AI Annotation',
value=r.output.reaction,
comment=r.output.comment,
extra={'path': ''},
)


async def main():
min_timestamp = datetime.now(tz=timezone.utc) - timedelta(minutes=30)
async with AsyncLogfireQueryClient(read_token) as client:
while True:
response = await client.query_json_rows(runs_query, min_timestamp=min_timestamp)
runs = run_data_list_schema.validate_python(response['rows'])
if runs:
response = await client.query_json_rows(with_annotations_query, min_timestamp=min_timestamp)
annotated_spans: set[str] = {r['trace_parent'] for r in response['rows']}
runs = [run for run in runs if run.trace_parent not in annotated_spans]
if runs:
print('')
logfire.info('found {runs} new runs to evaluate', runs=len(runs))
min_timestamp = min(runs, key=lambda run: run.created_at).created_at.astimezone(timezone.utc)
await asyncio.gather(*[apply_feedback(run) for run in runs])
await asyncio.sleep(2)
continue

min_timestamp = datetime.now(tz=timezone.utc) - timedelta(minutes=1)
print('.', end='', flush=True)

await asyncio.sleep(2)


if __name__ == '__main__':
try:
asyncio.run(main())
except KeyboardInterrupt:
print('stopping')
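
Runs and annotations are matched on a W3C-style traceparent string; an illustration with made-up IDs:

```python
# Illustration only (made-up IDs): the traceparent key used to match runs to annotations.
trace_id = '0af7651916cd43dd8448eb211c80319c'
span_id = 'b7ad6b7169203331'
trace_parent = f'00-{trace_id}-{span_id}-01'  # same format as RunData.trace_parent

annotated_spans = {trace_parent}  # spans that already carry an annotation (from with_annotations_query)
print(trace_parent in annotated_spans)  # True, so this run would be skipped on the next poll
```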
25 changes: 25 additions & 0 deletions human-seeded-evals/auto_evals/eval_agent_prompt.txt
@@ -0,0 +1,25 @@
You are an evaluation agent responsible for assessing the performance of the time_range_agent. The time_range_agent converts user requests into structured time ranges with start and end timestamps.

Your task is to evaluate whether the time_range_agent correctly interprets temporal requests and generates appropriate time ranges according to these criteria:

1. **Temporal Interpretation**: The agent should correctly identify the time period referenced by the user (e.g., "yesterday", "last Monday", "4pm", etc.)

2. **Past Time Constraint**: Time ranges must be in the past relative to the provided context timestamp. Future time requests should return an error.

3. **Timezone Handling**: The agent should handle timezone specifications correctly (e.g., "ET", "BST") and default to an appropriate timezone when not specified.

4. **Range Generation**:
- For full day requests (e.g., "Monday", "yesterday"): Generate ranges from 00:00:00 to 23:59:59
- For specific time points (e.g., "4pm", "9am"): Generate a 10-minute window around the specified time
- The explanation should clearly describe the selected time range

5. **Error Handling**: When a valid time range cannot be generated (e.g., future dates), the agent should return an error response.

Evaluate each agent output by checking:
- Is the interpreted time period correct given the user's request and context timestamp?
- Are the start and end timestamps properly formatted with timezone information?
- Is the time range in the past relative to the context timestamp?
- Is the explanation clear and accurate?
- Are errors properly returned when appropriate?

Provide concise, specific feedback identifying what the agent did correctly or incorrectly. Focus on the most important issues that would impact the usability of the time range.
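
For context, verdicts produced against this prompt are constrained to the `EvalFeedback` model in `auto_evals.py`; a hypothetical example of one, re-declaring the model shape for illustration:

```python
# Hypothetical verdict, re-declaring the EvalFeedback shape from auto_evals.py for illustration.
from typing import Literal

from pydantic import BaseModel


class EvalFeedback(BaseModel):
    reaction: Literal['positive', 'negative']
    comment: str | None = None


feedback = EvalFeedback(
    reaction='negative',
    comment='Returned a future range for "tomorrow 4pm"; an error response was expected.',
)
print(feedback.model_dump())  # {'reaction': 'negative', 'comment': '...'}
```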
123 changes: 123 additions & 0 deletions human-seeded-evals/auto_evals/eval_prompt_generator.py
@@ -0,0 +1,123 @@
import asyncio
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Literal

import logfire
from logfire.experimental.query_client import AsyncLogfireQueryClient
from pydantic import BaseModel, TypeAdapter
from pydantic_ai import Agent, format_as_xml

# make the sibling `app` package importable when this file is run as a script
sys.path.append(str(Path(__file__).parent.parent))

from app import agent

read_token = os.environ['LOGFIRE_READ_TOKEN']
logfire.configure(environment='evals')
logfire.instrument_pydantic_ai()

auto_annotation_agent = Agent(
'anthropic:claude-opus-4-0',
instructions="""
Your task is to build a system prompt for an agent (the evals agent) which will evaluate the performance of another
agent and provide feedback on its performance.
You should return the system prompt for the evals agent ONLY.
""",
)


class RunFeedback(BaseModel):
reaction: Literal['positive', 'negative'] | None
comment: str | None


class AgentRunSummary(BaseModel):
prompt: str
context: Any
output: Any
feedback: RunFeedback | None = None


count_runs_query = "select count(*) from records where message = 'time_range_agent run'"
runs_query = """
select
trace_id,
span_id,
'run timestamp: ' || created_at as context,
attributes->'all_messages_events'->1->>'content' as prompt,
attributes->'final_result' as output
from records
where message = 'time_range_agent run'
"""
# human feedback annotations (reaction + comment) previously recorded against agent run spans
feedback_query = """
select
trace_id,
parent_span_id,
attributes->>'Annotation' as reaction,
attributes->>'logfire.feedback.comment' as comment
from records
where kind='annotation' and attributes->>'logfire.feedback.name'='Annotation'
"""
min_count = 1


async def get_runs() -> None | list[AgentRunSummary]:
min_timestamp = datetime(2025, 7, 2)
async with AsyncLogfireQueryClient(read_token) as client:
c = await client.query_json(sql=count_runs_query, min_timestamp=min_timestamp)
count = c['columns'][0]['values'][0]
if count < min_count:
print(f'Insufficient runs ({count})')
return

r = await client.query_json_rows(sql=feedback_query, min_timestamp=min_timestamp)
feedback_lookup: dict[str, Any] = {
f'{row["trace_id"]}-{row["parent_span_id"]}': RunFeedback(**row) for row in r['rows']
}

r = await client.query_json_rows(sql=runs_query, min_timestamp=min_timestamp)
runs: list[AgentRunSummary] = []
with_feedback = 0
for row in r['rows']:
key = f'{row["trace_id"]}-{row["span_id"]}'
if feedback := feedback_lookup.get(key):
row['feedback'] = feedback
with_feedback += 1
runs.append(AgentRunSummary(**row))

logfire.info(f'Found {len(runs)} runs, {with_feedback} with feedback')
return runs


async def generate_evals_prompt(
name: str, instructions: str, output_type: type[Any] | None, runs: list[AgentRunSummary]
) -> str:
data: dict[str, Any] = {'agent_name': name, 'agent_instructions': instructions}
if output_type is not None:
data['output_schema'] = json.dumps(TypeAdapter(output_type).json_schema(), indent=2)
data['agent_runs'] = [run.model_dump(exclude_none=True) for run in runs]
prompt = format_as_xml(data, include_root_tag=False)
r = await auto_annotation_agent.run(prompt)
return r.output


async def main():
runs = await get_runs()
if runs:
prompt = await generate_evals_prompt(
'time_range_agent',
agent.instructions,
agent.TimeRangeResponse, # type: ignore
runs,
)
prompt_path = Path(__file__).parent / 'eval_agent_prompt.txt'
prompt_path.write_text(prompt)
print(f'prompt written to {prompt_path}')


if __name__ == '__main__':
asyncio.run(main())
Empty file.