Initial Checks
- I confirm that I'm using the latest version of Pydantic AI
- I confirm that I searched for my issue in https://github.com/pydantic/pydantic-ai/issues before opening this issue
Description
Pydantic AI Issue: Meta Llama model inconsistency with tool calling + structured output -- Llama 4 models are basically unusable with Pydantic AI
Test Description
Setup: Identical agent configuration across 4 models with:
- Two simple tools: `get_count_of_oranges()` → returns 25, `get_count_of_apples()` → returns 30
- Structured output: `FruitCountResponse(fruit_count_by_color: dict[str, int])`
- Query: "Call find out total number of fruits"

Expected Behavior: All models should call both tools and return `{'orange': 25, 'apple': 30}`
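For quick reference, here is a condensed, non-pytest sketch of this configuration (the full parametrized test is under Example Code below). `build_agent` and `main` are illustrative names, and the Mistral model ID shown is the one from the results table:

```python
import asyncio

from pydantic import BaseModel
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.bedrock import BedrockConverseModel


class FruitCountResponse(BaseModel):
    fruit_count_by_color: dict[str, int]


def build_agent(model_id: str) -> Agent[None, FruitCountResponse]:
    agent: Agent[None, FruitCountResponse] = Agent(
        model=BedrockConverseModel(model_name=model_id),
        output_type=FruitCountResponse,
    )

    @agent.tool
    async def get_count_of_oranges(ctx: RunContext[None]) -> int:
        return 25

    @agent.tool
    async def get_count_of_apples(ctx: RunContext[None]) -> int:
        return 30

    return agent


async def main() -> None:
    # Swap in each Bedrock Llama inference-profile ARN from the table below to reproduce the failures.
    agent = build_agent("mistral.mistral-large-2407-v1:0")
    result = await agent.run("Call find out total number of fruits")
    print(result.output.fruit_count_by_color)  # expected: {'orange': 25, 'apple': 30}


if __name__ == "__main__":
    asyncio.run(main())
```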
Results
| Model | Tools Called | Output | Status |
|---|---|---|---|
| Mistral Large 2407 | ✅ Both tools | `{'orange': 25, 'apple': 30}` | ✅ PASS |
| Meta Llama 3.2 90B | ❌ None | `{}` (empty) | ❌ FAIL |
| Meta Llama 3.3 70B | ❌ None | `UnexpectedModelBehavior` validation error | ❌ FAIL |
| Meta Llama 4 Maverick | ❌ None | `ValidationError: Input should be a valid integer` | ❌ FAIL |
Issue Summary
Problem: Meta Llama models fail the basic tool calling + structured output combination. The updated test reveals specific failure patterns:
- Llama 3.2: skips tools entirely, returns an empty dict `{}`
- Llama 3.3: returns the tool call as text: `'{"type": "function", "name": "final_result", "parameters": {"fruit_count_by_color": {"function_name": "get_count_of_oranges", "args": []}}}'`
- Llama 4: returns tool references instead of values: `{'orange': {'name': 'get_count_of_oranges', 'parameters': {}}, 'apple': {'name': 'get_count_of_apples', 'parameters': {}}}`

Root Issue: Meta Llama models consistently confuse tool calling syntax with the structured output format, returning tool metadata instead of actual values.
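To make the root issue concrete, the Llama 4 payload above can be fed directly into the output schema with no model involved; it fails with the same "Input should be a valid integer" error shown in the results table. A minimal sketch, reusing the `FruitCountResponse` model from the test code:

```python
from pydantic import BaseModel, ValidationError


class FruitCountResponse(BaseModel):
    fruit_count_by_color: dict[str, int]


# Payload returned by Llama 4 Maverick: tool metadata where integer counts belong.
llama4_payload = {
    "fruit_count_by_color": {
        "orange": {"name": "get_count_of_oranges", "parameters": {}},
        "apple": {"name": "get_count_of_apples", "parameters": {}},
    }
}

try:
    FruitCountResponse(**llama4_payload)
except ValidationError as exc:
    # Each value fails with "Input should be a valid integer",
    # the same error surfaced in the results table above.
    print(exc)
```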
Environment
- AWS Bedrock, us-west-2
- Pydantic AI 0.2.18 (latest at time of testing), installed via uv
- Python 3.11.11
Test Code
Complete reproducible test at: /src/functions/slackbot/test_multi_tool_agent.py
Example Code
```python
import asyncio
import logging
import os

import pytest
from pydantic import BaseModel

from pydantic_ai import Agent, RunContext
from pydantic_ai.models.bedrock import BedrockConverseModel

# Optional internal extension, only needed for the commented-out alternative below:
# from app.pydantic_ai_extensions.enhanced_bedrock_model import EnhancedBedrockModel

# Configure more detailed logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Also enable pydantic_ai logging
pydantic_logger = logging.getLogger("pydantic_ai")
pydantic_logger.setLevel(logging.DEBUG)

# Enable even more detailed logging for validation errors
boto_logger = logging.getLogger("botocore")
boto_logger.setLevel(logging.INFO)  # Keep boto3 at INFO to avoid too much noise

# Enable validation logging
validation_logger = logging.getLogger("pydantic")
validation_logger.setLevel(logging.DEBUG)


class AgentTestContext(BaseModel):
    called_tools: list[str] = []


class FruitCountResponse(BaseModel):
    """Response from the fruit counting agent."""

    fruit_count_by_color: dict[str, int]


def setup_aws_environment():
    """Set up AWS environment variables for testing."""
    os.environ["AWS_DEFAULT_REGION"] = "us-west-2"


# Patch for Meta Llama models that don't support toolChoice.any
def _patched_map_tool_config(self, model_request_parameters):
    """Patched version that always uses 'auto' instead of 'any' for tool choice."""
    tools = self._get_tools(model_request_parameters)
    if not tools:
        return None
    # Always use 'auto' for Meta Llama models instead of 'any'
    tool_choice = {"auto": {}}
    tool_config = {"tools": tools, "toolChoice": tool_choice}
    return tool_config


def create_test_agent(model_id: str) -> Agent[AgentTestContext, FruitCountResponse]:
    """Create a test agent with two simple tools.

    Args:
        model_id: The model ID to use for the agent

    Returns:
        A configured agent instance
    """
    logger.info(f"Creating agent with model: {model_id}")
    # model = EnhancedBedrockModel(model_name=model_id)
    model = BedrockConverseModel(model_name=model_id)

    # Apply Meta Llama patch for toolChoice.any issue
    if "meta.llama" in model_id.lower() or "llama" in model_id.lower():
        logger.info(f"Applying Meta Llama patch for model: {model_id}")
        # Bind the patched function as a method on this model instance
        model._map_tool_config = _patched_map_tool_config.__get__(model, type(model))

    agent = Agent(
        model=model,
        deps_type=AgentTestContext,
        output_type=FruitCountResponse,
        retries=3,
        output_retries=3,
        system_prompt="""
        You are a fruit counting assistant. You MUST use the available tools to get fruit counts.
        When asked about fruit counts:
        1. Call relevant tools to get fruit count
        2. Return a JSON response with fruit_count_by_color filled in with correct counts
        Example response: {"fruit_count_by_color": {"orange": 3, "apple": 7}}
        """,
    )

    @agent.tool
    async def get_count_of_oranges(ctx: RunContext[AgentTestContext]) -> int:
        """Get the current count of oranges in inventory."""
        logger.info("🍊 get_count_of_oranges tool called")
        ctx.deps.called_tools.append("get_count_of_oranges")
        # Simulate some work
        await asyncio.sleep(0.1)
        # Return a fixed count for testing
        orange_count = 25
        logger.info(f"🍊 Returning orange count: {orange_count}")
        return orange_count

    @agent.tool
    async def get_count_of_apples(ctx: RunContext[AgentTestContext]) -> int:
        """Get the current count of apples in inventory."""
        logger.info("🍎 get_count_of_apples tool called")
        ctx.deps.called_tools.append("get_count_of_apples")
        # Simulate some work
        await asyncio.sleep(0.1)
        # Return a fixed count for testing
        apple_count = 30
        logger.info(f"🍎 Returning apple count: {apple_count}")
        return apple_count

    return agent


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_id",
    [
        "mistral.mistral-large-2407-v1:0",
        "arn:aws:bedrock:us-west-2:<account_number>:inference-profile/us.meta.llama3-2-90b-instruct-v1:0",
        "arn:aws:bedrock:us-west-2:<account_number>:inference-profile/us.meta.llama3-3-70b-instruct-v1:0",
        "arn:aws:bedrock:us-west-2:<account_number>:inference-profile/us.meta.llama4-maverick-17b-instruct-v1:0",
    ],
)
@pytest.mark.timeout(120)  # 2 minute timeout
async def test_multi_tool_usage_parametrized(model_id: str) -> None:
    """Test if the agent uses multiple tools when asked for total fruit count.

    Args:
        model_id: The model ID to test with
    """
    # Set up AWS environment
    setup_aws_environment()

    logger.info(f"🧪 Starting test with model: {model_id}")
    logger.info(f"🔧 Model type: {'Meta Llama' if 'llama' in model_id.lower() else 'Mistral'}")

    # Create the test agent and context
    logger.info("📦 Creating test agent...")
    agent = create_test_agent(model_id)
    context = AgentTestContext()
    logger.info("✅ Agent created successfully")

    # Test with a more explicit query that should trigger both tools
    query = "Call find out total number of fruits"
    logger.info(f"🔍 Query: {query}")

    logger.info("🚀 Starting agent execution...")
    result = await agent.run(query, deps=context)
    logger.info(f"📝 Results from agent with model {model_id}: {result.output}")

    # Assert the sum of fruit counts is correct
    assert sum(result.output.fruit_count_by_color.values()) == 55, "Total fruit count does not match expected value"

    # Check tool usage
    expected_tools = ["get_count_of_oranges", "get_count_of_apples"]
    tools_called = set(context.called_tools)
    expected_set = set(expected_tools)
    logger.info(f"🔧 Tools actually called: {context.called_tools}")
    logger.info(f"🎯 Expected tools: {expected_tools}")
    assert expected_set == tools_called, (
        f"Expected tools {expected_set} but got {tools_called}. "
        "This means the agent did not call all required tools."
    )


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
```
Python, Pydantic AI & LLM client version
0.2.18