
Agent tool calling works with mistral but fails with llama models (on AWS Bedrock) #2123

Description

Pydantic AI issue: Meta Llama models handle tool calling + structured output inconsistently -- Llama 4 models are essentially unusable with Pydantic AI

Test Description

Setup: Identical agent configuration across 4 models with:

  • Two simple tools: get_count_of_oranges() → returns 25, get_count_of_apples() → returns 30
  • Structured output: FruitCountResponse(fruit_count_by_color: dict[str, int])
  • Query: "Call find out total number of fruits"

Expected Behavior: All models should call both tools and return {'orange': 25, 'apple': 30}
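
Expressed with the FruitCountResponse model from the test code below, the expected output of a successful run is simply:

FruitCountResponse(fruit_count_by_color={"orange": 25, "apple": 30})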

Results

| Model | Tools Called | Output | Status |
| --- | --- | --- | --- |
| Mistral Large 2407 | ✅ Both tools | {'orange': 25, 'apple': 30} | PASS |
| Meta Llama 3.2 90B | ❌ None | {} (empty) | FAIL |
| Meta Llama 3.3 70B | ❌ None | UnexpectedModelBehavior validation error | FAIL |
| Meta Llama 4 Maverick | ❌ None | ValidationError: Input should be a valid integer | FAIL |

Issue Summary

Problem: Meta Llama models fail the basic combination of tool calling and structured output. The updated test reveals specific failure patterns:

  • Llama 3.2: Skips tools entirely, returns empty dict {}
  • Llama 3.3: Returns the tool call as plain text instead of executing it: '{"type": "function", "name": "final_result", "parameters": {"fruit_count_by_color": {"function_name": "get_count_of_oranges", "args": []}}}'
  • Llama 4: Returns tool references instead of values: {'orange': {'name': 'get_count_of_oranges', 'parameters': {}}, 'apple': {'name': 'get_count_of_apples', 'parameters': {}}}

Root Issue: Meta Llama models consistently confuse tool calling syntax with structured output format, returning tool metadata instead of actual values.
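
One possible mitigation (not part of the original test, offered only as a sketch) is an output validator that rejects responses where the model skipped the tools, forcing a retry. This assumes pydantic_ai's Agent.output_validator decorator and ModelRetry exception; the guard below only catches the empty-dict case (Llama 3.2), since the other failure modes already fail Pydantic validation of FruitCountResponse:

from pydantic_ai import ModelRetry


def add_output_guard(agent: Agent[AgentTestContext, FruitCountResponse]) -> None:
    """Attach a validator that retries when the model skips the tools."""

    @agent.output_validator
    async def reject_empty_counts(
        ctx: RunContext[AgentTestContext], output: FruitCountResponse
    ) -> FruitCountResponse:
        # Llama 3.2 pattern: no tools called, empty dict returned.
        if not output.fruit_count_by_color:
            raise ModelRetry(
                "fruit_count_by_color is empty; call get_count_of_oranges and "
                "get_count_of_apples and return their integer results."
            )
        return output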

Environment

  • AWS Bedrock, us-west-2
  • Pydantic AI latest via uv
  • Python 3.11.11

Test Code

Complete reproducible test at: /src/functions/slackbot/test_multi_tool_agent.py

Example Code

import asyncio
import logging
import os

import pytest
from pydantic import BaseModel
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.bedrock import BedrockConverseModel

# Configure more detailed logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Also enable pydantic_ai logging
pydantic_logger = logging.getLogger("pydantic_ai")
pydantic_logger.setLevel(logging.DEBUG)

# Enable even more detailed logging for validation errors
boto_logger = logging.getLogger("botocore")
boto_logger.setLevel(logging.INFO)  # Keep boto3 at INFO to avoid too much noise

# Enable validation logging
validation_logger = logging.getLogger("pydantic")
validation_logger.setLevel(logging.DEBUG)


class AgentTestContext(BaseModel):
    called_tools: list[str] = []


class FruitCountResponse(BaseModel):
    """Response from the fruit counting agent."""

    fruit_count_by_color: dict[str, int]


def setup_aws_environment():
    """Set up AWS environment variables for testing."""
    os.environ["AWS_DEFAULT_REGION"] = "us-west-2"


# Patch for Meta Llama models that don't support toolChoice.any
def _patched_map_tool_config(self, model_request_parameters):
    """Patched version that always uses 'auto' instead of 'any' for tool choice"""
    tools = self._get_tools(model_request_parameters)
    if not tools:
        return None

    # Always use 'auto' for Meta Llama models instead of 'any'
    tool_choice = {"auto": {}}
    tool_config = {"tools": tools, "toolChoice": tool_choice}

    return tool_config


def create_test_agent(model_id: str) -> Agent[AgentTestContext, FruitCountResponse]:
    """Create a test agent with two simple tools.

    Args:
        model_id: The model ID to use for the agent

    Returns:
        A configured agent instance
    """
    logger.info(f"Creating agent with model: {model_id}")

    model = BedrockConverseModel(model_name=model_id)

    # Apply Meta Llama patch for toolChoice.any issue
    if "meta.llama" in model_id.lower() or "llama" in model_id.lower():
        logger.info(f"Applying Meta Llama patch for model: {model_id}")
        model._map_tool_config = _patched_map_tool_config.__get__(model, EnhancedBedrockModel)

    agent = Agent(
        model=model,
        deps_type=AgentTestContext,
        output_type=FruitCountResponse,
        retries=3,
        output_retries=3,
        system_prompt="""
        You are a fruit counting assistant. You MUST use the available tools to get fruit counts.
        
        When asked about fruit counts:
        1. Call relevant tools to get fruit count
        2. Return a JSON response with fruit_count_by_color filled in with correct counts

        Example response: {"fruit_count_by_color": {"orange": 3, "apple": 7}}
        """,
    )

    @agent.tool
    async def get_count_of_oranges(ctx: RunContext[AgentTestContext]) -> int:
        """Get the current count of oranges in inventory."""
        logger.info("🍊 get_count_of_oranges tool called")
        ctx.deps.called_tools.append("get_count_of_oranges")

        # Simulate some work
        await asyncio.sleep(0.1)

        # Return a fixed count for testing
        orange_count = 25
        logger.info(f"🍊 Returning orange count: {orange_count}")
        return orange_count

    @agent.tool
    async def get_count_of_apples(ctx: RunContext[AgentTestContext]) -> int:
        """Get the current count of apples in inventory."""
        logger.info("🍎 get_count_of_apples tool called")
        ctx.deps.called_tools.append("get_count_of_apples")

        # Simulate some work
        await asyncio.sleep(0.1)

        # Return a fixed count for testing
        apple_count = 30
        logger.info(f"🍎 Returning apple count: {apple_count}")
        return apple_count

    return agent


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_id",
    [
        "mistral.mistral-large-2407-v1:0",
        "arn:aws:bedrock:us-west-2:<account_number>:inference-profile/us.meta.llama3-2-90b-instruct-v1:0",
        "arn:aws:bedrock:us-west-2:<account_number>:inference-profile/us.meta.llama3-3-70b-instruct-v1:0",
        "arn:aws:bedrock:us-west-2:<account_number>:inference-profile/us.meta.llama4-maverick-17b-instruct-v1:0",
    ],
)
@pytest.mark.timeout(120)  # 2 minute timeout
async def test_multi_tool_usage_parametrized(model_id: str) -> None:
    """Test if the agent uses multiple tools when asked for total fruit count.

    Args:
        model_id: The model ID to test with
    """
    # Set up AWS environment
    setup_aws_environment()

    logger.info(f"🧪 Starting test with model: {model_id}")
    logger.info(f"🔧 Model type: {'Meta Llama' if 'llama' in model_id.lower() else 'Mistral'}")

    # Create the test agent and context
    logger.info("📦 Creating test agent...")
    agent = create_test_agent(model_id)
    context = AgentTestContext()
    logger.info("✅ Agent created successfully")

    # Test with a more explicit query that should trigger both tools
    query = "Call find out total number of fruits"

    logger.info(f"🔍 Query: {query}")
    logger.info("🚀 Starting agent execution...")

    result = await agent.run(query, deps=context)

    logger.info(f"📝 Results from agent with model {model_id} produced results: {result.output}")
    # assert sum of fruit counts is correct
    assert sum(result.output.fruit_count_by_color.values()) == 55, "Total fruit count does not match expected value"

    # Check tool usage
    expected_tools = ["get_count_of_oranges", "get_count_of_apples"]
    tools_called = set(context.called_tools)
    expected_set = set(expected_tools)

    logger.info(f"🔧 Tools actually called: {context.called_tools}")
    logger.info(f"🎯 Expected tools: {expected_tools}")

    assert expected_set == tools_called, (
        f"Expected tools {expected_set} but got {tools_called}. "
        "This means the agent did not call all required tools."
    )


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

Python, Pydantic AI & LLM client version

pydantic-ai 0.2.18 (Python 3.11.11)
