diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py index 39ec650f11cc..c8457a70e43b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py @@ -97,7 +97,7 @@ def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]: def parse_model_config_type( - model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], ) -> None: if _is_aoi_model_config(model_config): model_config["type"] = AZURE_OPENAI_TYPE @@ -106,9 +106,9 @@ def parse_model_config_type( def construct_prompty_model_config( - model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], - default_api_version: str, - user_agent: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + default_api_version: str, + user_agent: str, ) -> dict: parse_model_config_type(model_config) @@ -126,6 +126,7 @@ def construct_prompty_model_config( return prompty_model_config + def is_onedp_project(azure_ai_project: AzureAIProject) -> bool: """Check if the Azure AI project is an OneDP project. @@ -138,6 +139,7 @@ def is_onedp_project(azure_ai_project: AzureAIProject) -> bool: return True return False + def validate_azure_ai_project(o: object) -> AzureAIProject: fields = {"subscription_id": str, "resource_group_name": str, "project_name": str} @@ -230,7 +232,7 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict: k for k in annotations if (is_total and get_origin(annotations[k]) is not NotRequired) - or (not is_total and get_origin(annotations[k]) is Required) + or (not is_total and get_origin(annotations[k]) is Required) } missing_keys = required_keys - o.keys() @@ -291,7 +293,8 @@ def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool return cast(T_TypedDict, o) -def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool: + +def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool: """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score]. :param score: The score to check. @@ -310,6 +313,7 @@ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) return min_score <= numeric_score <= max_score + def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]: """Parse the output of prompt-based quality evaluators that return a score and reason. @@ -422,11 +426,11 @@ def raise_exception(msg, target): except ImportError as ex: raise MissingRequiredPackage( message="Please install 'azure-ai-inference' package to use SystemMessage, " - "UserMessage or AssistantMessage." + "UserMessage or AssistantMessage." ) from ex if isinstance(message, ChatRequestMessage) and not isinstance( - message, (UserMessage, AssistantMessage, SystemMessage) + message, (UserMessage, AssistantMessage, SystemMessage) ): raise_exception( f"Messages must be a strongly typed class of ChatRequestMessage. 
Message number: {num}", @@ -437,7 +441,7 @@ def raise_exception(msg, target): if isinstance(message, UserMessage): user_message_count += 1 if isinstance(message.content, list) and any( - isinstance(item, ImageContentItem) for item in message.content + isinstance(item, ImageContentItem) for item in message.content ): image_found = True continue @@ -481,6 +485,7 @@ def raise_exception(msg, target): ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, ) + def _extract_text_from_content(content): text = [] for msg in content: @@ -488,14 +493,18 @@ def _extract_text_from_content(content): text.append(msg['text']) return text -def _get_conversation_history(query): + +def _get_conversation_history(query, include_system_messages=False): all_user_queries = [] cur_user_query = [] all_agent_responses = [] cur_agent_response = [] + system_message = None for msg in query: if not 'role' in msg: continue + if include_system_messages and msg['role'] == 'system' and 'content' in msg: + system_message = msg.get('content', '') if msg['role'] == 'user' and 'content' in msg: if cur_agent_response != []: all_agent_responses.append(cur_agent_response) @@ -505,15 +514,15 @@ def _get_conversation_history(query): cur_user_query.append(text_in_msg) if msg['role'] == 'assistant' and 'content' in msg: - if cur_user_query !=[]: + if cur_user_query != []: all_user_queries.append(cur_user_query) cur_user_query = [] text_in_msg = _extract_text_from_content(msg['content']) if text_in_msg: cur_agent_response.append(text_in_msg) - if cur_user_query !=[]: + if cur_user_query != []: all_user_queries.append(cur_user_query) - if cur_agent_response !=[]: + if cur_agent_response != []: all_agent_responses.append(cur_agent_response) if len(all_user_queries) != len(all_agent_responses) + 1: @@ -524,31 +533,37 @@ def _get_conversation_history(query): category=ErrorCategory.INVALID_VALUE, blame=ErrorBlame.USER_ERROR, ) - return { - 'user_queries' : all_user_queries, - 'agent_responses' : all_agent_responses - } + 'system_message': system_message, + 'user_queries': all_user_queries, + 'agent_responses': all_agent_responses + } + def _pretty_format_conversation_history(conversation_history): """Formats the conversation history for better readability.""" formatted_history = "" - for i, (user_query, agent_response) in enumerate(zip(conversation_history['user_queries'], conversation_history['agent_responses']+[None])): - formatted_history+=f"User turn {i+1}:\n" + if 'system_message' in conversation_history and conversation_history['system_message'] is not None: + formatted_history += "SYSTEM MESSAGE:\n" + formatted_history += " " + conversation_history['system_message'] + "\n\n" + for i, (user_query, agent_response) in enumerate( + zip(conversation_history['user_queries'], conversation_history['agent_responses'] + [None])): + formatted_history += f"User turn {i + 1}:\n" for msg in user_query: - formatted_history+=" " + "\n ".join(msg) - formatted_history+="\n\n" + formatted_history += " " + "\n ".join(msg) + formatted_history += "\n\n" if agent_response: - formatted_history+=f"Agent turn {i+1}:\n" + formatted_history += f"Agent turn {i + 1}:\n" for msg in agent_response: - formatted_history+=" " + "\n ".join(msg) - formatted_history+="\n\n" + formatted_history += " " + "\n ".join(msg) + formatted_history += "\n\n" return formatted_history -def reformat_conversation_history(query, logger = None): + +def reformat_conversation_history(query, logger=None, include_system_messages=False): """Reformats the conversation history to a more compact 
representation.""" try: - conversation_history = _get_conversation_history(query) + conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages) return _pretty_format_conversation_history(conversation_history) except: # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned @@ -562,25 +577,59 @@ def reformat_conversation_history(query, logger = None): logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}") return query -def _get_agent_response(agent_response_msgs): - """Extracts the text from the agent response content.""" + +def _get_agent_response(agent_response_msgs, include_tool_messages=False): + """Extracts formatted agent response including text, and optionally tool calls/results.""" agent_response_text = [] + tool_results = {} + + # First pass: collect tool results + if include_tool_messages: + for msg in agent_response_msgs: + if msg.get("role") == "tool" and "tool_call_id" in msg: + for content in msg.get("content", []): + if content.get("type") == "tool_result": + result = content.get("tool_result") + tool_results[msg["tool_call_id"]] = f'[TOOL_RESULT] {result}' + + # Second pass: parse assistant messages and tool calls for msg in agent_response_msgs: - if 'role' in msg and msg['role'] == 'assistant' and 'content' in msg: - text = _extract_text_from_content(msg['content']) + if 'role' in msg and msg.get("role") == "assistant" and "content" in msg: + text = _extract_text_from_content(msg["content"]) if text: agent_response_text.extend(text) + if include_tool_messages: + for content in msg.get("content", []): + # Todo: Verify if this is the correct way to handle tool calls + if content.get("type") == "tool_call": + if "tool_call" in content: + tc = content.get("tool_call", {}) + func_name = tc.get("function", {}).get("name", "") + args = tc.get("function", {}).get("arguments", {}) + tool_call_id = tc.get("id") + else: + tool_call_id = content.get("tool_call_id") + func_name = content.get("name", "") + args = content.get("arguments", {}) + args_str = ", ".join(f'{k}="{v}"' for k, v in args.items()) + call_line = f'[TOOL_CALL] {func_name}({args_str})' + agent_response_text.append(call_line) + if tool_call_id in tool_results: + agent_response_text.append(tool_results[tool_call_id]) + return agent_response_text -def reformat_agent_response(response, logger = None): + +def reformat_agent_response(response, logger=None, include_tool_messages=False): try: if response is None or response == []: return "" - agent_response = _get_agent_response(response) + agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages) if agent_response == []: # If no message could be extracted, likely the format changed, fallback to the original response in that case if logger: - logger.warning(f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}") + logger.warning( + f"Empty agent response extracted, likely due to input schema change. 
Falling back to using the original response: {response}") return response return "\n".join(agent_response) except: @@ -590,6 +639,18 @@ def reformat_agent_response(response, logger = None): logger.warning(f"Agent response could not be parsed, falling back to original response: {response}") return response + +def reformat_tool_definitions(tool_definitions, logger=None): + output_lines = ["TOOL DEFINITIONS:"] + for tool in tool_definitions: + name = tool.get("name", "unnamed_tool") + desc = tool.get("description", "").strip() + params = tool.get("parameters", {}).get("properties", {}) + param_names = ", ".join(params.keys()) if params else "no parameters" + output_lines.append(f"- {name}: {desc} (inputs: {param_names})") + return "\n".join(output_lines) + + def upload(path: str, container_client: ContainerClient, logger=None): """Upload files or directories to Azure Blob Storage using a container client. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py index a246007f47b1..52c1c3ab64ac 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py @@ -3,16 +3,19 @@ # --------------------------------------------------------- import os import math +import logging from typing import Dict, Union, List, Optional from typing_extensions import overload, override from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase -from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score +from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions from azure.ai.evaluation._model_configurations import Message from azure.ai.evaluation._common._experimental import experimental +logger = logging.getLogger(__name__) + @experimental class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on: @@ -142,21 +145,23 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TASK_ADHERENCE_EVALUATOR, ) - + eval_input['query'] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True) + eval_input['response'] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True) + if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None: + eval_input['tool_definitions'] = reformat_tool_definitions(eval_input["tool_definitions"], logger) llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) - - score = math.nan - if llm_output: - score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]") - - score_result = 'pass' if score >= self.threshold else 'fail' - + if isinstance(llm_output, dict): + score = float(llm_output.get("score", math.nan)) + score_result = "pass" if score >= self.threshold else "fail" + reason = llm_output.get("explanation", "") return { f"{self._result_key}": score, f"{self._result_key}_result": score_result, f"{self._result_key}_threshold": self.threshold, 
f"{self._result_key}_reason": reason, + f"{self._result_key}_additional_details": llm_output } - + if logger: + logger.warning("LLM output is not a dictionary, returning NaN for the score.") return {self._result_key: math.nan} diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty index b452dd85b675..0d11797334c5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty @@ -1,6 +1,6 @@ --- name: TaskAdherence -description: Evaluates Task Adherence score for QA scenario +description: Evaluates Task Adherence score model: api: chat parameters: @@ -10,7 +10,7 @@ model: presence_penalty: 0 frequency_penalty: 0 response_format: - type: text + type: json_object inputs: query: @@ -21,97 +21,385 @@ inputs: type: string optional: true default: "[]" - --- system: -# Instruction -## Context -### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided. -- **Definition**: Based on the provided query, response, and tool definitions, evaluate the agent's adherence to the assigned task. -- **Data**: Your input data includes query, response, and tool definitions. -- **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways. +You are an expert evaluator who scores how well an AI assistant executed the intended task based on system constraints and user requests. + +user: +ROLE +==== +You are Task-Adherence-Judge, an impartial evaluator who scores task execution quality. + +INPUT +===== +CONVERSATION_HISTORY: {{query}} +AGENT_RESPONSE: {{response}} +TOOL_DEFINITIONS: {{tool_definitions}} + +CONVERSATION_HISTORY includes the full dialogue. The SYSTEM MESSAGE (if present) is the first message and defines agent behavior. +AGENT_RESPONSE is the agent's reply to the latest user query. +TOOL_DEFINITIONS lists available tools. + +EVALUATION STEPS +================ + +A. Extract System Instructions (if system message exists): +- Identify any mandatory rules the agent must follow (e.g., “must”, “never”, “only”, or implied constraints). +- Note preferences that influence task execution (e.g., “should”, “prefer”, “avoid”). +- Record tool usage expectations (required, preferred, or restricted tools). + +B. Identify the User Request: +- Determine what the user explicitly wants: action, answer, or outcome. +- Use full conversation history if needed for clarity. + +C. Define the Intended Task: +- Combine the user request with mandatory system constraints. +- Adjust execution details based on preferences. +- This defines the correct task behavior. + +D. Evaluate Agent Execution: +1. Constraint Adherence: + - Were all mandatory rules followed? + - Any violation should cap the score at 3. + +2. Task Completion: + - Did the agent fulfill the defined task? + - Were any core parts of the request missed? + +3. Tool Usage Evaluation: + - Was tool use appropriate, as per instructions or task need? 
+ - **System mandates tool**: Required (score ≤2 if not used correctly) + - **System prefers tool**: Preferred (minor score impact if not used) + - **System silent on tools**: Evaluate based on task effectiveness + - **System restricts tools**: Discouraged (score ≤3 if used when restricted) + + +E. Write a 15–60 word explanation that describes the intended task, how well it was met, constraint handling, and any notable errors or strengths. + +F. Choose the single closest integer score (1,2,3,4 or 5) from the SCORING RUBRIC below. + +OUTPUT FORMAT +============= +Output a JSON object with the below keys in the given sequence: + 1) explanation: a concise 15–60 word summary of how well the assistant followed the task + 2) agent_perceived_task: what the assistant believed it had to do, based on its response; + 3) actual_task: what the task truly was, considering both system prompt and user latest query, with system taking precedence. + 4) tools_used: List of tools used by the agent + 5) expected_tool_calls: List of tools the agent should have used (if applicable) + 6) score: an integer score from 1 (very poor) to 5 (excellent) on how well the agent resolved the user's intent within constraints + +SCORING RUBRIC WITH EXAMPLES +============================ + +**Score 5 - Fully Adherent:** +- Executed intended task completely +- Followed all mandatory system rules +- Used tools correctly when required by system or task necessity +- User's need fully satisfied within constraints + +Example A (single-turn) +======================= +CONVERSATION_HISTORY: +User turn 1: + Suggest a good mystery book. + +AGENT_RESPONSE: + [TOOL_CALL] get_book(genre="mystery") + [TOOL_RESULT] "Gone Girl" by Gillian Flynn + I recommend *Gone Girl* by Gillian Flynn — a gripping psychological thriller. + +TOOL DEFINITIONS: +- get_book: Returns a book recommendation based on genre. (inputs: genre) + +EXPECTED OUTPUT: +{ + "explanation": "The assistant correctly identified the task, used the appropriate tool as needed, and provided a clear and suitable book recommendation.", + "agent_perceived_task": "recommend a mystery book", + "actual_task": "Suggest a good mystery book", + "tools_used": ["get_book"], + "expected_tool_calls": ["get_book"], + "score": 5 +} + +Example B (multi-turn) +====================== +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +Do not use tools unless explicitly requested by the user. + +User turn 1: + Can you recommend a good book? + +Agent turn 1: + Any preferred genre? + +User turn 2: + Historical fiction. + +AGENT_RESPONSE: + You might enjoy *The Book Thief* by Markus Zusak — a powerful story set during World War II. + +TOOL DEFINITIONS: +- get_book: Returns book suggestions based on genre. (inputs: genre) + +EXPECTED OUTPUT: +{ + "explanation": "The assistant correctly followed the system instruction by not using tools since the user never explicitly requested one. 
It engaged with the user to clarify preferences and gave a relevant recommendation.", + "agent_perceived_task": "recommend a historical fiction book without tool use", + "actual_task": "Recommend a historical fiction book (no tool allowed unless explicitly requested)", + "tools_used": [], + "expected_tool_calls": [], + "score": 5 +} + +**Score 4 - Mostly Adherent:** +- Executed core intended task +- Followed mandatory system rules +- Minor execution gaps that don't prevent task completion +- Tool usage mostly appropriate + +Example A (single-turn) +======================= +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +You must respond clearly and concisely using available tools. Minor omissions are acceptable. + +User turn 1: + What’s the weather like in Paris? + +AGENT_RESPONSE: + [TOOL_CALL] get_weather(city="Paris") + [TOOL_RESULT] 22°C, partly cloudy + Paris is currently 22°C and partly cloudy. + +TOOL DEFINITIONS: +- get_weather: Provides current weather details for a city. (inputs: city) + +EXPECTED OUTPUT: +{ + "explanation": "Assistant used the correct tool and returned a valid weather update as required by system rules, but lacked minor context like timestamp or unit confirmation.", + "agent_perceived_task": "get weather for Paris", + "actual_task": "Provide current weather in Paris", + "tools_used": ["get_weather"], + "expected_tool_calls": ["get_weather"], + "score": 4 +} + +Example B (multi-turn) +====================== +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +Always ask for genre before making a book suggestion. + +User turn 1: + Recommend a good book to read. + +Agent turn 1: + Any preferred genre? + +User turn 2: + Something romantic. + +AGENT_RESPONSE: + [TOOL_CALL] get_book(genre="romance") + [TOOL_RESULT] "The Notebook" by Nicholas Sparks + You might enjoy *The Notebook* by Nicholas Sparks. + +TOOL DEFINITIONS: +- get_book: Returns book suggestions based on genre. (inputs: genre) + +EXPECTED OUTPUT: +{ + "explanation": "Assistant followed system rule to ask for genre, used the correct tool, and offered a relevant result, though without justification.", + "agent_perceived_task": "recommend a romance novel", + "actual_task": "Recommend a romance book (after genre clarification as per system)", + "tools_used": ["get_book"], + "expected_tool_calls": ["get_book"], + "score": 4 +} + +**Score 3 - Partially Adherent:** +- Attempted intended task but missed key components +- Some system rule violations OR incomplete task execution +- Tool usage suboptimal or missing when preferred by system +- User's need partially satisfied + +Example A (single-turn) +======================= +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +Always rely on tools for factual queries like time and weather. + +User turn 1: + What’s the weather in Sydney? + +AGENT_RESPONSE: + It's probably sunny this time of year. + +TOOL DEFINITIONS: +- get_weather: Retrieve current weather. (inputs: city) + +EXPECTED OUTPUT: +{ + "explanation": "Assistant guessed the weather instead of using the available tool, violating the system’s directive to avoid speculation.", + "agent_perceived_task": "estimate Sydney's weather", + "actual_task": "Get real-time weather in Sydney (tool use required per system)", + "tools_used": [], + "expected_tool_calls": ["get_weather"], + "score": 3 +} + +Example B (multi-turn) +====================== +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +If tools are available, prefer them for recommendations. + +User turn 1: + Can you suggest a book? + +Agent turn 1: + Sure, any specific genre? 
+ +User turn 2: + Thriller. + +AGENT_RESPONSE: + You might enjoy *The Da Vinci Code*. -# Definition +TOOL DEFINITIONS: +- get_book: Get book suggestions by genre. (inputs: genre) -**Level 1: Fully Inadherent** +EXPECTED OUTPUT: +{ + "explanation": "Assistant gave a relevant suggestion but skipped the tool, despite system prompt recommending its use for consistency.", + "agent_perceived_task": "recommend thriller book", + "actual_task": "Suggest a thriller book (prefer tool per system)", + "tools_used": [], + "expected_tool_calls": ["get_book"], + "score": 3 +} -**Definition:** -Response completely ignores instructions or deviates significantly +**Score 2 - Barely Adherent:** +- Minimal task execution +- Violated mandatory system rules OR failed core task requirements +- Wrong tool usage or missing when required by system +- User's need largely unmet -**Example:** - **Query:** What is a recommended weekend itinerary in Paris? - **Response:** Paris is a lovely city with a rich history. +Example A (single-turn) +======================= +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +You are a tool-based agent. Always respond using tools. -Explanation: This response completely misses the task by not providing any itinerary details. It offers a generic statement about Paris rather than a structured travel plan. +User turn 1: + What's the time in Madrid? +AGENT_RESPONSE: + Europe has different time zones. -**Level 2: Barely Adherent** +TOOL DEFINITIONS: +- get_time: Return current time for a city. (inputs: city) -**Definition:** -Response partially aligns with instructions but has critical gaps. +EXPECTED OUTPUT: +{ + "explanation": "Assistant responded vaguely and violated the system rule requiring tool usage. Failed to directly answer the task.", + "agent_perceived_task": "talk about time zones", + "actual_task": "Tell the current time in Madrid (using tool as required by system)", + "tools_used": [], + "expected_tool_calls": ["get_time"], + "score": 2 +} -**Example:** - **Query:** What is a recommended weekend itinerary in Paris? - **Response:** Spend your weekend visiting famous places in Paris. +Example B (multi-turn) +====================== +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +You are a book recommender assistant. Do **not** ask follow-up questions. Only respond with direct suggestions using tools. -Explanation: While the response hints at visiting well-known sites, it is extremely vague and lacks specific details, such as which sites to visit or any order of activities, leaving major gaps in the instructions. +User turn 1: + Recommend a good fantasy book. +Agent turn 1: + Do you prefer magic or adventure? -**Level 3: Moderately Adherent** +User turn 2: + Magic. -**Definition:** -Response meets the core requirements but lacks precision or clarity. +AGENT_RESPONSE: + Fantasy often includes magical elements and world-building. -**Example:** - **Query:** What is a recommended weekend itinerary in Paris? - **Response:** Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday. +TOOL DEFINITIONS: +- get_book: Retrieve books filtered by genre. (inputs: genre) -Explanation: This answer meets the basic requirement by naming a few key attractions and assigning them to specific days. However, it lacks additional context, such as timings, additional activities, or details to make the itinerary practical and clear. +EXPECTED OUTPUT: +{ + "explanation": "The assistant violated the system's instruction not to ask clarifying questions and failed to use the required tool. 
Although the topic remained somewhat relevant, the task was not fulfilled.", + "agent_perceived_task": "discuss fantasy genre", + "actual_task": "Recommend a fantasy book using tool without asking follow-ups (per system)", + "tools_used": [], + "expected_tool_calls": ["get_book"], + "score": 2 +} +**Score 1 - Non-Adherent:** +- Failed to execute intended task +- Ignored mandatory system constraints +- Completely inappropriate or missing tool usage +- Response irrelevant to user's need -**Level 4: Mostly Adherent** +Example A (single-turn) +======================= +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +Only answer factual questions using tools. Avoid personal commentary. -**Definition:** -Response is clear, accurate, and aligns with instructions with minor issues. +User turn 1: + What's the current time in Berlin? -**Example:** - **Query:** What is a recommended weekend itinerary in Paris? - **Response:** For a weekend in Paris, start Saturday with a morning visit to the Eiffel Tower, then head to the Louvre in the early afternoon. In the evening, enjoy a leisurely walk along the Seine. On Sunday, begin with a visit to Notre-Dame Cathedral, followed by exploring the art and cafés in Montmartre. This plan offers a mix of cultural visits and relaxing experiences. +AGENT_RESPONSE: + I prefer mornings to evenings. How about you? -Explanation: This response is clear, structured, and provides a concrete itinerary with specific attractions and a suggested order of activities. It is accurate and useful, though it might benefit from a few more details like exact timings or restaurant suggestions to be perfect. +TOOL DEFINITIONS: +- get_time: Return time for a given city. (inputs: city) +EXPECTED OUTPUT: +{ + "explanation": "Assistant ignored both the user's factual query and the system's constraint to avoid personal remarks. No tool used.", + "agent_perceived_task": "chat about preferences", + "actual_task": "Tell the current time in Berlin (using tool, avoid chit-chat as per system)", + "tools_used": [], + "expected_tool_calls": ["get_time"], + "score": 1 +} -**Level 5: Fully Adherent** +Example B (multi-turn) +====================== +CONVERSATION_HISTORY: +SYSTEM_PROMPT: +Only provide weather updates using tools. Do not include personal opinions. -**Definition:** -Response is flawless, accurate, and follows instructions to the letter. +User turn 1: + I need a weather update for Rome. -**Example:** - **Query:** What is a recommended weekend itinerary in Paris? - **Response:** Here is a detailed weekend itinerary in Paris: -Saturday: -Morning: Begin your day with a visit to the Eiffel Tower to admire the views from the top. -Early Afternoon: Head to the Louvre for a guided tour of its most famous exhibits. -Late Afternoon: Take a relaxing walk along the Seine, stopping at local boutiques. -Evening: Enjoy dinner at a classic Parisian bistro near the river. -Sunday: -Morning: Visit the Notre-Dame Cathedral to explore its architecture and history. -Midday: Wander the charming streets of Montmartre, stopping by art galleries and cafés. -Afternoon: Finish your trip with a scenic boat tour on the Seine. -This itinerary balances cultural immersion, leisure, and local dining experiences, ensuring a well-rounded visit. +Agent turn 1: + Celsius or Fahrenheit? -Explanation: This response is comprehensive and meticulously follows the instructions. It provides detailed steps, timings, and a variety of activities that fully address the query, leaving no critical gaps. +User turn 2: + Celsius. 
-# Data -Query: {{query}} -Response: {{response}} -Tool Definitions: {{tool_definitions}} +AGENT_RESPONSE: + I once visited Rome in summer. Beautiful city! -# Tasks -## Please provide your assessment Score for the previous answer. Your output should include the following information: -- **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:". -- **Explanation**: a very short explanation of why you think the input data should get that Score. -- **Score**: based on your previous analysis, provide your Score. The answer you give MUST be an integer score ("1", "2", ...) based on the categories of the definitions. +TOOL DEFINITIONS: +- get_weather: Provides weather data for a given city. (inputs: city, unit) -## Please provide your answers between the tags: your chain of thoughts, your explanation, your score. -# Output \ No newline at end of file +EXPECTED OUTPUT: +{ + "explanation": "Assistant acknowledged the input but failed to execute the task or use the required tool, and added personal opinion against system guidance.", + "agent_perceived_task": "share travel experience", + "actual_task": "Provide current weather in Celsius for Rome (using tool, no personal commentary)", + "tools_used": [], + "expected_tool_calls": ["get_weather"], + "score": 1 +} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py index 49628e51da78..ad97765c3543 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py @@ -12,7 +12,8 @@ _pretty_format_conversation_history, reformat_conversation_history, _get_agent_response, - reformat_agent_response + reformat_agent_response, + reformat_tool_definitions ) from azure.ai.evaluation._exceptions import EvaluationException, ErrorMessage @@ -306,7 +307,7 @@ def test__get_conversation_history(self): "content": [{"type": "text", "text": "What is the weather?"}] }, { - "role": "assistant", + "role": "assistant", "content": [{"type": "text", "text": "It's sunny today."}] }, { @@ -314,7 +315,7 @@ def test__get_conversation_history(self): "content": [{"type": "text", "text": "Will it rain tomorrow?"}] } ] - + result = _get_conversation_history(query) expected = { 'user_queries': [[["What is the weather?"]], [["Will it rain tomorrow?"]]], @@ -339,7 +340,7 @@ def test__get_conversation_history(self): ] } ] - + # there is an assertion because there is one user query ["Hello", "How are you?"] and one agent response ["Hi there!", "I'm doing well, thanks."] # the user query length needs to be one more than the agent response length with pytest.raises(EvaluationException, match=str(ErrorMessage.MALFORMED_CONVERSATION_HISTORY)): @@ -356,11 +357,11 @@ def test__get_conversation_history(self): "content": [{"type": "text", "text": "First answer"}] }, { - "role": "user", + "role": "user", "content": [{"type": "text", "text": "Second question"}] } ] - + result = _get_conversation_history(query) expected = { 'user_queries': [[["First question"]], [["Second question"]]], @@ -378,7 +379,7 @@ def test__get_conversation_history_with_invalid_data(self): "content": [{"type": "text", "text": "Has role"}] } ] - + result = _get_conversation_history(query) expected = { 'user_queries': [[["Has role"]]], @@ -394,7 +395,7 @@ 
def test__get_conversation_history_with_invalid_data(self): "content": [{"type": "text", "text": "Has content"}] } ] - + result = _get_conversation_history(query) expected = { 'user_queries': [[["Has content"]]], @@ -408,7 +409,7 @@ def test__pretty_format_conversation_history(self): 'user_queries': [[["What is the weather?"]], [["Will it rain tomorrow?"]]], 'agent_responses': [[["It's sunny today."]]] } - + result = _pretty_format_conversation_history(conversation_history) expected = ( "User turn 1:\n" @@ -425,7 +426,7 @@ def test__pretty_format_conversation_history(self): 'user_queries': [[["Hello", "How are you?"]]], 'agent_responses': [[["Hi there!", "I'm doing well, thanks."]]] } - + result = _pretty_format_conversation_history(conversation_history) expected = ( "User turn 1:\n" @@ -452,7 +453,7 @@ def test_reformat_conversation_history(self): "content": [{"type": "text", "text": "Tell me more."}] } ] - + result = reformat_conversation_history(query) expected = ( "User turn 1:\n" @@ -481,7 +482,7 @@ def test__get_agent_response(self): ] } ] - + result = _get_agent_response(agent_response_msgs) assert result == ["Hello!", "How can I help you?"] @@ -492,11 +493,11 @@ def test__get_agent_response(self): "content": [{"type": "text", "text": "First response"}] }, { - "role": "assistant", + "role": "assistant", "content": [{"type": "text", "text": "Second response"}] } ] - + result = _get_agent_response(agent_response_msgs) assert result == ["First response", "Second response"] @@ -511,7 +512,7 @@ def test__get_agent_response(self): "content": [{"type": "text", "text": "Assistant message"}] } ] - + result = _get_agent_response(agent_response_msgs) assert result == ["Assistant message"] @@ -528,7 +529,7 @@ def test__get_agent_response(self): "content": [{"type": "text", "text": "Valid message"}] } ] - + result = _get_agent_response(agent_response_msgs) assert result == ["Valid message"] @@ -544,7 +545,7 @@ def test_reformat_agent_response(self): ] } ] - + result = reformat_agent_response(response) assert result == "Hello!\nHow can I help you?" 
@@ -587,7 +588,7 @@ def test_edge_cases_and_error_handling(self): "content": [{"type": "text", "text": "Response without user query"}] } ] - + with pytest.raises(EvaluationException, match=str(ErrorMessage.MALFORMED_CONVERSATION_HISTORY)): _get_conversation_history(query_with_unbalanced_turns) @@ -599,7 +600,7 @@ def test_extract_text_from_content_with_list(self): {"text": " world"} ] assert _extract_text_from_content(content) == ["Hello", " world"] - + # Test with mixed content (text and non-text) content = [ {"text": "Hello"}, @@ -607,10 +608,10 @@ def test_extract_text_from_content_with_list(self): {"text": " world"} ] assert _extract_text_from_content(content) == ["Hello", " world"] - + # Test with empty list assert _extract_text_from_content([]) == [] - + # Test with non-text items only content = [ {"type": "image", "url": "image.jpg"}, @@ -626,14 +627,14 @@ def test_get_conversation_history_with_queries_and_responses(self): {"role": "assistant", "content": [{"text": "Hi there!"}]}, {"role": "user", "content": [{"text": "How are you?"}]} ] - + result = _get_conversation_history(conversation) expected = { 'user_queries': [[["Hello"]], [["How are you?"]]], 'agent_responses': [[["Hi there!"]]] } assert result == expected - + conversation = [] with pytest.raises(EvaluationException, match=str(ErrorMessage.MALFORMED_CONVERSATION_HISTORY)): _get_conversation_history(conversation) @@ -645,7 +646,7 @@ def test_pretty_format_conversation_history_with_dict(self): 'user_queries': [[["Hello"]], [["How are you?"]]], 'agent_responses': [[["Hi there!"]]] } - + formatted = _pretty_format_conversation_history(conversation_history) assert "User turn 1:" in formatted assert "Hello" in formatted @@ -662,12 +663,12 @@ def test_conversation_history_integration(self): {"role": "assistant", "content": [{"text": "It's sunny today."}]}, {"role": "user", "content": [{"text": "Will it rain tomorrow?"}]} ] - + # Test reformatting formatted = reformat_conversation_history(conversation) assert isinstance(formatted, str) assert "User turn" in formatted - + # Test fallback behavior with malformed conversation malformed_conversation = {"invalid": "data"} formatted = reformat_conversation_history(malformed_conversation) @@ -681,13 +682,13 @@ def test_get_agent_response_with_list(self): {"role": "user", "content": [{"text": "Hi"}]}, {"role": "assistant", "content": [{"text": "How can I help?"}]} ] - + result = _get_agent_response(messages) assert result == ["Hello!", "How can I help?"] - + # Test with empty list assert _get_agent_response([]) == [] - + # Test with no assistant messages messages = [{"role": "user", "content": [{"text": "Hello"}]}] assert _get_agent_response(messages) == [] @@ -699,13 +700,13 @@ def test_agent_response_integration(self): {"role": "assistant", "content": [{"text": "Hello!"}]}, {"role": "assistant", "content": [{"text": "How can I help?"}]} ] - + formatted = reformat_agent_response(response) assert formatted == "Hello!\nHow can I help?" 
- + # Test with empty response assert reformat_agent_response([]) == "" - + # Test fallback behavior malformed_response = {"invalid": "structure"} formatted = reformat_agent_response(malformed_response) @@ -713,7 +714,7 @@ def test_agent_response_integration(self): def test_utility_functions_edge_cases(self): """Test edge cases and error handling for utility functions.""" - + # Test _extract_text_from_content with malformed data malformed_content = [ {"missing_text": "Hello"}, @@ -722,14 +723,120 @@ def test_utility_functions_edge_cases(self): # Should handle gracefully and extract what it can result = _extract_text_from_content(malformed_content) assert result == ["world"] - + # Test functions with very large inputs large_content = [{"text": "x" * 1000}] * 10 result = _extract_text_from_content(large_content) assert len(result) == 10 assert all(len(text) == 1000 for text in result) - + # Test with unicode content unicode_content = [{"text": "Hello 世界 🌍"}] result = _extract_text_from_content(unicode_content) assert result == ["Hello 世界 🌍"] + + def test_reformat_agent_response_with_tool_calls(self): + response = [ + {"role": "assistant", "content": [{"type": "text", "text": "Let me check that for you."}]}, + {"role": "assistant", "content": [{"type": "tool_call", "tool_call": {"id": "tool_call_1", "type": "function", "function": {"name": "get_orders", "arguments": {"account_number": "123"}}}}]}, + {"role": "tool", "tool_call_id": "tool_call_1", "content": [{"type": "tool_result", "tool_result": "[{ \"order_id\": \"A1\" }]"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "You have one order on file."}]} + ] + + formatted = reformat_agent_response(response) + + assert "[TOOL_CALL] get_orders(account_number=\"123\")" in formatted + assert "[TOOL_RESULT] [{ \"order_id\": \"A1\" }]" in formatted + assert "Let me check that for you." in formatted + assert "You have one order on file." in formatted + + def test_reformat_agent_response_without_tool_calls(self): + response = [ + {"role": "assistant", "content": [{"type": "text", "text": "Let me check that for you."}]}, + {"role": "assistant", "content": [{"type": "tool_call", "tool_call": {"id": "tool_call_1", "type": "function", "function": {"name": "get_orders", "arguments": {"account_number": "123"}}}}]}, + {"role": "tool", "tool_call_id": "tool_call_1", "content": [{"type": "tool_result", "tool_result": "[{ \"order_id\": \"A1\" }]"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "You have one order on file."}]} + ] + + formatted = reformat_agent_response(response, include_tool_messages=False) + + assert formatted == "Let me check that for you.\nYou have one order on file." + + def test_single_tool_with_parameters(self): + tools = [{ + "name": "search", + "description": "Searches the web.", + "parameters": { + "properties": { + "query": {"type": "string"}, + "lang": {"type": "string"} + } + } + }] + expected_output = ( + "TOOL DEFINITIONS:\n" + "- search: Searches the web. (inputs: query, lang)" + ) + self.assertEqual(reformat_tool_definitions(tools), expected_output) + + def test_tool_with_no_parameters(self): + tools = [{ + "name": "ping", + "description": "Check if server is reachable.", + "parameters": {} + }] + expected_output = ( + "TOOL DEFINITIONS:\n" + "- ping: Check if server is reachable. 
(inputs: no parameters)" + ) + self.assertEqual(reformat_tool_definitions(tools), expected_output) + + def test_tool_missing_description_and_parameters(self): + tools = [{"name": "noop"}] + expected_output = ( + "TOOL DEFINITIONS:\n" + "- noop: (inputs: no parameters)" + ) + self.assertEqual(reformat_tool_definitions(tools), expected_output) + + def test_tool_missing_name(self): + tools = [{ + "description": "Does something.", + "parameters": { + "properties": {"x": {"type": "number"}} + } + }] + expected_output = ( + "TOOL DEFINITIONS:\n" + "- unnamed_tool: Does something. (inputs: x)" + ) + self.assertEqual(reformat_tool_definitions(tools), expected_output) + + def test_multiple_tools(self): + tools = [ + { + "name": "alpha", + "description": "Tool A.", + "parameters": { + "properties": { + "a1": {"type": "string"} + } + } + }, + { + "name": "beta", + "description": "Tool B.", + "parameters": {} + } + ] + expected_output = ( + "TOOL DEFINITIONS:\n" + "- alpha: Tool A. (inputs: a1)\n" + "- beta: Tool B. (inputs: no parameters)" + ) + self.assertEqual(reformat_tool_definitions(tools), expected_output) + + def test_empty_tool_list(self): + tools = [] + expected_output = "TOOL DEFINITIONS:" + self.assertEqual(reformat_tool_definitions(tools), expected_output)
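
Usage sketch (not part of the patch): a minimal example of how the reformatting helpers added in this change could be exercised together, mirroring the way TaskAdherenceEvaluator._do_eval now calls them. The function names, keyword arguments, and the tool-call/tool-result message shapes follow the code and unit tests above; the import path, the logger setup, and the sample conversation data are illustrative assumptions rather than a documented public API.

    # Illustrative only: assumes the private module path used inside this patch.
    import logging

    from azure.ai.evaluation._common.utils import (
        reformat_agent_response,
        reformat_conversation_history,
        reformat_tool_definitions,
    )

    logger = logging.getLogger(__name__)

    # Hypothetical conversation: a system message plus one user turn.
    # The system content is a plain string, matching what _get_conversation_history reads back out.
    query = [
        {"role": "system", "content": "Only answer factual questions using tools."},
        {"role": "user", "content": [{"type": "text", "text": "What's the weather in Paris?"}]},
    ]

    # Hypothetical agent response with a tool call and its result,
    # shaped like the messages in test_reformat_agent_response_with_tool_calls.
    response = [
        {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_call",
                    "tool_call": {
                        "id": "call_1",
                        "type": "function",
                        "function": {"name": "get_weather", "arguments": {"city": "Paris"}},
                    },
                }
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [{"type": "tool_result", "tool_result": "22C, partly cloudy"}],
        },
        {"role": "assistant", "content": [{"type": "text", "text": "Paris is 22C and partly cloudy."}]},
    ]

    tool_definitions = [
        {
            "name": "get_weather",
            "description": "Provides current weather details for a city.",
            "parameters": {"properties": {"city": {"type": "string"}}},
        }
    ]

    # System messages and tool calls/results are opted in, as the evaluator now does.
    print(reformat_conversation_history(query, logger, include_system_messages=True))
    print(reformat_agent_response(response, logger, include_tool_messages=True))
    print(reformat_tool_definitions(tool_definitions, logger))

The printed output would contain the "SYSTEM MESSAGE:" block, the "[TOOL_CALL] get_weather(city=\"Paris\")" and "[TOOL_RESULT] ..." lines, and the "TOOL DEFINITIONS:" summary that the updated task_adherence.prompty template expects in its CONVERSATION_HISTORY, AGENT_RESPONSE, and TOOL_DEFINITIONS inputs.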