From 16ceb6298ac004f700faefbc9ca930902b6d3abd Mon Sep 17 00:00:00 2001 From: Bai Li Date: Mon, 20 Oct 2025 13:29:33 -0700 Subject: [PATCH] feat(CacheForMocks): folder-based caching for LLM and input mocking in evals --- src/uipath/_cli/_evals/_runtime.py | 5 + src/uipath/_cli/_evals/mocks/cache_manager.py | 79 ++++++++ src/uipath/_cli/_evals/mocks/input_mocker.py | 50 ++++- src/uipath/_cli/_evals/mocks/llm_mocker.py | 41 +++- src/uipath/_cli/_evals/mocks/mocks.py | 7 + tests/cli/eval/mocks/test_cache_manager.py | 182 ++++++++++++++++++ tests/cli/eval/mocks/test_input_mocker.py | 3 + tests/cli/eval/mocks/test_mocks.py | 5 + 8 files changed, 361 insertions(+), 11 deletions(-) create mode 100644 src/uipath/_cli/_evals/mocks/cache_manager.py create mode 100644 tests/cli/eval/mocks/test_cache_manager.py diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 4b33f0419..59ee961d3 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -11,6 +11,7 @@ from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult +from uipath._cli._evals.mocks.cache_manager import CacheManager from uipath._cli._evals.mocks.input_mocker import ( generate_llm_input, ) @@ -49,6 +50,7 @@ ) from ._span_collection import ExecutionSpanCollector from .mocks.mocks import ( + cache_manager_context, clear_execution_context, set_execution_context, ) @@ -322,6 +324,9 @@ async def _execute_eval( evaluators: List[BaseEvaluator[Any]], event_bus: EventBus, ) -> EvaluationRunResult: + if cache_manager_context.get() is None: + cache_manager_context.set(CacheManager()) + # Generate LLM-based input if input_mocking_strategy is defined if eval_item.input_mocking_strategy: eval_item = await self._generate_input_for_eval(eval_item) diff --git a/src/uipath/_cli/_evals/mocks/cache_manager.py b/src/uipath/_cli/_evals/mocks/cache_manager.py new file mode 100644 index 000000000..18db05fa3 --- /dev/null +++ b/src/uipath/_cli/_evals/mocks/cache_manager.py @@ -0,0 +1,79 @@ +"""Cache manager for LLM and input mocker responses.""" + +import hashlib +import json +from pathlib import Path +from typing import Any, Dict, Optional + + +class CacheManager: + """Manages file-based caching for LLM and input mocker responses.""" + + def __init__(self, cache_dir: Optional[Path] = None): + """Initialize the cache manager.""" + self.cache_dir = cache_dir or (Path.cwd() / ".uipath" / "eval_cache") + + def _compute_cache_key(self, cache_key_data: Dict[str, Any]) -> str: + """Compute a hash from cache key data.""" + serialized = json.dumps(cache_key_data, sort_keys=True) + return hashlib.sha256(serialized.encode()).hexdigest() + + def _get_cache_path( + self, + mocker_type: str, + eval_set_id: str, + eval_item_id: str, + cache_key: str, + function_name: str, + ) -> Path: + """Get the file path for a cache entry.""" + return ( + self.cache_dir + / mocker_type + / eval_set_id + / eval_item_id + / function_name + / f"{cache_key}.json" + ) + + def get( + self, + mocker_type: str, + eval_set_id: str, + eval_item_id: str, + cache_key_data: Dict[str, Any], + function_name: str, + ) -> Optional[Any]: + """Retrieve a cached response.""" + cache_key = self._compute_cache_key(cache_key_data) + cache_path = self._get_cache_path( + mocker_type, eval_set_id, eval_item_id, cache_key, function_name + ) + + if not cache_path.exists(): + return None + + with open(cache_path, "r") as f: + cached_response = json.load(f) + + return cached_response 
+ + def set( + self, + mocker_type: str, + eval_set_id: str, + eval_item_id: str, + cache_key_data: Dict[str, Any], + response: Any, + function_name: str, + ) -> None: + """Store a response in the cache.""" + cache_key = self._compute_cache_key(cache_key_data) + cache_path = self._get_cache_path( + mocker_type, eval_set_id, eval_item_id, cache_key, function_name + ) + + cache_path.parent.mkdir(parents=True, exist_ok=True) + + with open(cache_path, "w") as f: + json.dump(response, f) diff --git a/src/uipath/_cli/_evals/mocks/input_mocker.py b/src/uipath/_cli/_evals/mocks/input_mocker.py index a7830e824..47e7d853f 100644 --- a/src/uipath/_cli/_evals/mocks/input_mocker.py +++ b/src/uipath/_cli/_evals/mocks/input_mocker.py @@ -58,19 +58,24 @@ async def generate_llm_input( input_schema: Dict[str, Any], ) -> Dict[str, Any]: """Generate synthetic input using an LLM based on the evaluation context.""" + from .mocks import cache_manager_context + try: llm = UiPath().llm + cache_manager = cache_manager_context.get() - prompt = get_input_mocking_prompt( - input_schema=json.dumps(input_schema, indent=2), - input_generation_instructions=evaluation_item.input_mocking_strategy.prompt + prompt_generation_args = { + "input_schema": json.dumps(input_schema), + "input_generation_instructions": evaluation_item.input_mocking_strategy.prompt if evaluation_item.input_mocking_strategy else "", - expected_behavior=evaluation_item.expected_agent_behavior or "", - expected_output=json.dumps(evaluation_item.expected_output, indent=2) + "expected_behavior": evaluation_item.expected_agent_behavior or "", + "expected_output": json.dumps(evaluation_item.expected_output) if evaluation_item.expected_output else "", - ) + } + + prompt = get_input_mocking_prompt(**prompt_generation_args) response_format = { "type": "json_schema", @@ -92,6 +97,24 @@ async def generate_llm_input( else {} ) + if cache_manager is not None: + cache_key_data = { + "response_format": response_format, + "completion_kwargs": completion_kwargs, + "prompt_generation_args": prompt_generation_args, + } + + cached_response = cache_manager.get( + mocker_type="input_mocker", + eval_set_id=evaluation_item.eval_set_id, + eval_item_id=evaluation_item.id, + cache_key_data=cache_key_data, + function_name="generate_llm_input", + ) + + if cached_response is not None: + return cached_response + response = await llm.chat_completions( [{"role": "user", "content": prompt}], response_format=response_format, @@ -99,8 +122,19 @@ async def generate_llm_input( ) generated_input_str = response.choices[0].message.content - - return json.loads(generated_input_str) + result = json.loads(generated_input_str) + + if cache_manager is not None: + cache_manager.set( + mocker_type="input_mocker", + eval_set_id=evaluation_item.eval_set_id, + eval_item_id=evaluation_item.id, + cache_key_data=cache_key_data, + response=result, + function_name="generate_llm_input", + ) + + return result except json.JSONDecodeError as e: raise UiPathInputMockingError( f"Failed to parse LLM response as JSON: {str(e)}" diff --git a/src/uipath/_cli/_evals/mocks/llm_mocker.py b/src/uipath/_cli/_evals/mocks/llm_mocker.py index b6e2916cf..2706a5860 100644 --- a/src/uipath/_cli/_evals/mocks/llm_mocker.py +++ b/src/uipath/_cli/_evals/mocks/llm_mocker.py @@ -97,6 +97,7 @@ async def response( from uipath._services.llm_gateway_service import _cleanup_schema from .mocks import ( + cache_manager_context, evaluation_context, execution_id_context, span_collector_context, @@ -161,7 +162,7 @@ class 
OutputSchema(BaseModel): }, "testRunProctorInstructions": self.evaluation_item.mocking_strategy.prompt, } - prompt_input = { + prompt_generation_args = { k: json.dumps(pydantic_to_dict_safe(v)) for k, v in prompt_input.items() } @@ -171,11 +172,33 @@ class OutputSchema(BaseModel): if model_parameters else {} ) + + formatted_prompt = PROMPT.format(**prompt_generation_args) + + cache_key_data = { + "response_format": response_format, + "completion_kwargs": completion_kwargs, + "prompt_generation_args": prompt_generation_args, + } + + cache_manager = cache_manager_context.get() + if cache_manager is not None: + cached_response = cache_manager.get( + mocker_type="llm_mocker", + eval_set_id=self.evaluation_item.eval_set_id, + eval_item_id=self.evaluation_item.id, + cache_key_data=cache_key_data, + function_name=function_name, + ) + + if cached_response is not None: + return cached_response + response = await llm.chat_completions( [ { "role": "user", - "content": PROMPT.format(**prompt_input), + "content": formatted_prompt, }, ], response_format=response_format, @@ -184,7 +207,19 @@ class OutputSchema(BaseModel): mocked_response = OutputSchema( **json.loads(response.choices[0].message.content) ) - return mocked_response.model_dump(mode="json")["response"] + result = mocked_response.model_dump(mode="json")["response"] + + if cache_manager is not None: + cache_manager.set( + mocker_type="llm_mocker", + eval_set_id=self.evaluation_item.eval_set_id, + eval_item_id=self.evaluation_item.id, + cache_key_data=cache_key_data, + response=result, + function_name=function_name, + ) + + return result except Exception as e: raise UiPathMockResponseGenerationError() from e else: diff --git a/src/uipath/_cli/_evals/mocks/mocks.py b/src/uipath/_cli/_evals/mocks/mocks.py index 0a34dd151..361b47fb0 100644 --- a/src/uipath/_cli/_evals/mocks/mocks.py +++ b/src/uipath/_cli/_evals/mocks/mocks.py @@ -6,6 +6,7 @@ from uipath._cli._evals._models._evaluation_set import EvaluationItem from uipath._cli._evals._span_collection import ExecutionSpanCollector +from uipath._cli._evals.mocks.cache_manager import CacheManager from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError from uipath._cli._evals.mocks.mocker_factory import MockerFactory @@ -26,6 +27,11 @@ "execution_id", default=None ) +# Cache manager for LLM and input mocker responses +cache_manager_context: ContextVar[Optional[CacheManager]] = ContextVar( + "cache_manager", default=None +) + logger = logging.getLogger(__name__) @@ -54,6 +60,7 @@ def clear_execution_context() -> None: mocker_context.set(None) span_collector_context.set(None) execution_id_context.set(None) + cache_manager_context.set(None) async def get_mocked_response( diff --git a/tests/cli/eval/mocks/test_cache_manager.py b/tests/cli/eval/mocks/test_cache_manager.py new file mode 100644 index 000000000..634cc7cfc --- /dev/null +++ b/tests/cli/eval/mocks/test_cache_manager.py @@ -0,0 +1,182 @@ +"""Tests for CacheManager.""" + +import tempfile +from pathlib import Path + +import pytest + +from uipath._cli._evals.mocks.cache_manager import CacheManager +from uipath._cli._evals.mocks.mocks import cache_manager_context + + +@pytest.fixture +def cache_manager(): + """Create a cache manager with a temp directory for tests.""" + with tempfile.TemporaryDirectory() as tmpdir: + cm = CacheManager(cache_dir=Path(tmpdir)) + cache_manager_context.set(cm) + yield cm + cache_manager_context.set(None) + + +def test_set_and_get_llm_mocker(cache_manager): + """Test setting and getting a cached 
response for LLM mocker.""" + cache_key_data = { + "prompt_generation_args": { + "input": "test input", + }, + "response_format": {"type": "json"}, + "completion_kwargs": {"temperature": 0.7}, + } + + response = {"result": "test response"} + + cache_manager.set( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data, + response=response, + function_name="test_function", + ) + + cached_response = cache_manager.get( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data, + function_name="test_function", + ) + + assert cached_response == response + + +def test_set_and_get_input_mocker(cache_manager): + """Test setting and getting a cached response for input mocker.""" + cache_key_data = { + "prompt_generation_args": { + "input": "test input", + }, + "response_format": {"type": "json"}, + "completion_kwargs": {"temperature": 0.7}, + } + + response = {"input": "test input"} + + cache_manager.set( + mocker_type="input_mocker", + eval_set_id="evalset-789", + eval_item_id="eval-456", + cache_key_data=cache_key_data, + response=response, + function_name="generate_llm_input", + ) + + cached_response = cache_manager.get( + mocker_type="input_mocker", + eval_set_id="evalset-789", + eval_item_id="eval-456", + cache_key_data=cache_key_data, + function_name="generate_llm_input", + ) + + assert cached_response == response + + +def test_cache_invalidation_on_prompt_args_change(cache_manager): + """Test that changing the prompt generation args invalidates the cache.""" + cache_key_data1 = { + "prompt_generation_args": { + "input": "original input", + }, + "response_format": {"type": "json"}, + "completion_kwargs": {"temperature": 0.7}, + } + + cache_key_data2 = { + "prompt_generation_args": { + "input": "modified input", + }, + "response_format": {"type": "json"}, + "completion_kwargs": {"temperature": 0.7}, + } + + response1 = {"result": "response 1"} + response2 = {"result": "response 2"} + + cache_manager.set( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data1, + response=response1, + function_name="test_function", + ) + + cache_manager.set( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data2, + response=response2, + function_name="test_function", + ) + + cached1 = cache_manager.get( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data1, + function_name="test_function", + ) + + cached2 = cache_manager.get( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data2, + function_name="test_function", + ) + + assert cached1 == response1 + assert cached2 == response2 + + +def test_cache_invalidation_on_model_settings_change(cache_manager): + """Test that changing model settings invalidates the cache.""" + cache_key_data1 = { + "prompt_generation_args": { + "input": "test input", + }, + "response_format": {"type": "json"}, + "completion_kwargs": {"temperature": 0.7}, + } + + cache_key_data2 = { + "prompt_generation_args": { + "input": "test input", + }, + "response_format": {"type": "json"}, + "completion_kwargs": {"temperature": 0.9}, + } + + response = {"result": "test response"} + + cache_manager.set( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data1, + response=response, + 
function_name="test_function", + ) + + cached_response = cache_manager.get( + mocker_type="llm_mocker", + eval_set_id="evalset-456", + eval_item_id="eval-123", + cache_key_data=cache_key_data2, + function_name="test_function", + ) + + assert cached_response is None diff --git a/tests/cli/eval/mocks/test_input_mocker.py b/tests/cli/eval/mocks/test_input_mocker.py index 8d14361b7..cbac51dd9 100644 --- a/tests/cli/eval/mocks/test_input_mocker.py +++ b/tests/cli/eval/mocks/test_input_mocker.py @@ -9,6 +9,7 @@ InputMockingStrategy, ModelSettings, ) +from uipath._cli._evals.mocks.cache_manager import CacheManager from uipath._cli._evals.mocks.input_mocker import generate_llm_input @@ -19,6 +20,8 @@ async def test_generate_llm_input_with_model_settings( ): monkeypatch.setenv("UIPATH_URL", "https://example.com") monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "test-token") + monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None) + monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None) evaluation_item: dict[str, Any] = { "id": "test-eval-id", diff --git a/tests/cli/eval/mocks/test_mocks.py b/tests/cli/eval/mocks/test_mocks.py index ec0950dc3..caddd6f25 100644 --- a/tests/cli/eval/mocks/test_mocks.py +++ b/tests/cli/eval/mocks/test_mocks.py @@ -10,6 +10,7 @@ LLMMockingStrategy, MockitoMockingStrategy, ) +from uipath._cli._evals.mocks.cache_manager import CacheManager from uipath._cli._evals.mocks.mocker import UiPathMockResponseGenerationError from uipath._cli._evals.mocks.mocks import set_execution_context from uipath.eval.mocks import mockable @@ -140,6 +141,8 @@ async def foofoo(*args, **kwargs): def test_llm_mockable_sync(httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch): monkeypatch.setenv("UIPATH_URL", "https://example.com") monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890") + monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None) + monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None) # Arrange @mockable() @@ -223,6 +226,8 @@ def foofoo(*args, **kwargs): async def test_llm_mockable_async(httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch): monkeypatch.setenv("UIPATH_URL", "https://example.com") monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890") + monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None) + monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None) # Arrange @mockable()