Commit cd3ae2c

Added mlflow experiment capability
1 parent 8b0492e commit cd3ae2c

3 files changed: +550 -24 lines changed


harness/harness_llama3.1_8b.py

Lines changed: 178 additions & 24 deletions
@@ -9,23 +9,34 @@
 import sys
 import time
 import logging
+import argparse
+import subprocess
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, Dict, Any
-import subprocess
+
+# Matplotlib imports - set backend early for headless environments
+try:
+    import matplotlib
+    matplotlib.use('Agg') # Use non-interactive backend
+    import matplotlib.pyplot as plt
+    MATPLOTLIB_AVAILABLE = True
+except ImportError:
+    MATPLOTLIB_AVAILABLE = False
+    plt = None
 
 # Add harness directory to path
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
 # Import harness components
 try:
-    from backendserver import VLLMServer, create_server, start_server_from_config
+    from backendserver import VLLMServer, create_server, start_server_from_config, load_server_config
     from Client import LoadGenOfflineClient, LoadGenServerClient, create_loadgen_client
     from data.dataset_processor import DatasetProcessor
 except ImportError:
     # Try relative imports
     sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-    from backendserver import VLLMServer, create_server, start_server_from_config
+    from backendserver import VLLMServer, create_server, start_server_from_config, load_server_config
     from Client import LoadGenOfflineClient, LoadGenServerClient, create_loadgen_client
     from data.dataset_processor import DatasetProcessor

@@ -48,6 +59,28 @@
     METRICS_AVAILABLE = False
     logging.warning("Metrics collection not available")
 
+# Import environment info collector
+try:
+    from environment.environment_info import EnvironmentInfoCollector
+    ENVIRONMENT_INFO_AVAILABLE = True
+except ImportError:
+    ENVIRONMENT_INFO_AVAILABLE = False
+
+# Import metrics CSVStorage if available
+try:
+    from metrics.vllm_metrics_collector import CSVStorage
+    CSV_STORAGE_AVAILABLE = True
+except ImportError:
+    CSV_STORAGE_AVAILABLE = False
+
+# Import MLflow client if available
+try:
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'mlflow_tools'))
+    from mlflow_client import MLflowClient
+    MLFLOW_AVAILABLE = True
+except ImportError:
+    MLFLOW_AVAILABLE = False
+
 
 class Llama31_8BHarness:
     """

@@ -58,6 +91,7 @@ class Llama31_8BHarness:
    - LoadGen client (Offline/Server scenarios)
    - Dataset processing
    - Metrics collection and visualization
+   - MLflow tracking and artifact upload
    """
 
    def __init__(self,

@@ -71,7 +105,10 @@ def __init__(self,
                 num_samples: int = 13368,
                 output_dir: str = "./harness_output",
                 enable_metrics: bool = False,
-                metrics_interval: int = 10):
+                metrics_interval: int = 10,
+                mlflow_tracking_uri: Optional[str] = None,
+                mlflow_experiment_name: Optional[str] = None,
+                mlflow_output_dir: Optional[str] = None):
        """
        Initialize harness.
 
@@ -87,6 +124,9 @@ def __init__(self,
            output_dir: Output directory for logs and results
            enable_metrics: Enable metrics collection
            metrics_interval: Metrics collection interval (seconds)
+           mlflow_tracking_uri: MLflow tracking server URI (e.g., http://localhost:5000)
+           mlflow_experiment_name: MLflow experiment name
+           mlflow_output_dir: Output directory to upload to MLflow (defaults to output_dir)
        """
        self.model_name = model_name
        self.dataset_path = dataset_path

@@ -100,6 +140,12 @@ def __init__(self,
        self.enable_metrics = enable_metrics
        self.metrics_interval = metrics_interval
 
+       # MLflow configuration
+       self.mlflow_tracking_uri = mlflow_tracking_uri
+       self.mlflow_experiment_name = mlflow_experiment_name
+       self.mlflow_output_dir = Path(mlflow_output_dir) if mlflow_output_dir else self.output_dir
+       self.mlflow_client = None
+
        # Setup logging
        self.logger = logging.getLogger(self.__class__.__name__)
 
@@ -130,6 +176,10 @@ def __init__(self,
        self.logger.info(f" - MLPerf logs: {self.mlperf_output_dir}")
        self.logger.info(f" - Environment info: {self.environment_output_dir}")
 
+       # Initialize MLflow if configured
+       if self.mlflow_tracking_uri and self.mlflow_experiment_name:
+           self._initialize_mlflow()
+
        # Setup stdout redirection to harness_output
        self._setup_stdout_redirection()
 
@@ -151,6 +201,25 @@ def __init__(self,
        self.original_stdout = None
        self.original_stderr = None
 
+   def _initialize_mlflow(self):
+       """Initialize MLflow client if configured."""
+       if not MLFLOW_AVAILABLE:
+           self.logger.warning("MLflow is not available. MLflow tracking will be disabled.")
+           return
+
+       try:
+           self.mlflow_client = MLflowClient(
+               tracking_uri=self.mlflow_tracking_uri,
+               experiment_name=self.mlflow_experiment_name,
+               client_type="loadgen",
+               output_dir=str(self.mlflow_output_dir)
+           )
+           self.logger.info(f"MLflow client initialized: {self.mlflow_tracking_uri}")
+           self.logger.info(f"MLflow experiment: {self.mlflow_experiment_name}")
+       except Exception as e:
+           self.logger.warning(f"Failed to initialize MLflow client: {e}")
+           self.mlflow_client = None
+
    def _setup_stdout_redirection(self):
        """Setup stdout and stderr redirection to harness_output directory."""
        try:

@@ -193,18 +262,18 @@ def _restore_stdout_redirection(self):
 
    def _collect_environment_info(self):
        """Collect environment information."""
+       if not ENVIRONMENT_INFO_AVAILABLE:
+           self.logger.debug("Environment info collector not available")
+           return
+
        try:
-           from environment.environment_info import EnvironmentInfoCollector
-
            collector = EnvironmentInfoCollector(self.environment_output_dir)
            results = collector.collect_all()
 
            self.logger.info("Environment information collected:")
            self.logger.info(f" - Successfully collected: {list(results.get('success', {}).keys())}")
            if results.get('errors'):
                self.logger.warning(f" - Errors: {list(results['errors'].keys())}")
-       except ImportError as e:
-           self.logger.warning(f"Could not import environment info collector: {e}")
        except Exception as e:
            self.logger.warning(f"Error collecting environment info: {e}")
 
@@ -309,8 +378,12 @@ def initialize_metrics(self):
        # Use metrics subdirectory - default to CSV format
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        metrics_file = self.metrics_output_dir / f"metrics_{timestamp}.csv"
-       from metrics.vllm_metrics_collector import CSVStorage
-       storage = CSVStorage(str(metrics_file))
+
+       if CSV_STORAGE_AVAILABLE:
+           storage = CSVStorage(str(metrics_file))
+       else:
+           # Fallback to JSON storage
+           storage = JSONStorage(str(metrics_file).replace('.csv', '.json'))
 
        self.metrics_collector = VLLMMetricsCollector(
            metrics_endpoint=f"{self.api_server_url}/metrics",

@@ -356,8 +429,28 @@ def run(self, user_conf: str = "user.conf", lg_model_name: str = "llama3_1-8b")
        server_started_here = False
        client_initialized = False
        metrics_started = False
+       mlflow_run_started = False
 
        try:
+           # Start MLflow run if configured
+           if self.mlflow_client:
+               try:
+                   self.mlflow_client.start_run()
+                   mlflow_run_started = True
+
+                   # Log parameters
+                   params = {
+                       'model_name': self.model_name,
+                       'scenario': self.scenario,
+                       'test_mode': self.test_mode,
+                       'batch_size': str(self.batch_size),
+                       'num_samples': str(self.num_samples)
+                   }
+                   self.mlflow_client.log_parameters(params)
+               except Exception as e:
+                   self.logger.warning(f"Failed to start MLflow run: {e}")
+                   mlflow_run_started = False
+
            # Start server if needed
            if not self.api_server_url:
                self.start_server()
459552
if self.enable_metrics and self.metrics_collector:
460553
self._generate_metrics_visualizations()
461554

462-
return {
555+
test_results = {
463556
'status': 'success',
464557
'duration': test_duration,
465558
'scenario': self.scenario,
466559
'test_mode': self.test_mode,
467560
'num_samples': self.num_samples
468561
}
469562

563+
# Upload to MLflow if configured
564+
if mlflow_run_started and self.mlflow_client:
565+
try:
566+
self._upload_to_mlflow(test_results)
567+
except Exception as e:
568+
self.logger.warning(f"Failed to upload to MLflow: {e}")
569+
570+
return test_results
571+
470572
except Exception as e:
471573
self.logger.error(f"Test failed: {e}", exc_info=True)
472-
return {
574+
test_results = {
473575
'status': 'failed',
474576
'error': str(e)
475577
}
578+
579+
# Upload failure to MLflow if configured
580+
if mlflow_run_started and self.mlflow_client:
581+
try:
582+
self._upload_to_mlflow(test_results)
583+
except Exception as e:
584+
self.logger.warning(f"Failed to upload failure to MLflow: {e}")
585+
586+
return test_results
476587

477588
finally:
478589
# Cleanup in reverse order of initialization
479590
self.logger.info("Performing cleanup...")
480591

592+
# End MLflow run if started
593+
if mlflow_run_started and self.mlflow_client:
594+
try:
595+
self.mlflow_client.end_run()
596+
except Exception as e:
597+
self.logger.warning(f"Error ending MLflow run: {e}")
598+
481599
# Stop metrics collector if not already stopped (in case of exception)
482600
if metrics_started and self.metrics_collector:
483601
try:
@@ -510,6 +628,30 @@ def run(self, user_conf: str = "user.conf", lg_model_name: str = "llama3_1-8b")
510628

511629
self.logger.info("Cleanup completed")
512630

631+
def _upload_to_mlflow(self, test_results: Dict[str, Any]):
632+
"""Upload test results and artifacts to MLflow."""
633+
if not self.mlflow_client:
634+
return
635+
636+
try:
637+
# Log client-specific metrics
638+
self.mlflow_client.log_client_metrics(test_results)
639+
640+
# Generate and log description
641+
description = self.mlflow_client.get_client_description(test_results)
642+
self.mlflow_client.log_description(description)
643+
644+
# Upload artifacts - upload entire output directory
645+
self.mlflow_client.upload_artifacts(
646+
output_dir=str(self.mlflow_output_dir),
647+
include_subdirs=True
648+
)
649+
650+
self.logger.info("Successfully uploaded to MLflow")
651+
except Exception as e:
652+
self.logger.error(f"Failed to upload to MLflow: {e}", exc_info=True)
653+
raise
654+
513655
def _load_samples_to_ram(self, query_samples):
514656
"""LoadGen callback - samples are pre-loaded in Dataset."""
515657
pass
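
The new _upload_to_mlflow() helper pushes the whole output directory (metrics files, MLPerf logs, generated plots, environment info) to the tracking server through upload_artifacts(). In the stock mlflow API, a whole-directory upload is a single call; the snippet below is a hedged illustration of that side of the wrapper, with placeholder paths and file names, not the committed implementation.

# Hypothetical illustration of directory-level artifact upload with stock mlflow.
# The run, directory, and file names are placeholders.
import mlflow

output_dir = "./harness_output"  # placeholder; mirrors the harness default

with mlflow.start_run():
    # Recursively uploads every file under output_dir, preserving the
    # sub-directory layout in the run's artifact store.
    mlflow.log_artifacts(output_dir)

    # A single file can also be filed under a named artifact folder.
    mlflow.log_artifact(f"{output_dir}/summary.json",  # placeholder file
                        artifact_path="reports")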
@@ -524,11 +666,11 @@ def _generate_metrics_visualizations(self):
            self.logger.warning("Metrics collector or visualizer not available")
            return
 
+       if not MATPLOTLIB_AVAILABLE:
+           self.logger.warning("Matplotlib not available. Visualizations will not be generated.")
+           return
+
        try:
-           # Set matplotlib backend for headless environments
-           import matplotlib
-           matplotlib.use('Agg') # Use non-interactive backend
-
            storage_file = self.metrics_collector._get_storage_file_path()
            if not storage_file:
                self.logger.warning("No storage file path available from metrics collector")

@@ -615,8 +757,7 @@ def _generate_metrics_visualizations(self):
                    save_path=str(save_path),
                    show_labels=False # Disable label grouping to avoid dict unhashable error
                )
-               # Close plot to free memory
-               import matplotlib.pyplot as plt
+               # Close all figures to free memory and ensure clean state for next plot
                plt.close('all')
                self.logger.info(f"✓ Generated visualization: {save_path}")
                successful_viz += 1

@@ -627,7 +768,6 @@ def _generate_metrics_visualizations(self):
                self.logger.error(f"Failed to generate visualization {viz['filename']}: {e}", exc_info=True)
                # Close plot even on error
                try:
-                   import matplotlib.pyplot as plt
                    plt.close('all')
                except:
                    pass

@@ -643,8 +783,6 @@ def _generate_metrics_visualizations(self):
 
 def main():
    """Main entry point for harness."""
-   import argparse
-
    parser = argparse.ArgumentParser(description="MLPerf Harness for Llama 3.1 8B")
 
    parser.add_argument("--model", type=str, required=True, help="Model name or path")

@@ -663,6 +801,16 @@ def main():
    parser.add_argument("--lg-model-name", type=str, default="llama3_1-8b", help="LoadGen model name")
    parser.add_argument("--log-level", type=str, default="INFO", help="Logging level")
 
+   # MLflow arguments
+   parser.add_argument("--mlflow-experiment-name", type=str, default=None,
+                       help="MLflow experiment name (enables MLflow tracking)")
+   parser.add_argument("--mlflow-output-dir", type=str, default=None,
+                       help="Output directory to upload to MLflow (defaults to --output-dir)")
+   parser.add_argument("--mlflow-host", type=str, default="localhost",
+                       help="MLflow tracking server hostname")
+   parser.add_argument("--mlflow-port", type=int, default=5000,
+                       help="MLflow tracking server port")
+
    args = parser.parse_args()
 
    # Configure logging

@@ -675,10 +823,14 @@ def main():
    # Load server config if provided
    server_config = {}
    if args.server_config:
-       from backendserver import load_server_config
        server_config = load_server_config(args.server_config)
        server_config['config_file'] = args.server_config
 
+   # Construct MLflow tracking URI if experiment name is provided
+   mlflow_tracking_uri = None
+   if args.mlflow_experiment_name:
+       mlflow_tracking_uri = f"http://{args.mlflow_host}:{args.mlflow_port}"
+
    # Create and run harness
    harness = Llama31_8BHarness(
        model_name=args.model,

@@ -690,7 +842,10 @@ def main():
        batch_size=args.batch_size,
        num_samples=args.num_samples,
        output_dir=args.output_dir,
-       enable_metrics=args.enable_metrics
+       enable_metrics=args.enable_metrics,
+       mlflow_tracking_uri=mlflow_tracking_uri,
+       mlflow_experiment_name=args.mlflow_experiment_name,
+       mlflow_output_dir=args.mlflow_output_dir
    )
 
    results = harness.run(user_conf=args.user_conf, lg_model_name=args.lg_model_name)

@@ -705,4 +860,3 @@
 
 if __name__ == "__main__":
    sys.exit(main())
-
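
With these changes, MLflow tracking is opt-in: passing --mlflow-experiment-name (optionally with --mlflow-host, --mlflow-port, and --mlflow-output-dir) makes main() build the tracking URI and thread it into the constructor. The sketch below drives the harness programmatically instead; it loads the module by path because the file name contains dots, and every path, model name, and experiment name is a placeholder. Constructor arguments not visible in this diff are left at their defaults.

# Hypothetical programmatic invocation; all paths and names are placeholders.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "llama31_harness", "harness/harness_llama3.1_8b.py")
harness_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(harness_mod)

harness = harness_mod.Llama31_8BHarness(
    model_name="meta-llama/Llama-3.1-8B-Instruct",        # placeholder
    dataset_path="./data/cnn_dailymail_validation.json",  # placeholder
    output_dir="./harness_output",
    enable_metrics=True,
    mlflow_tracking_uri="http://localhost:5000",
    mlflow_experiment_name="llama3.1-8b-mlperf",          # placeholder
    # mlflow_output_dir defaults to output_dir when omitted
)
results = harness.run(user_conf="user.conf", lg_model_name="llama3_1-8b")

A local tracking server can be brought up beforehand with, for example, "mlflow server --host 127.0.0.1 --port 5000", which matches the default host and port of the new CLI flags.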

harness/mlflow_tools/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+# MLflow client module
+try:
+    from .mlflow_client import MLflowClient
+    __all__ = ['MLflowClient']
+except ImportError:
+    __all__ = []
+
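
The harness only relies on a small surface of the wrapper: start_run, log_parameters, log_client_metrics, get_client_description, log_description, upload_artifacts, and end_run. The implementation lives in harness/mlflow_tools/mlflow_client.py, the third file changed in this commit, which is not expanded in this view. Purely as a hedged sketch, a wrapper with that interface could look like the following on top of the stock mlflow package; the method bodies and tag names below are assumptions, not the committed code.

# Hypothetical sketch only; the committed mlflow_client.py may differ.
from numbers import Number
from typing import Any, Dict, Optional
import mlflow


class MLflowClient:
    """Thin wrapper over the mlflow tracking API (assumed interface)."""

    def __init__(self, tracking_uri: str, experiment_name: str,
                 client_type: str = "loadgen", output_dir: Optional[str] = None):
        self.client_type = client_type
        self.output_dir = output_dir
        mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(experiment_name)

    def start_run(self) -> None:
        mlflow.start_run(run_name=f"{self.client_type}-run")
        mlflow.set_tag("client_type", self.client_type)

    def log_parameters(self, params: Dict[str, str]) -> None:
        mlflow.log_params(params)

    def log_client_metrics(self, results: Dict[str, Any]) -> None:
        # MLflow metrics must be numeric; everything else is recorded as tags.
        numeric = {k: float(v) for k, v in results.items() if isinstance(v, Number)}
        other = {k: str(v) for k, v in results.items() if not isinstance(v, Number)}
        if numeric:
            mlflow.log_metrics(numeric)
        if other:
            mlflow.set_tags(other)

    def get_client_description(self, results: Dict[str, Any]) -> str:
        return "\n".join(f"{k}: {v}" for k, v in sorted(results.items()))

    def log_description(self, description: str) -> None:
        # 'mlflow.note.content' is the reserved tag shown as the run description in the UI.
        mlflow.set_tag("mlflow.note.content", description)

    def upload_artifacts(self, output_dir: str, include_subdirs: bool = True) -> None:
        # log_artifacts always recurses; include_subdirs is kept for interface parity.
        mlflow.log_artifacts(output_dir)

    def end_run(self) -> None:
        mlflow.end_run()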
