
[Version 3] Raspberry CPU support #791


Open
wants to merge 114 commits into base: master
Changes from 1 commit (of 114 commits)
3164f81
build on codecarbon_v3_rc
Dec 11, 2024
4a2b6ae
bump
Dec 11, 2024
35ef095
hatch run dev:bumpver update --set-version 3.0.0-rc0 --tag-num
Dec 11, 2024
311055e
build
Dec 11, 2024
c956f21
Add CPU load tracking
benoit-cty Jun 13, 2022
01e1d27
Add new CPU
benoit-cty Jun 14, 2022
2e2a013
Add mandatory country
benoit-cty Jun 14, 2022
4a867dc
Fix RAPL fallback
benoit-cty Jun 14, 2022
36f338c
debug RAPL
Nov 26, 2024
602de48
Release Drafter
Dec 11, 2024
5c3def8
refacto ResourceTracker
Dec 11, 2024
1ccf6b9
example
Dec 12, 2024
fee6837
Add CPU load
Jan 7, 2025
556b16b
Add division to Power
Jan 7, 2025
83dc626
Handle Pandas warning
Jan 7, 2025
0ba2fb9
CPU Load task support
Jan 7, 2025
4cbfe4e
CPU load and RAPL comparison
Jan 7, 2025
d07db89
Better cpu load estimation
Jan 8, 2025
463740c
change version
Jan 10, 2025
35d0040
Fix test test_carbon_tracker_offline_region_error
Jan 10, 2025
cb0e8df
RAPL path
Jan 10, 2025
53faaec
Handle no Tapo
Jan 10, 2025
8f78d5d
Debug config
Jan 10, 2025
ed52589
wip: test on Scaleway
Jan 10, 2025
c16e06f
Add load per core
Jan 10, 2025
0bb2cc8
Add E5-2620 v3
Jan 11, 2025
84eac1c
Add a TDP test
Jan 11, 2025
9664fb4
Fix match for Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz
Jan 11, 2025
d23c919
Fix psutil detection
Jan 11, 2025
5ff761f
Fix cpu 0
Jan 11, 2025
27893f9
Scaleway test results
Jan 12, 2025
d5e122f
TO REVERT : publish RC to Pypi
Jan 12, 2025
fa12978
Notes on E5-2620
Jan 12, 2025
937bcf9
Add count_physical_cpus
Jan 14, 2025
1841fe3
Multiply TDP per physical CPU
Jan 14, 2025
98d70b8
Custom CPU load
Jan 14, 2025
316ef7e
Find RAM domain
Jan 14, 2025
2e05544
Handle RAPL RAM
Jan 14, 2025
d2e2b97
Improve RAPL
Jan 14, 2025
fed96ee
Do not read core
Jan 14, 2025
cddc72a
Docs
Jan 14, 2025
5674f13
CPU Fallback doc
Jan 17, 2025
5d5a09a
AMD CPU
Feb 3, 2025
a9d90e0
Add W per core for default
benoit-cty Feb 4, 2025
afff566
Data on CPU loads
Feb 14, 2025
e90ed10
fix circular import
Feb 15, 2025
57e5fa1
Default to 1 CPU
Feb 15, 2025
a07e44b
Fix case TDP is None
Feb 15, 2025
bf0dd10
bump
Feb 15, 2025
3bf3af6
docs
Feb 15, 2025
9674f05
Merge pull request #316 from mlco2/use-cpu-load
benoit-cty Feb 15, 2025
d760d63
fix: remove percpu argument for psutil.cpu_percent & process.cpu_perc…
SaboniAmine Feb 17, 2025
16d15c1
remove percpu
Feb 18, 2025
719c22d
Set minimal version for psutil
Feb 18, 2025
455bddd
bump deps
Feb 18, 2025
42f9038
bump to rc2
Feb 18, 2025
2faf374
Fix _get_power_from_cpu_load in process mode
Feb 18, 2025
63b2994
Fix pandas warning
Feb 18, 2025
f346008
Merge pull request #786 from mlco2/fix/remove_deprecated_psutil_percpu
benoit-cty Feb 18, 2025
7552636
Add AMD EPYC 7R13
benoit-cty Feb 20, 2025
d4f5a76
Fix percentage
Feb 20, 2025
9200015
bump
Feb 20, 2025
ab080e6
Merge pull request #787 from mlco2/cpu_AMD_EPYC_7R13
benoit-cty Feb 20, 2025
a760db8
feat(api): add auth to project token endpoints (#746)
inimaz Jan 18, 2025
06a5ee5
feat(carbonserver) allow accessing public project data
prmths128 Jan 14, 2025
d983019
chore(deps): bump next from 14.2.15 to 14.2.21 in /webapp (#762)
dependabot[bot] Jan 19, 2025
66a3831
feat: update project settings (name +description) (#761)
inimaz Jan 19, 2025
7749bae
fix: redirect user to dashboard after cli account creation (#763)
SaboniAmine Jan 19, 2025
039ddbc
chore: bump version (#765)
SaboniAmine Jan 19, 2025
b2ec679
Create FUNDING.yml
benoit-cty Jan 22, 2025
fac18b6
Improve arguments consistency in track_emissions decorator
LucasHaug Jan 22, 2025
097878c
feat: allow_multiple_runs is true by default
inimaz Feb 6, 2025
b2315a7
ci: deploy app from ci (#772)
inimaz Jan 26, 2025
718ebf5
deps bump
benoit-cty Jan 28, 2025
80c634c
Check if tracker iniialized.
benoit-cty Jan 28, 2025
606f0da
Fix empty country
Jan 29, 2025
c75b5e3
Bump deps
Jan 29, 2025
4b5ea69
Fix(lock): SIGTERM handling to allow cleanup code execution (#785)
Ucodia Feb 13, 2025
5330cbd
fix(api): auth context can read project
inimaz Mar 8, 2025
b1844c0
fix: unit test
inimaz Mar 8, 2025
6286fd2
fix: minor change in log that says where the codecarbon output is (#794)
inimaz Mar 9, 2025
3470d55
chore(carbonserver) remove unused users endpoint
prmths128 Mar 9, 2025
6d2d9ec
Add Intel Core Ultra 7 165H
FrejaThoresen Mar 20, 2025
a2748c5
fix: update nextjs to latest to prevent known vulnerability
inimaz Mar 25, 2025
f53ab98
chores: split github workflows (#812)
inimaz Apr 1, 2025
d39a1cb
Feat: create new experiment from the webapp (#816)
inimaz Apr 4, 2025
edbded8
chore(deps): bump @babel/runtime from 7.24.8 to 7.27.0 in /webapp (#811)
dependabot[bot] Apr 4, 2025
9a8aa47
feat(ui): share project page (#818)
alexphiev Apr 11, 2025
289ba32
Update cpu_power.csv
benoit-cty Apr 11, 2025
e902c4b
fix line feed
Apr 13, 2025
d4fea24
rebuild docs
Apr 13, 2025
344feb6
bump deps
Apr 13, 2025
1bc0982
Merge pull request #780 from mlco2/feat/allow-multiple-runs-true
benoit-cty Apr 13, 2025
fb01a32
New heuristic for RAM
Mar 15, 2025
0918e52
Fix test and RAM slot power
Mar 15, 2025
a13db61
force ram and cpu power
Mar 28, 2025
c6d6a9d
tests
Mar 28, 2025
c02ae05
force_ram_power
Apr 2, 2025
ff5e4ff
seems to work
Apr 2, 2025
eb53e7a
Fix test
Apr 6, 2025
56e6aa2
bump and docs
Apr 6, 2025
0b8e6b3
doc
Apr 6, 2025
3bfc92c
fix bug with psutil detection
Apr 11, 2025
4f53a13
bump
Apr 11, 2025
4ee7644
Add a minimum of 10% TDP
Apr 11, 2025
d20b71b
Log modes to INFO
Apr 11, 2025
40c0af8
bump
Apr 11, 2025
4c4fe80
fix max
Apr 11, 2025
38fa473
Typo
Apr 12, 2025
aafd5f6
typo
Apr 12, 2025
90d6ec9
bump deps
Apr 13, 2025
b3e827d
doc
Apr 13, 2025
8eac358
Merge pull request #804 from mlco2/feat/ram_stick_power
benoit-cty Apr 13, 2025
d27aa53
rpi 4 and 5 cpu
Feb 28, 2025
Fix test and RAM slot power
benoit-cty committed Apr 13, 2025
commit 0918e52529bb5e30c5be16d1c6a07f92df6b60ca
3 changes: 2 additions & 1 deletion codecarbon/core/resource_tracker.py
@@ -4,8 +4,9 @@
from codecarbon.core import cpu, gpu, powermetrics
from codecarbon.core.config import parse_gpu_ids
from codecarbon.core.util import detect_cpu_model, is_linux_os, is_mac_os, is_windows_os
from codecarbon.external.hardware import CPU, GPU, MODE_CPU_LOAD, RAM, AppleSiliconChip
from codecarbon.external.hardware import CPU, GPU, MODE_CPU_LOAD, AppleSiliconChip
from codecarbon.external.logger import logger
from codecarbon.external.ram import RAM


class ResourceTracker:
3 changes: 2 additions & 1 deletion codecarbon/emissions_tracker.py
@@ -20,8 +20,9 @@
from codecarbon.core.units import Energy, Power, Time
from codecarbon.core.util import count_cpus, count_physical_cpus, suppress
from codecarbon.external.geography import CloudMetadata, GeoMetadata
from codecarbon.external.hardware import CPU, GPU, RAM, AppleSiliconChip
from codecarbon.external.hardware import CPU, GPU, AppleSiliconChip
from codecarbon.external.logger import logger, set_logger_format, set_logger_level
from codecarbon.external.ram import RAM
from codecarbon.external.scheduler import PeriodicScheduler
from codecarbon.external.task import Task
from codecarbon.input import DataSource
338 changes: 1 addition & 337 deletions codecarbon/external/hardware.py
@@ -4,7 +4,6 @@

import math
import re
import subprocess
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Tuple
@@ -15,7 +14,7 @@
from codecarbon.core.gpu import AllGPUDevices
from codecarbon.core.powermetrics import ApplePowermetrics
from codecarbon.core.units import Energy, Power, Time
from codecarbon.core.util import SLURM_JOB_ID, count_cpus, detect_cpu_model
from codecarbon.core.util import count_cpus, detect_cpu_model
from codecarbon.external.logger import logger

# default W value for a CPU if no model is found in the ref csv
@@ -28,8 +27,6 @@

MODE_CPU_LOAD = "cpu_load"

RAM_SLOT_POWER_X86 = 4 # Watts


@dataclass
class BaseHardware(ABC):
@@ -333,339 +330,6 @@ def from_utils(
)


@dataclass
class RAM(BaseHardware):
"""
Before V3 heuristic:
# 3 watts of power for every 8GB of DDR3 or DDR4 memory
# https://www.crucial.com/support/articles-faq-memory/how-much-power-does-memory-use

In V3, we need to improve the accuracy of the RAM power estimation.
RAM power consumption is not linear with the amount of memory used: a server may
have thousands of GB of RAM, yet its power draw is driven by the number of memory
modules installed rather than by the capacity in use.
Since there is no way to know which memory modules are installed without admin rights,
we need a heuristic that is more accurate than the previous one.
For example, assume a minimum of 2 modules, except for ARM CPUs like the Raspberry Pi,
where we use a constant 3W.
Then assume the maximum RAM per module is 128GB and that module sizes only come in
powers of 2 (2, 4, 8, 16, 32, 64, 128).
This lets us estimate the RAM power consumption from the number of modules used.

1. **ARM CPU Detection**:
- Added a `_detect_arm_cpu` method that checks if the system is using an ARM architecture
- For ARM CPUs (like Raspberry Pi), a constant 3W will be used as the minimum power

2. **DIMM Count Estimation**:
- Created a `_estimate_dimm_count` method that intelligently estimates how many memory modules might be present based on total RAM size
- Takes into account that servers typically have more and larger DIMMs
- Assumes DIMM sizes follow powers of 2 (4GB, 8GB, 16GB, 32GB, 64GB, 128GB) as specified

3. **Scaling Power Model**:
- Base power per DIMM is 4W (`RAM_SLOT_POWER_X86`) for x86 systems and 1.5W for ARM systems
- For standard systems (up to 4 DIMMs): linear scaling at full power per DIMM
- For medium systems (5-8 DIMMs): decreasing efficiency (90% power per additional DIMM)
- For large systems (9-16 DIMMs): further reduced efficiency (80% power per additional DIMM)
- For very large systems (17+ DIMMs): highest efficiency (70% power per additional DIMM)

4. **Minimum Power Guarantees**:
- Ensures at least 8W for x86 systems (2 DIMMs at the 4W base power)
- Ensures at least 3W for ARM systems as requested

### Example Power Estimates:

- **Small laptop (8GB RAM)**: ~8W (2 DIMMs at 4W each)
- **Desktop (32GB RAM)**: ~16W (4 DIMMs at 4W each)
- **Small server (128GB RAM)**: ~30.4W (8 DIMMs with efficiency scaling)
- **Large server (1TB RAM)**: ~30.4W (estimated as 8x128GB DIMMs)

This approach significantly improves the accuracy for large servers by recognizing that RAM power consumption doesn't scale linearly with capacity, but rather with the number of physical modules. Since we don't have direct access to the actual DIMM configuration, this heuristic provides a more reasonable estimate than the previous linear model.

The model also includes detailed debug logging that will show the estimated power for given memory sizes, helping with validation and fine-tuning in the future.
"""

memory_size = None
is_arm_cpu = False

def __init__(
self,
pid: int = psutil.Process().pid,
children: bool = True,
tracking_mode: str = "machine",
):
"""
Instantiate a RAM object from a reference pid. If none is provided, will use the
current process's. The `pid` is used to find children processes if `children`
is True.

Args:
pid (int, optional): Process id (with respect to which we'll look for
children). Defaults to psutil.Process().pid.
children (bool, optional): Look for children of the process when computing
total RAM used. Defaults to True.
tracking_mode (str, optional): "machine" to track the whole machine's RAM,
"process" to track only this process's usage. Defaults to "machine".
"""
self._pid = pid
self._children = children
self._tracking_mode = tracking_mode
# Check if using ARM architecture
self.is_arm_cpu = self._detect_arm_cpu()

def _detect_arm_cpu(self) -> bool:
"""
Detect if the CPU is ARM-based
"""
try:
# Try to detect ARM architecture using platform module
import platform

machine = platform.machine().lower()
return any(arm in machine for arm in ["arm", "aarch"])
except Exception:
# Default to False if detection fails
return False
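The ARM check above only inspects `platform.machine()`; pulled out of the class, the same logic looks like this (a minimal sketch; the machine strings passed in are examples, not an exhaustive list):

```python
# Standalone version of the ARM-architecture check used by the RAM class.
import platform

def is_arm(machine: str = platform.machine()) -> bool:
    """Return True for ARM machine strings such as 'aarch64' or 'armv7l'."""
    m = machine.lower()
    return any(arm in m for arm in ("arm", "aarch"))

print(is_arm("aarch64"), is_arm("armv7l"), is_arm("x86_64"))  # True True False
```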

def _estimate_dimm_count(self, total_gb: float) -> int:
"""
Estimate the number of memory DIMMs based on total memory size
using heuristic rules.

Args:
total_gb: Total RAM in GB

Returns:
int: Estimated number of memory DIMMs
"""
# Typical DIMM sizes in GB
dimm_sizes = [4, 8, 16, 32, 64, 128]

# For very small amounts of RAM (e.g. embedded systems)
if total_gb <= 2:
return 1

# For standard desktop/laptop (4-32GB)
if total_gb <= 32:
# Estimate based on likely configurations (2-4 DIMMs)
return max(2, min(4, int(total_gb / 8) + 1))

# For workstations and small servers (32-128GB)
if total_gb <= 128:
# Likely 4-8 DIMMs
return max(4, min(8, int(total_gb / 16) + 1))

# For larger servers (>128GB)
# Estimate using larger DIMM sizes and more slots
# Most servers have 8-32 DIMM slots
# Try to find the best fit with common DIMM sizes
dimm_count = 8 # Minimum for a large server

# Find the largest common DIMM size that fits
for dimm_size in sorted(dimm_sizes, reverse=True):
if dimm_size <= total_gb / 8: # Assume at least 8 DIMMs
# Calculate how many DIMMs of this size would be needed
dimm_count = math.ceil(total_gb / dimm_size)
# Cap at 32 DIMMs (very large server)
dimm_count = min(dimm_count, 32)
break

return dimm_count

def _calculate_ram_power(self, memory_gb: float) -> float:
"""
Calculate RAM power consumption based on the total RAM size using a more
sophisticated model that better scales with larger memory sizes.

Args:
memory_gb: Total RAM in GB

Returns:
float: Estimated power consumption in watts
"""
# Detect how many DIMMs might be present
dimm_count = self._estimate_dimm_count(memory_gb)

# Base power consumption per DIMM
if self.is_arm_cpu:
# ARM systems typically use lower power memory
base_power_per_dimm = 1.5 # Watts
# Minimum 3W for ARM as requested
min_power = 3.0
else:
# x86 systems
base_power_per_dimm = RAM_SLOT_POWER_X86 # Watts
# Minimum for x86: 2 DIMMs at base power (2 x 4W = 8W)
min_power = base_power_per_dimm * 2

# Estimate power based on DIMM count with decreasing marginal power per DIMM as count increases
if dimm_count <= 4:
# Small systems: full power per DIMM
total_power = base_power_per_dimm * dimm_count
elif dimm_count <= 8:
# Medium systems: slight efficiency at scale
total_power = base_power_per_dimm * 4 + base_power_per_dimm * 0.9 * (
dimm_count - 4
)
elif dimm_count <= 16:
# Larger systems: better efficiency at scale
total_power = (
base_power_per_dimm * 4
+ base_power_per_dimm * 0.9 * 4
+ base_power_per_dimm * 0.8 * (dimm_count - 8)
)
else:
# Very large systems: high efficiency at scale
total_power = (
base_power_per_dimm * 4
+ base_power_per_dimm * 0.9 * 4
+ base_power_per_dimm * 0.8 * 8
+ base_power_per_dimm * 0.7 * (dimm_count - 16)
)

# Apply minimum power constraint
return max(min_power, total_power)
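Taken together, `_estimate_dimm_count` and the tiered scaling in `_calculate_ram_power` can be condensed into a standalone sketch. It assumes the 4W x86 base power from `RAM_SLOT_POWER_X86`; this is an illustration of the heuristic, not the library's API:

```python
# Sketch of the DIMM-count power heuristic: estimate the DIMM count from total
# RAM, then apply full power to the first 4 DIMMs and 90%/80%/70% to later tiers.
import math

def estimate_dimm_count(total_gb: float) -> int:
    """Estimate the number of DIMMs from total RAM size (GB)."""
    dimm_sizes = [4, 8, 16, 32, 64, 128]
    if total_gb <= 2:
        return 1
    if total_gb <= 32:                       # desktop/laptop: 2-4 DIMMs
        return max(2, min(4, int(total_gb / 8) + 1))
    if total_gb <= 128:                      # workstation/small server: 4-8 DIMMs
        return max(4, min(8, int(total_gb / 16) + 1))
    dimm_count = 8                           # minimum for a large server
    for dimm_size in sorted(dimm_sizes, reverse=True):
        if dimm_size <= total_gb / 8:        # largest size giving >= 8 DIMMs
            dimm_count = min(math.ceil(total_gb / dimm_size), 32)
            break
    return dimm_count

def estimate_ram_power(total_gb: float, base: float = 4.0) -> float:
    """Tiered power model with a 2-DIMM minimum (x86 defaults)."""
    n = estimate_dimm_count(total_gb)
    tiers = [(4, 1.0), (4, 0.9), (8, 0.8), (float("inf"), 0.7)]
    power, remaining = 0.0, n
    for size, factor in tiers:
        take = min(remaining, size)
        power += base * factor * take
        remaining -= take
        if remaining == 0:
            break
    return max(power, base * 2)              # at least 2 DIMMs' worth

print(estimate_ram_power(8))                 # 8.0  (2 DIMMs)
print(estimate_ram_power(32))                # 16.0 (4 DIMMs)
print(round(estimate_ram_power(128), 1))     # 30.4 (8 DIMMs, efficiency scaling)
```

Note the quirk this exposes: because the large-server branch assumes at least 8 DIMMs of the largest fitting size, 1TB also maps to 8 (128GB) DIMMs and the same ~30.4W.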

def _get_children_memories(self):
"""
Compute the used RAM by the process's children

Returns:
list(int): The list of RAM values
"""
current_process = psutil.Process(self._pid)
children = current_process.children(recursive=True)
return [child.memory_info().rss for child in children]

def _read_slurm_scontrol(self):
try:
logger.debug(
"SLURM environment detected, running `scontrol show job $SLURM_JOB_ID`..."
)
return (
subprocess.check_output(
[f"scontrol show job {SLURM_JOB_ID}"], shell=True
)
.decode()
.strip()
)
except subprocess.CalledProcessError:
return

def _parse_scontrol_memory_GB(self, mem):
"""
Parse the memory string (B) returned by scontrol to a float (GB)

Args:
mem (str): Memory string (B) as `[amount][unit]` (e.g. `128G`)

Returns:
float: Memory (GB)
"""
nb = int(mem[:-1])
unit = mem[-1]
if unit == "T":
return nb * 1000
if unit == "G":
return nb
if unit == "M":
return nb / 1000
if unit == "K":
return nb / (1000**2)

def _parse_scontrol(self, scontrol_str):
mem_matches = re.findall(r"AllocTRES=.*?,mem=(\d+[A-Z])", scontrol_str)
if len(mem_matches) == 0:
# Try with TRES, see https://github.com/mlco2/codecarbon/issues/569#issuecomment-2167706145
mem_matches = re.findall(r"TRES=.*?,mem=(\d+[A-Z])", scontrol_str)
if len(mem_matches) == 0:
logger.warning(
"Could not find mem= after running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available RAM. Using the machine's total RAM."
)
return psutil.virtual_memory().total / B_TO_GB
if len(mem_matches) > 1:
logger.warning(
"Unexpected output after running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available RAM. Using the machine's total RAM."
)
return psutil.virtual_memory().total / B_TO_GB

return mem_matches[0].replace("mem=", "")
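The two-step SLURM parsing above (extract the `mem=` field from `scontrol show job` output, then convert its unit suffix) can be sketched independently. The sample `scontrol` line below is fabricated for illustration, and the fallback to the machine's total RAM is simplified to returning `None`:

```python
# Sketch of SLURM allocated-memory parsing: regex extraction + unit conversion.
import re
from typing import Optional

def parse_scontrol_mem(scontrol_str: str) -> Optional[str]:
    """Extract the allocated memory field, e.g. '128G'."""
    matches = re.findall(r"AllocTRES=.*?,mem=(\d+[A-Z])", scontrol_str)
    if not matches:
        # Some SLURM versions only report TRES=
        matches = re.findall(r"TRES=.*?,mem=(\d+[A-Z])", scontrol_str)
    return matches[0] if len(matches) == 1 else None

def mem_to_GB(mem: str) -> float:
    """Convert '[amount][unit]' (K/M/G/T) to GB."""
    nb, unit = int(mem[:-1]), mem[-1]
    factor = {"T": 1000.0, "G": 1.0, "M": 1e-3, "K": 1e-6}[unit]
    return nb * factor

sample = "JobId=42 AllocTRES=cpu=4,mem=128G,node=1"
mem = parse_scontrol_mem(sample)
print(mem, mem_to_GB(mem))  # 128G 128.0
```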

@property
def slurm_memory_GB(self):
"""
Property to compute the SLURM-available RAM in GigaBytes.

Returns:
float: Memory allocated to the job (GB)
"""
# Prevent calling scontrol at each measurement
if self.memory_size:
return self.memory_size
scontrol_str = self._read_slurm_scontrol()
if scontrol_str is None:
logger.warning(
"Error running `scontrol show job $SLURM_JOB_ID` "
+ "to retrieve SLURM-available RAM. "
+ "Using the machine's total RAM."
)
return psutil.virtual_memory().total / B_TO_GB
mem = self._parse_scontrol(scontrol_str)
if isinstance(mem, str):
mem = self._parse_scontrol_memory_GB(mem)
self.memory_size = mem
return mem

@property
def process_memory_GB(self):
"""
Property to compute the process's total memory usage in gigabytes.

Returns:
float: RAM usage (GB)
"""
children_memories = self._get_children_memories() if self._children else []
main_memory = psutil.Process(self._pid).memory_info().rss
memories = children_memories + [main_memory]
return sum([m for m in memories if m] + [0]) / B_TO_GB

@property
def machine_memory_GB(self):
"""
Property to compute the machine's total memory in bytes.

Returns:
float: Total RAM (GB)
"""
return (
self.slurm_memory_GB
if SLURM_JOB_ID
else psutil.virtual_memory().total / B_TO_GB
)

def total_power(self) -> Power:
"""
Compute the Power consumed by the RAM: for the whole machine when
`tracking_mode` is "machine", else for the current process (and its
children if `children` was True in __init__).

Returns:
Power: estimated power consumption, using the DIMM-count power model
"""
try:
memory_GB = (
self.machine_memory_GB
if self._tracking_mode == "machine"
else self.process_memory_GB
)
ram_power = Power.from_watts(self._calculate_ram_power(memory_GB))
logger.debug(
f"RAM power estimation: {ram_power.W:.2f}W for {memory_GB:.2f}GB"
)
except Exception as e:
logger.warning(f"Could not measure RAM Power ({str(e)})")
ram_power = Power.from_watts(0)

return ram_power


@dataclass
class AppleSiliconChip(BaseHardware):
def __init__(