elastic · jan-elastic · Jun 10, 2025 · May 23, 2025 · Jun 2, 2025 · Jun 2, 2025
@@ -28,6 +28,8 @@
  */
 final class JvmErgonomics {
 
+    static final double DIRECT_MEMORY_TO_HEAP_FACTOR = 0.5;
+
     private JvmErgonomics() {
         throw new AssertionError("No instances intended");
     }
@@ -44,7 +46,7 @@ static List<String> choose(final List<String> userDefinedJvmOptions, Settings no
         final long heapSize = JvmOption.extractMaxHeapSize(finalJvmOptions);
         final long maxDirectMemorySize = JvmOption.extractMaxDirectMemorySize(finalJvmOptions);
         if (maxDirectMemorySize == 0) {
-            ergonomicChoices.add("-XX:MaxDirectMemorySize=" + heapSize / 2);
+            ergonomicChoices.add("-XX:MaxDirectMemorySize=" + (long) (DIRECT_MEMORY_TO_HEAP_FACTOR * heapSize));
         }
 
         final boolean tuneG1GCForSmallHeap = tuneG1GCForSmallHeap(heapSize);

@@ -11,6 +11,7 @@
 
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.util.FeatureFlag;
 import org.elasticsearch.node.NodeRoleSettings;
 
 import java.io.IOException;
@@ -37,6 +38,8 @@ public class MachineDependentHeap {
     protected static final long MAX_HEAP_SIZE = GB * 31; // 31GB
     protected static final long MIN_HEAP_SIZE = 1024 * 1024 * 128; // 128MB
 
+    private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
+
     public MachineDependentHeap() {}
 
     /**
@@ -76,12 +79,16 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
             /*
              * Machine learning only node.
              *
-             * <p>Heap is computed as:
-             * <ul>
-             *     <li>40% of total system memory when total system memory 16 gigabytes or less.</li>
-             *     <li>40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.</li>
-             *     <li>The absolute maximum heap size is 31 gigabytes.</li>
-             * </ul>
+             * The memory reserved for Java is computed as:
+             *   - 40% of total system memory when total system memory 16 gigabytes or less.
+             *   - 40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.
+             *   - The absolute maximum heap size is 31 gigabytes.
+             *
+             * This Java memory is divided as follows:
+             *     - 2/3 of the Java memory is reserved for the Java heap.
+             *     - 1/3 of the Java memory is reserved for the Java direct memory.
+             *
+             * The direct memory being half of the heap is set by the JvmErgonomics class.
              *
              * In all cases the result is rounded down to the next whole multiple of 4 megabytes.
              * The reason for doing this is that Java will round requested heap sizes to a multiple
@@ -95,13 +102,22 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
              *
              * If this formula is changed then corresponding changes must be made to the {@code NativeMemoryCalculator} and
              * {@code MlAutoscalingDeciderServiceTests} classes in the ML plugin code. Failure to keep the logic synchronized
-             * could result in repeated autoscaling up and down.
+             * could result in ML processes crashing with OOM errors or repeated autoscaling up and down.
              */
             case ML_ONLY -> {
-                if (availableMemory <= (GB * 16)) {
-                    yield mb((long) (availableMemory * .4), 4);
+                double heapFractionBelow16GB = 0.4;
+                double heapFractionAbove16GB = 0.1;
+                if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
+                    heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
+                    heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
+                }
+                if (availableMemory <= GB * 16) {
+                    yield mb((long) (availableMemory * heapFractionBelow16GB), 4);
                 } else {
-                    yield mb((long) min((GB * 16) * .4 + (availableMemory - GB * 16) * .1, MAX_HEAP_SIZE), 4);
+                    yield mb(
+                        (long) min(GB * 16 * heapFractionBelow16GB + (availableMemory - GB * 16) * heapFractionAbove16GB, MAX_HEAP_SIZE),
+                        4
+                    );
                 }
             }
             /*

@@ -56,13 +56,13 @@ public void testMasterOnlyOptions() throws Exception {
     }
 
     public void testMlOnlyOptions() throws Exception {
-        assertHeapOptions(1, containsInAnyOrder("-Xmx408m", "-Xms408m"), "ml");
-        assertHeapOptions(4, containsInAnyOrder("-Xmx1636m", "-Xms1636m"), "ml");
-        assertHeapOptions(32, containsInAnyOrder("-Xmx8192m", "-Xms8192m"), "ml");
-        assertHeapOptions(64, containsInAnyOrder("-Xmx11468m", "-Xms11468m"), "ml");
+        assertHeapOptions(1, containsInAnyOrder("-Xmx272m", "-Xms272m"), "ml");
+        assertHeapOptions(4, containsInAnyOrder("-Xmx1092m", "-Xms1092m"), "ml");
+        assertHeapOptions(32, containsInAnyOrder("-Xmx5460m", "-Xms5460m"), "ml");
+        assertHeapOptions(64, containsInAnyOrder("-Xmx7644m", "-Xms7644m"), "ml");
         // We'd never see a node this big in Cloud, but this assertion proves that the 31GB absolute maximum
         // eventually kicks in (because 0.4 * 16 + 0.1 * (263 - 16) > 31)
-        assertHeapOptions(263, containsInAnyOrder("-Xmx31744m", "-Xms31744m"), "ml");
+        assertHeapOptions(263, containsInAnyOrder("-Xmx21228m", "-Xms21228m"), "ml");
     }
 
     public void testDataNodeOptions() throws Exception {

diff --git a/docs/changelog/128742.yaml b/docs/changelog/128742.yaml
@@ -0,0 +1,5 @@
+pr: 128742
+summary: "Account for Java direct memory on machine learning nodes to prevent out-of-memory crashes."
+area: Machine Learning
+type: bug
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java b/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java
@@ -43,14 +43,7 @@ public class JvmInfo implements ReportingService.Info {
         long nonHeapInit = memoryMXBean.getNonHeapMemoryUsage().getInit() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getInit();
         long nonHeapMax = memoryMXBean.getNonHeapMemoryUsage().getMax() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getMax();
         long directMemoryMax = 0;
-        try {
-            Class<?> vmClass = Class.forName("sun.misc.VM");
-            directMemoryMax = (Long) vmClass.getMethod("maxDirectMemory").invoke(null);
-        } catch (Exception t) {
-            // ignore
-        }
         String[] inputArguments = runtimeMXBean.getInputArguments().toArray(new String[runtimeMXBean.getInputArguments().size()]);
-        Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);
 
         String bootClassPath;
         try {
@@ -130,6 +123,11 @@ public class JvmInfo implements ReportingService.Info {
                 configuredMaxHeapSize = Long.parseLong((String) valueMethod.invoke(maxHeapSizeVmOptionObject));
             } catch (Exception ignored) {}
 
+            try {
+                Object maxDirectMemorySizeVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "MaxDirectMemorySize");
+                directMemoryMax = Long.parseLong((String) valueMethod.invoke(maxDirectMemorySizeVmOptionObject));
+            } catch (Exception ignored) {}
+
             try {
                 Object useSerialGCVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "UseSerialGC");
                 useSerialGC = (String) valueMethod.invoke(useSerialGCVmOptionObject);
@@ -139,6 +137,8 @@ public class JvmInfo implements ReportingService.Info {
 
         }
 
+        Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);
+
         INSTANCE = new JvmInfo(
             ProcessHandle.current().pid(),
             System.getProperty("java.version"),
@@ -496,5 +496,8 @@ public ByteSizeValue getHeapMax() {
             return ByteSizeValue.ofBytes(heapMax);
         }
 
+        public ByteSizeValue getTotalMax() {
+            return ByteSizeValue.ofBytes(heapMax + nonHeapMax + directMemoryMax);
+        }
     }
 }
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java
@@ -41,6 +41,7 @@
 import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.unit.Processors;
+import org.elasticsearch.common.util.FeatureFlag;
 import org.elasticsearch.common.util.concurrent.EsExecutors;
 import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.env.Environment;
@@ -557,6 +558,8 @@ public class MachineLearning extends Plugin
         License.OperationMode.PLATINUM
     );
 
+    private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
+
     @Override
     public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
         if (this.enabled == false) {
@@ -845,7 +848,12 @@ public Settings additionalSettings() {
                 machineMemoryAttrName,
                 Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes())
             );
-            addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(Runtime.getRuntime().maxMemory()));
+
+            long jvmSize = Runtime.getRuntime().maxMemory();
+            if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
+                jvmSize = JvmInfo.jvmInfo().getMem().getTotalMax().getBytes();
+            }
+            addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(jvmSize));
             addMlNodeAttribute(
                 additionalSettings,
                 deprecatedAllocatedProcessorsAttrName,