Skip to content

Include direct memory and non-heap memory in ML memory calculations (take #2) #128742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
*/
final class JvmErgonomics {

static final double DIRECT_MEMORY_TO_HEAP_FACTOR = 0.5;

private JvmErgonomics() {
throw new AssertionError("No instances intended");
}
Expand All @@ -44,7 +46,7 @@ static List<String> choose(final List<String> userDefinedJvmOptions, Settings no
final long heapSize = JvmOption.extractMaxHeapSize(finalJvmOptions);
final long maxDirectMemorySize = JvmOption.extractMaxDirectMemorySize(finalJvmOptions);
if (maxDirectMemorySize == 0) {
ergonomicChoices.add("-XX:MaxDirectMemorySize=" + heapSize / 2);
ergonomicChoices.add("-XX:MaxDirectMemorySize=" + (long) (DIRECT_MEMORY_TO_HEAP_FACTOR * heapSize));
}

final boolean tuneG1GCForSmallHeap = tuneG1GCForSmallHeap(heapSize);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.FeatureFlag;
import org.elasticsearch.node.NodeRoleSettings;

import java.io.IOException;
Expand All @@ -37,6 +38,8 @@ public class MachineDependentHeap {
protected static final long MAX_HEAP_SIZE = GB * 31; // 31GB
protected static final long MIN_HEAP_SIZE = 1024 * 1024 * 128; // 128MB

private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@davidkyle Added a feature flag instead of a kill switch, because that seems to be the pattern. All feature flags need to be enabled in snapshot mode, which doesn't work with a kill switch.


public MachineDependentHeap() {}

/**
Expand Down Expand Up @@ -76,12 +79,16 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
/*
* Machine learning only node.
*
* <p>Heap is computed as:
* <ul>
* <li>40% of total system memory when total system memory 16 gigabytes or less.</li>
* <li>40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.</li>
* <li>The absolute maximum heap size is 31 gigabytes.</li>
* </ul>
* The memory reserved for Java is computed as:
* - 40% of total system memory when total system memory 16 gigabytes or less.
* - 40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.
* - The absolute maximum heap size is 31 gigabytes.
*
* This Java memory is divided as follows:
* - 2/3 of the Java memory is reserved for the Java heap.
* - 1/3 of the Java memory is reserved for the Java direct memory.
*
* The direct memory being half of the heap is set by the JvmErgonomics class.
*
* In all cases the result is rounded down to the next whole multiple of 4 megabytes.
* The reason for doing this is that Java will round requested heap sizes to a multiple
Expand All @@ -95,13 +102,22 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
*
* If this formula is changed then corresponding changes must be made to the {@code NativeMemoryCalculator} and
* {@code MlAutoscalingDeciderServiceTests} classes in the ML plugin code. Failure to keep the logic synchronized
* could result in repeated autoscaling up and down.
* could result in ML processes crashing with OOM errors or repeated autoscaling up and down.
*/
case ML_ONLY -> {
if (availableMemory <= (GB * 16)) {
yield mb((long) (availableMemory * .4), 4);
double heapFractionBelow16GB = 0.4;
double heapFractionAbove16GB = 0.1;
if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
}
if (availableMemory <= GB * 16) {
yield mb((long) (availableMemory * heapFractionBelow16GB), 4);
} else {
yield mb((long) min((GB * 16) * .4 + (availableMemory - GB * 16) * .1, MAX_HEAP_SIZE), 4);
yield mb(
(long) min(GB * 16 * heapFractionBelow16GB + (availableMemory - GB * 16) * heapFractionAbove16GB, MAX_HEAP_SIZE),
4
);
}
}
/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@ public void testMasterOnlyOptions() throws Exception {
}

public void testMlOnlyOptions() throws Exception {
assertHeapOptions(1, containsInAnyOrder("-Xmx408m", "-Xms408m"), "ml");
assertHeapOptions(4, containsInAnyOrder("-Xmx1636m", "-Xms1636m"), "ml");
assertHeapOptions(32, containsInAnyOrder("-Xmx8192m", "-Xms8192m"), "ml");
assertHeapOptions(64, containsInAnyOrder("-Xmx11468m", "-Xms11468m"), "ml");
assertHeapOptions(1, containsInAnyOrder("-Xmx272m", "-Xms272m"), "ml");
assertHeapOptions(4, containsInAnyOrder("-Xmx1092m", "-Xms1092m"), "ml");
assertHeapOptions(32, containsInAnyOrder("-Xmx5460m", "-Xms5460m"), "ml");
assertHeapOptions(64, containsInAnyOrder("-Xmx7644m", "-Xms7644m"), "ml");
// We'd never see a node this big in Cloud, but this assertion proves that the 31GB absolute maximum
// eventually kicks in (because 0.4 * 16 + 0.1 * (263 - 16) > 31)
assertHeapOptions(263, containsInAnyOrder("-Xmx31744m", "-Xms31744m"), "ml");
assertHeapOptions(263, containsInAnyOrder("-Xmx21228m", "-Xms21228m"), "ml");
}

public void testDataNodeOptions() throws Exception {
Expand Down
5 changes: 5 additions & 0 deletions docs/changelog/128742.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 128742
summary: "Account for Java direct memory on machine learning nodes to prevent out-of-memory crashes."
area: Machine Learning
type: bug
issues: []
17 changes: 10 additions & 7 deletions server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,7 @@ public class JvmInfo implements ReportingService.Info {
long nonHeapInit = memoryMXBean.getNonHeapMemoryUsage().getInit() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getInit();
long nonHeapMax = memoryMXBean.getNonHeapMemoryUsage().getMax() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getMax();
long directMemoryMax = 0;
try {
Class<?> vmClass = Class.forName("sun.misc.VM");
directMemoryMax = (Long) vmClass.getMethod("maxDirectMemory").invoke(null);
} catch (Exception t) {
// ignore
}
String[] inputArguments = runtimeMXBean.getInputArguments().toArray(new String[runtimeMXBean.getInputArguments().size()]);
Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);

String bootClassPath;
try {
Expand Down Expand Up @@ -130,6 +123,11 @@ public class JvmInfo implements ReportingService.Info {
configuredMaxHeapSize = Long.parseLong((String) valueMethod.invoke(maxHeapSizeVmOptionObject));
} catch (Exception ignored) {}

try {
Object maxDirectMemorySizeVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "MaxDirectMemorySize");
directMemoryMax = Long.parseLong((String) valueMethod.invoke(maxDirectMemorySizeVmOptionObject));
} catch (Exception ignored) {}

try {
Object useSerialGCVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "UseSerialGC");
useSerialGC = (String) valueMethod.invoke(useSerialGCVmOptionObject);
Expand All @@ -139,6 +137,8 @@ public class JvmInfo implements ReportingService.Info {

}

Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);

INSTANCE = new JvmInfo(
ProcessHandle.current().pid(),
System.getProperty("java.version"),
Expand Down Expand Up @@ -496,5 +496,8 @@ public ByteSizeValue getHeapMax() {
return ByteSizeValue.ofBytes(heapMax);
}

public ByteSizeValue getTotalMax() {
return ByteSizeValue.ofBytes(heapMax + nonHeapMax + directMemoryMax);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.Processors;
import org.elasticsearch.common.util.FeatureFlag;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.env.Environment;
Expand Down Expand Up @@ -557,6 +558,8 @@ public class MachineLearning extends Plugin
License.OperationMode.PLATINUM
);

private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");

@Override
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
if (this.enabled == false) {
Expand Down Expand Up @@ -845,7 +848,12 @@ public Settings additionalSettings() {
machineMemoryAttrName,
Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes())
);
addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(Runtime.getRuntime().maxMemory()));

long jvmSize = Runtime.getRuntime().maxMemory();
if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
jvmSize = JvmInfo.jvmInfo().getMem().getTotalMax().getBytes();
}
addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(jvmSize));
addMlNodeAttribute(
additionalSettings,
deprecatedAllocatedProcessorsAttrName,
Expand Down
Loading