Skip to content

Commit ec87265

Browse files
Include direct memory and non-heap memory in ML memory calculations (take elastic#2) (elastic#128742)
* Include direct memory and non-heap memory in ML memory calculations.
* Reduce ML_ONLY heap size, so that direct memory is accounted for.
* [CI] Auto commit changes from spotless
* changelog
* improve docs
* Reuse direct memory to heap factor
* feature flag

---------

Co-authored-by: elasticsearchmachine <[email protected]>
1 parent 57921fb commit ec87265

File tree

6 files changed

+58
-24
lines changed

6 files changed

+58
-24
lines changed

distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
*/
2929
final class JvmErgonomics {
3030

31+
static final double DIRECT_MEMORY_TO_HEAP_FACTOR = 0.5;
32+
3133
private JvmErgonomics() {
3234
throw new AssertionError("No instances intended");
3335
}
@@ -44,7 +46,7 @@ static List<String> choose(final List<String> userDefinedJvmOptions, Settings no
4446
final long heapSize = JvmOption.extractMaxHeapSize(finalJvmOptions);
4547
final long maxDirectMemorySize = JvmOption.extractMaxDirectMemorySize(finalJvmOptions);
4648
if (maxDirectMemorySize == 0) {
47-
ergonomicChoices.add("-XX:MaxDirectMemorySize=" + heapSize / 2);
49+
ergonomicChoices.add("-XX:MaxDirectMemorySize=" + (long) (DIRECT_MEMORY_TO_HEAP_FACTOR * heapSize));
4850
}
4951

5052
final boolean tuneG1GCForSmallHeap = tuneG1GCForSmallHeap(heapSize);

distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
1313
import org.elasticsearch.common.settings.Settings;
14+
import org.elasticsearch.common.util.FeatureFlag;
1415
import org.elasticsearch.node.NodeRoleSettings;
1516

1617
import java.io.IOException;
@@ -37,6 +38,8 @@ public class MachineDependentHeap {
3738
protected static final long MAX_HEAP_SIZE = GB * 31; // 31GB
3839
protected static final long MIN_HEAP_SIZE = 1024 * 1024 * 128; // 128MB
3940

41+
private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
42+
4043
public MachineDependentHeap() {}
4144

4245
/**
@@ -76,12 +79,16 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
7679
/*
7780
* Machine learning only node.
7881
*
79-
* <p>Heap is computed as:
80-
* <ul>
81-
* <li>40% of total system memory when total system memory 16 gigabytes or less.</li>
82-
* <li>40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.</li>
83-
* <li>The absolute maximum heap size is 31 gigabytes.</li>
84-
* </ul>
82+
* The memory reserved for Java is computed as:
83+
* - 40% of total system memory when total system memory is 16 gigabytes or less.
84+
* - 40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.
85+
* - The absolute maximum heap size is 31 gigabytes.
86+
*
87+
* This Java memory is divided as follows:
88+
* - 2/3 of the Java memory is reserved for the Java heap.
89+
* - 1/3 of the Java memory is reserved for the Java direct memory.
90+
*
91+
* The direct memory being half of the heap is set by the JvmErgonomics class.
8592
*
8693
* In all cases the result is rounded down to the next whole multiple of 4 megabytes.
8794
* The reason for doing this is that Java will round requested heap sizes to a multiple
@@ -95,13 +102,22 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
95102
*
96103
* If this formula is changed then corresponding changes must be made to the {@code NativeMemoryCalculator} and
97104
* {@code MlAutoscalingDeciderServiceTests} classes in the ML plugin code. Failure to keep the logic synchronized
98-
* could result in repeated autoscaling up and down.
105+
* could result in ML processes crashing with OOM errors or repeated autoscaling up and down.
99106
*/
100107
case ML_ONLY -> {
101-
if (availableMemory <= (GB * 16)) {
102-
yield mb((long) (availableMemory * .4), 4);
108+
double heapFractionBelow16GB = 0.4;
109+
double heapFractionAbove16GB = 0.1;
110+
if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
111+
heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
112+
heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
113+
}
114+
if (availableMemory <= GB * 16) {
115+
yield mb((long) (availableMemory * heapFractionBelow16GB), 4);
103116
} else {
104-
yield mb((long) min((GB * 16) * .4 + (availableMemory - GB * 16) * .1, MAX_HEAP_SIZE), 4);
117+
yield mb(
118+
(long) min(GB * 16 * heapFractionBelow16GB + (availableMemory - GB * 16) * heapFractionAbove16GB, MAX_HEAP_SIZE),
119+
4
120+
);
105121
}
106122
}
107123
/*

distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ public void testMasterOnlyOptions() throws Exception {
5858
}
5959

6060
public void testMlOnlyOptions() throws Exception {
61-
assertHeapOptions(1, containsInAnyOrder("-Xmx408m", "-Xms408m"), "ml");
62-
assertHeapOptions(4, containsInAnyOrder("-Xmx1636m", "-Xms1636m"), "ml");
63-
assertHeapOptions(32, containsInAnyOrder("-Xmx8192m", "-Xms8192m"), "ml");
64-
assertHeapOptions(64, containsInAnyOrder("-Xmx11468m", "-Xms11468m"), "ml");
61+
assertHeapOptions(1, containsInAnyOrder("-Xmx272m", "-Xms272m"), "ml");
62+
assertHeapOptions(4, containsInAnyOrder("-Xmx1092m", "-Xms1092m"), "ml");
63+
assertHeapOptions(32, containsInAnyOrder("-Xmx5460m", "-Xms5460m"), "ml");
64+
assertHeapOptions(64, containsInAnyOrder("-Xmx7644m", "-Xms7644m"), "ml");
6565
// We'd never see a node this big in Cloud. Under the old formula this size hit the 31GB absolute maximum
6666
// (because 0.4 * 16 + 0.1 * (263 - 16) > 31); with direct memory accounted for, (0.4 * 16 + 0.1 * (263 - 16)) / 1.5 < 31, so the cap is not reached here
67-
assertHeapOptions(263, containsInAnyOrder("-Xmx31744m", "-Xms31744m"), "ml");
67+
assertHeapOptions(263, containsInAnyOrder("-Xmx21228m", "-Xms21228m"), "ml");
6868
}
6969

7070
public void testDataNodeOptions() throws Exception {

docs/changelog/128742.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 128742
2+
summary: "Account for Java direct memory on machine learning nodes to prevent out-of-memory crashes."
3+
area: Machine Learning
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,7 @@ public class JvmInfo implements ReportingService.Info {
4646
long nonHeapInit = memoryMXBean.getNonHeapMemoryUsage().getInit() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getInit();
4747
long nonHeapMax = memoryMXBean.getNonHeapMemoryUsage().getMax() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getMax();
4848
long directMemoryMax = 0;
49-
try {
50-
Class<?> vmClass = Class.forName("sun.misc.VM");
51-
directMemoryMax = (Long) vmClass.getMethod("maxDirectMemory").invoke(null);
52-
} catch (Exception t) {
53-
// ignore
54-
}
5549
String[] inputArguments = runtimeMXBean.getInputArguments().toArray(new String[runtimeMXBean.getInputArguments().size()]);
56-
Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);
5750

5851
String bootClassPath;
5952
try {
@@ -133,6 +126,11 @@ public class JvmInfo implements ReportingService.Info {
133126
configuredMaxHeapSize = Long.parseLong((String) valueMethod.invoke(maxHeapSizeVmOptionObject));
134127
} catch (Exception ignored) {}
135128

129+
try {
130+
Object maxDirectMemorySizeVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "MaxDirectMemorySize");
131+
directMemoryMax = Long.parseLong((String) valueMethod.invoke(maxDirectMemorySizeVmOptionObject));
132+
} catch (Exception ignored) {}
133+
136134
try {
137135
Object useSerialGCVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "UseSerialGC");
138136
useSerialGC = (String) valueMethod.invoke(useSerialGCVmOptionObject);
@@ -142,6 +140,8 @@ public class JvmInfo implements ReportingService.Info {
142140

143141
}
144142

143+
Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);
144+
145145
INSTANCE = new JvmInfo(
146146
ProcessHandle.current().pid(),
147147
System.getProperty("java.version"),
@@ -510,5 +510,8 @@ public ByteSizeValue getHeapMax() {
510510
return ByteSizeValue.ofBytes(heapMax);
511511
}
512512

513+
public ByteSizeValue getTotalMax() {
514+
return ByteSizeValue.ofBytes(heapMax + nonHeapMax + directMemoryMax);
515+
}
513516
}
514517
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.elasticsearch.common.settings.SettingsModule;
4444
import org.elasticsearch.common.unit.ByteSizeValue;
4545
import org.elasticsearch.common.unit.Processors;
46+
import org.elasticsearch.common.util.FeatureFlag;
4647
import org.elasticsearch.common.util.concurrent.EsExecutors;
4748
import org.elasticsearch.core.TimeValue;
4849
import org.elasticsearch.env.Environment;
@@ -562,6 +563,8 @@ public class MachineLearning extends Plugin
562563
License.OperationMode.PLATINUM
563564
);
564565

566+
private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
567+
565568
@Override
566569
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
567570
if (this.enabled == false) {
@@ -848,7 +851,12 @@ public Settings additionalSettings() {
848851
machineMemoryAttrName,
849852
Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes())
850853
);
851-
addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(Runtime.getRuntime().maxMemory()));
854+
855+
long jvmSize = Runtime.getRuntime().maxMemory();
856+
if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
857+
jvmSize = JvmInfo.jvmInfo().getMem().getTotalMax().getBytes();
858+
}
859+
addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(jvmSize));
852860
addMlNodeAttribute(
853861
additionalSettings,
854862
deprecatedAllocatedProcessorsAttrName,

0 commit comments

Comments
 (0)