Skip to content

Commit adf4d10

Browse files
authored
Setting for estimated shard heap allocation decider (#128722)
This PR adds a new setting to toggle the collection of shard heap usages and wires ShardHeapUsage into ClusterInfoSimulator. Relates: #128723
1 parent 7fb130c commit adf4d10

File tree

8 files changed

+90
-17
lines changed

8 files changed

+90
-17
lines changed

modules/data-streams/src/test/java/org/elasticsearch/datastreams/LookAHeadTimeTests.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,8 @@ public void testLookAheadTimeSettingHigherThanTimeSeriesPollIntervalSetting() {
118118
updateIndexSettings(indexSettings);
119119
}
120120

121-
private void updateClusterSettings(Settings settings) {
121+
@Override
122+
protected void updateClusterSettings(Settings settings) {
122123
clusterAdmin().updateSettings(
123124
new ClusterUpdateSettingsRequest(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT).persistentSettings(settings)
124125
).actionGet();

server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -264,14 +264,33 @@ public void testExpectedShardSizeIsPresent() throws InterruptedException {
264264
public void testHeapUsageEstimateIsPresent() {
265265
InternalClusterInfoService clusterInfoService = (InternalClusterInfoService) getInstanceFromNode(ClusterInfoService.class);
266266
ClusterInfoServiceUtils.refresh(clusterInfoService);
267-
ClusterState state = getInstanceFromNode(ClusterService.class).state();
268267
Map<String, ShardHeapUsage> shardHeapUsages = clusterInfoService.getClusterInfo().getShardHeapUsages();
269268
assertNotNull(shardHeapUsages);
270-
assertEquals(state.nodes().size(), shardHeapUsages.size());
271-
for (DiscoveryNode node : state.nodes()) {
272-
assertTrue(shardHeapUsages.containsKey(node.getId()));
273-
ShardHeapUsage shardHeapUsage = shardHeapUsages.get(node.getId());
274-
assertThat(shardHeapUsage.estimatedFreeBytes(), lessThanOrEqualTo(shardHeapUsage.totalBytes()));
269+
// Shard heap usages are not collected yet because the setting is disabled by default
270+
assertTrue(shardHeapUsages.isEmpty());
271+
272+
// Enable collection for shard heap usages
273+
updateClusterSettings(
274+
Settings.builder()
275+
.put(InternalClusterInfoService.CLUSTER_ROUTING_ALLOCATION_SHARD_HEAP_THRESHOLD_DECIDER_ENABLED.getKey(), true)
276+
.build()
277+
);
278+
try {
279+
ClusterInfoServiceUtils.refresh(clusterInfoService);
280+
ClusterState state = getInstanceFromNode(ClusterService.class).state();
281+
shardHeapUsages = clusterInfoService.getClusterInfo().getShardHeapUsages();
282+
assertEquals(state.nodes().size(), shardHeapUsages.size());
283+
for (DiscoveryNode node : state.nodes()) {
284+
assertTrue(shardHeapUsages.containsKey(node.getId()));
285+
ShardHeapUsage shardHeapUsage = shardHeapUsages.get(node.getId());
286+
assertThat(shardHeapUsage.estimatedFreeBytes(), lessThanOrEqualTo(shardHeapUsage.totalBytes()));
287+
}
288+
} finally {
289+
updateClusterSettings(
290+
Settings.builder()
291+
.putNull(InternalClusterInfoService.CLUSTER_ROUTING_ALLOCATION_SHARD_HEAP_THRESHOLD_DECIDER_ENABLED.getKey())
292+
.build()
293+
);
275294
}
276295
}
277296

server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ public class ClusterInfoSimulator {
3333
private final CopyOnFirstWriteMap<String, Long> shardSizes;
3434
private final Map<ShardId, Long> shardDataSetSizes;
3535
private final Map<NodeAndShard, String> dataPath;
36+
private final Map<String, ShardHeapUsage> shardHeapUsages;
3637

3738
public ClusterInfoSimulator(RoutingAllocation allocation) {
3839
this.allocation = allocation;
@@ -41,6 +42,7 @@ public ClusterInfoSimulator(RoutingAllocation allocation) {
4142
this.shardSizes = new CopyOnFirstWriteMap<>(allocation.clusterInfo().shardSizes);
4243
this.shardDataSetSizes = Map.copyOf(allocation.clusterInfo().shardDataSetSizes);
4344
this.dataPath = Map.copyOf(allocation.clusterInfo().dataPath);
45+
this.shardHeapUsages = allocation.clusterInfo().getShardHeapUsages();
4446
}
4547

4648
/**
@@ -154,7 +156,7 @@ public ClusterInfo getClusterInfo() {
154156
shardDataSetSizes,
155157
dataPath,
156158
Map.of(),
157-
Map.of()
159+
shardHeapUsages
158160
);
159161
}
160162
}

server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java

Lines changed: 47 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,15 @@ public class InternalClusterInfoService implements ClusterInfoService, ClusterSt
8383
Property.NodeScope
8484
);
8585

86+
public static final Setting<Boolean> CLUSTER_ROUTING_ALLOCATION_SHARD_HEAP_THRESHOLD_DECIDER_ENABLED = Setting.boolSetting(
87+
"cluster.routing.allocation.shard_heap.threshold_enabled",
88+
false,
89+
Property.Dynamic,
90+
Property.NodeScope
91+
);
92+
8693
private volatile boolean diskThresholdEnabled;
94+
private volatile boolean shardHeapThresholdEnabled;
8795
private volatile TimeValue updateFrequency;
8896
private volatile TimeValue fetchTimeout;
8997

@@ -130,12 +138,20 @@ public InternalClusterInfoService(
130138
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING,
131139
this::setDiskThresholdEnabled
132140
);
141+
clusterSettings.initializeAndWatch(
142+
CLUSTER_ROUTING_ALLOCATION_SHARD_HEAP_THRESHOLD_DECIDER_ENABLED,
143+
this::setShardHeapThresholdEnabled
144+
);
133145
}
134146

135147
private void setDiskThresholdEnabled(boolean diskThresholdEnabled) {
136148
this.diskThresholdEnabled = diskThresholdEnabled;
137149
}
138150

151+
private void setShardHeapThresholdEnabled(boolean shardHeapThresholdEnabled) {
152+
this.shardHeapThresholdEnabled = shardHeapThresholdEnabled;
153+
}
154+
139155
private void setFetchTimeout(TimeValue fetchTimeout) {
140156
this.fetchTimeout = fetchTimeout;
141157
}
@@ -185,20 +201,44 @@ void execute() {
185201
logger.trace("starting async refresh");
186202

187203
try (var ignoredRefs = fetchRefs) {
188-
if (diskThresholdEnabled) {
189-
try (var ignored = threadPool.getThreadContext().clearTraceContext()) {
190-
fetchIndicesStats();
191-
}
192-
} else {
193-
logger.trace("skipping collecting disk usage info from cluster, notifying listeners with empty cluster info");
194-
indicesStatsSummary = IndicesStatsSummary.EMPTY;
204+
maybeFetchIndicesStats(diskThresholdEnabled);
205+
maybeFetchNodeStats(diskThresholdEnabled || shardHeapThresholdEnabled);
206+
maybeFetchNodesHeapUsage(shardHeapThresholdEnabled);
207+
}
208+
}
209+
210+
private void maybeFetchIndicesStats(boolean shouldFetch) {
211+
if (shouldFetch) {
212+
try (var ignored = threadPool.getThreadContext().clearTraceContext()) {
213+
fetchIndicesStats();
195214
}
215+
} else {
216+
logger.trace("skipping collecting disk usage info from cluster, notifying listeners with empty indices stats");
217+
indicesStatsSummary = IndicesStatsSummary.EMPTY;
218+
}
219+
}
220+
221+
private void maybeFetchNodeStats(boolean shouldFetch) {
222+
if (shouldFetch) {
196223
try (var ignored = threadPool.getThreadContext().clearTraceContext()) {
197224
fetchNodeStats();
198225
}
226+
} else {
227+
logger.trace("skipping collecting node stats from cluster, notifying listeners with empty node stats");
228+
leastAvailableSpaceUsages = Map.of();
229+
mostAvailableSpaceUsages = Map.of();
230+
maxHeapPerNode = Map.of();
231+
}
232+
}
233+
234+
private void maybeFetchNodesHeapUsage(boolean shouldFetch) {
235+
if (shouldFetch) {
199236
try (var ignored = threadPool.getThreadContext().clearTraceContext()) {
200237
fetchNodesHeapUsage();
201238
}
239+
} else {
240+
logger.trace("skipping collecting shard heap usage from cluster, notifying listeners with empty shard heap usage");
241+
shardHeapUsagePerNode = Map.of();
202242
}
203243
}
204244

server/src/main/java/org/elasticsearch/cluster/ShardHeapUsage.java

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,10 @@ public double estimatedFreeBytesAsPercentage() {
4545
}
4646

4747
public double estimatedUsageAsPercentage() {
48-
return 100.0 * estimatedUsageBytes / (double) totalBytes;
48+
return 100.0 * estimatedUsageAsRatio();
49+
}
50+
51+
public double estimatedUsageAsRatio() {
52+
return estimatedUsageBytes / (double) totalBytes;
4953
}
5054
}

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -292,6 +292,7 @@ public void apply(Settings value, Settings current, Settings previous) {
292292
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING,
293293
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING,
294294
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING,
295+
InternalClusterInfoService.CLUSTER_ROUTING_ALLOCATION_SHARD_HEAP_THRESHOLD_DECIDER_ENABLED,
295296
SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING,
296297
InternalClusterInfoService.INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING,
297298
InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING,

server/src/test/java/org/elasticsearch/cluster/InternalClusterInfoServiceSchedulingTests.java

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,9 @@ public void testScheduling() {
5353
final DiscoveryNodes noMaster = DiscoveryNodes.builder().add(discoveryNode).localNodeId(discoveryNode.getId()).build();
5454
final DiscoveryNodes localMaster = noMaster.withMasterNodeId(discoveryNode.getId());
5555

56-
final Settings.Builder settingsBuilder = Settings.builder().put(Node.NODE_NAME_SETTING.getKey(), discoveryNode.getName());
56+
final Settings.Builder settingsBuilder = Settings.builder()
57+
.put(Node.NODE_NAME_SETTING.getKey(), discoveryNode.getName())
58+
.put(InternalClusterInfoService.CLUSTER_ROUTING_ALLOCATION_SHARD_HEAP_THRESHOLD_DECIDER_ENABLED.getKey(), true);
5759
if (randomBoolean()) {
5860
settingsBuilder.put(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING.getKey(), randomIntBetween(10000, 60000) + "ms");
5961
}

test/framework/src/main/java/org/elasticsearch/test/ESSingleNodeTestCase.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -536,4 +536,8 @@ protected final void deletePipeline(String id) {
536536
)
537537
);
538538
}
539+
540+
protected void updateClusterSettings(Settings settings) {
541+
safeGet(clusterAdmin().prepareUpdateSettings(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT).setPersistentSettings(settings).execute());
542+
}
539543
}

0 commit comments

Comments
 (0)