Skip to content

Commit a75fc33

Browse files
authored
Manage retention of partial snapshots in SLM (#47833)
Currently, partial snapshots accumulate over time unless they are manually deleted. A partial snapshot may be useful when there is no more recent successful snapshot, but it should eventually be deleted once it is no longer useful. With this change, partial snapshots are deleted using the following strategy: if the retention policy configures expire_after, PARTIAL snapshots are kept until that period has passed and are then deleted. If the retention policy has no expire_after, they are deleted once there is at least one more recent successful snapshot from the same policy (until then they are kept, as they may be useful for troubleshooting purposes). Partial snapshots are not counted towards either min_count or max_count.
1 parent 9686324 commit a75fc33

File tree

4 files changed

+141
-40
lines changed

4 files changed

+141
-40
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotRetentionConfiguration.java

+9-7
Original file line numberDiff line numberDiff line change
@@ -127,14 +127,15 @@ public Predicate<SnapshotInfo> getSnapshotDeletionPredicate(final List<SnapshotI
127127
.mapToLong(SnapshotInfo::startTime)
128128
.max()
129129
.orElse(Long.MIN_VALUE);
130+
final Set<SnapshotState> unsuccessfulStates = Set.of(SnapshotState.FAILED, SnapshotState.PARTIAL);
130131

131132
return si -> {
132133
final String snapName = si.snapshotId().getName();
133134

134135
// First, if there's no expire_after and a more recent successful snapshot, we can delete all the failed and partial ones
135-
if (this.expireAfter == null && SnapshotState.FAILED.equals(si.state()) && newestSuccessfulTimestamp > si.startTime()) {
136+
if (this.expireAfter == null && unsuccessfulStates.contains(si.state()) && newestSuccessfulTimestamp > si.startTime()) {
136137
// There's no expire_after and there's a more recent successful snapshot, delete this failed or partial one
137-
logger.trace("[{}]: ELIGIBLE as it is FAILED and there is a more recent successful snapshot", snapName);
138+
logger.trace("[{}]: ELIGIBLE as it is {} and there is a more recent successful snapshot", snapName, si.state());
138139
return true;
139140
}
140141

@@ -167,13 +168,13 @@ public Predicate<SnapshotInfo> getSnapshotDeletionPredicate(final List<SnapshotI
167168
// expiration time
168169
if (this.minimumSnapshotCount != null) {
169170
if (successfulSnapshotCount <= this.minimumSnapshotCount)
170-
if (SnapshotState.FAILED.equals(si.state()) == false) {
171+
if (unsuccessfulStates.contains(si.state()) == false) {
171172
logger.trace("[{}]: INELIGIBLE as there are {} non-failed snapshots ({} total) and {} minimum snapshots needed",
172173
snapName, successfulSnapshotCount, totalSnapshotCount, this.minimumSnapshotCount);
173174
return false;
174175
} else {
175176
logger.trace("[{}]: SKIPPING minimum snapshot count check as this snapshot is {} and not counted " +
176-
"towards the minimum snapshot count.", snapName, SnapshotState.FAILED);
177+
"towards the minimum snapshot count.", snapName, si.state());
177178
}
178179
}
179180

@@ -190,10 +191,11 @@ public Predicate<SnapshotInfo> getSnapshotDeletionPredicate(final List<SnapshotI
190191
final Stream<SnapshotInfo> successfulSnapsEligibleForExpiration = sortedSnapshots.stream()
191192
.filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
192193
.limit(eligibleForExpiration);
193-
final Stream<SnapshotInfo> failedSnaps = sortedSnapshots.stream()
194-
.filter(snap -> SnapshotState.FAILED.equals(snap.state()));
194+
final Stream<SnapshotInfo> unsucessfulSnaps = sortedSnapshots.stream()
195+
.filter(snap -> unsuccessfulStates.contains(snap.state()));
195196

196-
final Set<SnapshotInfo> snapsEligibleForExpiration = Stream.concat(successfulSnapsEligibleForExpiration, failedSnaps)
197+
final Set<SnapshotInfo> snapsEligibleForExpiration = Stream
198+
.concat(successfulSnapsEligibleForExpiration, unsucessfulSnaps)
197199
.collect(Collectors.toSet());
198200

199201
if (snapsEligibleForExpiration.contains(si) == false) {

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionConfigurationTests.java

+92-14
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.elasticsearch.snapshots.SnapshotId;
1212
import org.elasticsearch.snapshots.SnapshotInfo;
1313
import org.elasticsearch.snapshots.SnapshotShardFailure;
14+
import org.elasticsearch.snapshots.SnapshotState;
1415
import org.elasticsearch.test.ESTestCase;
1516
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
1617
import org.elasticsearch.xpack.core.slm.SnapshotRetentionConfiguration;
@@ -103,13 +104,21 @@ public void testMaximum() {
103104
}
104105

105106
public void testFailuresDeletedIfExpired() {
107+
assertUnsuccessfulDeletedIfExpired(true);
108+
}
109+
110+
public void testPartialsDeletedIfExpired() {
111+
assertUnsuccessfulDeletedIfExpired(false);
112+
}
113+
114+
private void assertUnsuccessfulDeletedIfExpired(boolean failure) {
106115
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(
107116
() -> TimeValue.timeValueDays(1).millis() + 1,
108117
TimeValue.timeValueDays(1), null, null);
109-
SnapshotInfo oldInfo = makeFailureInfo(0);
118+
SnapshotInfo oldInfo = makeFailureOrPartial(0, failure);
110119
assertThat(conf.getSnapshotDeletionPredicate(Collections.singletonList(oldInfo)).test(oldInfo), equalTo(true));
111120

112-
SnapshotInfo newInfo = makeFailureInfo(1);
121+
SnapshotInfo newInfo = makeFailureOrPartial(1, failure);
113122
assertThat(conf.getSnapshotDeletionPredicate(Collections.singletonList(newInfo)).test(newInfo), equalTo(false));
114123

115124
List<SnapshotInfo> infos = new ArrayList<>();
@@ -120,10 +129,18 @@ public void testFailuresDeletedIfExpired() {
120129
}
121130

122131
public void testFailuresDeletedIfNoExpiryAndMoreRecentSuccessExists() {
132+
assertUnsuccessfulDeletedIfNoExpiryAndMoreRecentSuccessExists(true);
133+
}
134+
135+
public void testPartialsDeletedIfNoExpiryAndMoreRecentSuccessExists() {
136+
assertUnsuccessfulDeletedIfNoExpiryAndMoreRecentSuccessExists(false);
137+
}
138+
139+
private void assertUnsuccessfulDeletedIfNoExpiryAndMoreRecentSuccessExists(boolean failure) {
123140
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 5);
124141
SnapshotInfo s1 = makeInfo(1);
125142
SnapshotInfo s2 = makeInfo(2);
126-
SnapshotInfo s3 = makeFailureInfo(3);
143+
SnapshotInfo s3 = makeFailureOrPartial(3, failure);
127144
SnapshotInfo s4 = makeInfo(4);
128145

129146
List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4);
@@ -134,12 +151,20 @@ public void testFailuresDeletedIfNoExpiryAndMoreRecentSuccessExists() {
134151
}
135152

136153
public void testFailuresKeptIfNoExpiryAndNoMoreRecentSuccess() {
154+
assertUnsuccessfulKeptIfNoExpiryAndNoMoreRecentSuccess(true);
155+
}
156+
157+
public void testPartialsKeptIfNoExpiryAndNoMoreRecentSuccess() {
158+
assertUnsuccessfulKeptIfNoExpiryAndNoMoreRecentSuccess(false);
159+
}
160+
161+
private void assertUnsuccessfulKeptIfNoExpiryAndNoMoreRecentSuccess(boolean failure) {
137162
// Also tests that failures and partials are not counted towards the maximum
138163
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 3);
139164
SnapshotInfo s1 = makeInfo(1);
140165
SnapshotInfo s2 = makeInfo(2);
141166
SnapshotInfo s3 = makeInfo(3);
142-
SnapshotInfo s4 = makeFailureInfo(4);
167+
SnapshotInfo s4 = makeFailureOrPartial(4, failure);
143168

144169
List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4);
145170
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
@@ -149,11 +174,19 @@ public void testFailuresKeptIfNoExpiryAndNoMoreRecentSuccess() {
149174
}
150175

151176
public void testFailuresNotCountedTowardsMaximum() {
177+
assertUnsuccessfulNotCountedTowardsMaximum(true);
178+
}
179+
180+
public void testPartialsNotCountedTowardsMaximum() {
181+
assertUnsuccessfulNotCountedTowardsMaximum(false);
182+
}
183+
184+
private void assertUnsuccessfulNotCountedTowardsMaximum(boolean failure) {
152185
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, TimeValue.timeValueDays(1), 2, 2);
153186
SnapshotInfo s1 = makeInfo(1);
154-
SnapshotInfo s2 = makeFailureInfo(2);
155-
SnapshotInfo s3 = makeFailureInfo(3);
156-
SnapshotInfo s4 = makeFailureInfo(4);
187+
SnapshotInfo s2 = makeFailureOrPartial(2, failure);
188+
SnapshotInfo s3 = makeFailureOrPartial(3, failure);
189+
SnapshotInfo s4 = makeFailureOrPartial(4, failure);
157190
SnapshotInfo s5 = makeInfo(5);
158191

159192
List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4, s5);
@@ -165,10 +198,18 @@ public void testFailuresNotCountedTowardsMaximum() {
165198
}
166199

167200
public void testFailuresNotCountedTowardsMinimum() {
201+
assertUnsuccessfulNotCountedTowardsMinimum(true);
202+
}
203+
204+
public void testPartialsNotCountedTowardsMinimum() {
205+
assertUnsuccessfulNotCountedTowardsMinimum(false);
206+
}
207+
208+
private void assertUnsuccessfulNotCountedTowardsMinimum(boolean failure) {
168209
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> TimeValue.timeValueDays(1).millis() + 1,
169210
TimeValue.timeValueDays(1), 2, null);
170211
SnapshotInfo oldInfo = makeInfo(0);
171-
SnapshotInfo failureInfo = makeFailureInfo( 1);
212+
SnapshotInfo failureInfo = makeFailureOrPartial(1, failure);
172213
SnapshotInfo newInfo = makeInfo(2);
173214

174215
List<SnapshotInfo> infos = new ArrayList<>();
@@ -186,12 +227,14 @@ public void testFailuresNotCountedTowardsMinimum() {
186227
assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(true));
187228
}
188229

230+
189231
public void testMostRecentSuccessfulTimestampIsUsed() {
232+
boolean failureBeforePartial = randomBoolean();
190233
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 2);
191234
SnapshotInfo s1 = makeInfo(1);
192235
SnapshotInfo s2 = makeInfo(2);
193-
SnapshotInfo s3 = makeFailureInfo(3);
194-
SnapshotInfo s4 = makeFailureInfo(4);
236+
SnapshotInfo s3 = makeFailureOrPartial(3, failureBeforePartial);
237+
SnapshotInfo s4 = makeFailureOrPartial(4, failureBeforePartial == false);
195238

196239
List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4);
197240
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
@@ -204,15 +247,25 @@ private SnapshotInfo makeInfo(long startTime) {
204247
final Map<String, Object> meta = new HashMap<>();
205248
meta.put(SnapshotLifecyclePolicy.POLICY_ID_METADATA_FIELD, REPO);
206249
final int totalShards = between(1,20);
207-
return new SnapshotInfo(new SnapshotId("snap-" + randomAlphaOfLength(3), "uuid"),
250+
SnapshotInfo snapInfo = new SnapshotInfo(new SnapshotId("snap-" + randomAlphaOfLength(3), "uuid"),
208251
Collections.singletonList("foo"),
209252
startTime,
210253
null,
211-
startTime + between(1,10000),
254+
startTime + between(1, 10000),
212255
totalShards,
213256
new ArrayList<>(),
214257
false,
215258
meta);
259+
assertThat(snapInfo.state(), equalTo(SnapshotState.SUCCESS));
260+
return snapInfo;
261+
}
262+
263+
private SnapshotInfo makeFailureOrPartial(long startTime, boolean failure) {
264+
if (failure) {
265+
return makeFailureInfo(startTime);
266+
} else {
267+
return makePartialInfo(startTime);
268+
}
216269
}
217270

218271
private SnapshotInfo makeFailureInfo(long startTime) {
@@ -225,14 +278,39 @@ private SnapshotInfo makeFailureInfo(long startTime) {
225278
failures.add(new SnapshotShardFailure("nodeId", new ShardId("index-name", "index-uuid", i), "failed"));
226279
}
227280
assert failureCount == failures.size();
228-
return new SnapshotInfo(new SnapshotId("snap-fail-" + randomAlphaOfLength(3), "uuid-fail"),
281+
SnapshotInfo snapInfo = new SnapshotInfo(new SnapshotId("snap-fail-" + randomAlphaOfLength(3), "uuid-fail"),
229282
Collections.singletonList("foo-fail"),
230283
startTime,
231284
"forced-failure",
232-
startTime + between(1,10000),
285+
startTime + between(1, 10000),
286+
totalShards,
287+
failures,
288+
randomBoolean(),
289+
meta);
290+
assertThat(snapInfo.state(), equalTo(SnapshotState.FAILED));
291+
return snapInfo;
292+
}
293+
294+
private SnapshotInfo makePartialInfo(long startTime) {
295+
final Map<String, Object> meta = new HashMap<>();
296+
meta.put(SnapshotLifecyclePolicy.POLICY_ID_METADATA_FIELD, REPO);
297+
final int totalShards = between(2,20);
298+
final List<SnapshotShardFailure> failures = new ArrayList<>();
299+
final int failureCount = between(1,totalShards - 1);
300+
for (int i = 0; i < failureCount; i++) {
301+
failures.add(new SnapshotShardFailure("nodeId", new ShardId("index-name", "index-uuid", i), "failed"));
302+
}
303+
assert failureCount == failures.size();
304+
SnapshotInfo snapInfo = new SnapshotInfo(new SnapshotId("snap-fail-" + randomAlphaOfLength(3), "uuid-fail"),
305+
Collections.singletonList("foo-fail"),
306+
startTime,
307+
null,
308+
startTime + between(1, 10000),
233309
totalShards,
234310
failures,
235311
randomBoolean(),
236312
meta);
313+
assertThat(snapInfo.state(), equalTo(SnapshotState.PARTIAL));
314+
return snapInfo;
237315
}
238316
}

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotRetentionTask.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ void getAllRetainableSnapshots(Collection<String> repositories, ActionListener<M
240240
@Override
241241
public void onResponse(final GetSnapshotsResponse resp) {
242242
Map<String, List<SnapshotInfo>> snapshots = new HashMap<>();
243-
final Set<SnapshotState> retainableStates = Set.of(SnapshotState.SUCCESS, SnapshotState.FAILED);
243+
final Set<SnapshotState> retainableStates = Set.of(SnapshotState.SUCCESS, SnapshotState.FAILED, SnapshotState.PARTIAL);
244244
repositories.forEach(repo -> {
245245
snapshots.put(repo,
246246
// Only return snapshots in one of the retainable states (SUCCESS, FAILED or PARTIAL)

0 commit comments

Comments
 (0)