Skip to content

Commit 831f2a4

Browse files
authored
Fix flakiness of test_storcon_create_delete_sk_down (neondatabase#12040)
The `test_storcon_create_delete_sk_down` test is still flaky. This patch addresses two possible causes for flakiness. Both causes are related to deletion racing with `pull_timeline` which hasn't finished yet. * the first cause is timeline deletion racing with `pull_timeline`: * the first deletion attempt doesn't contain the line because the timeline doesn't exist yet * the subsequent deletion attempts don't contain it either, only a note that the timeline is already deleted. * so this patch adds the note that the timeline is already deleted to the regex * the second cause is about tenant deletion racing with `pull_timeline`: * there were no tenant-specific tombstones, so if a tenant was deleted, we only added tombstones for the specific timelines being deleted, not for the tenant itself. * This patch changes this, so we now have tenant-specific tombstones as well as timeline-specific ones, and creation of a timeline checks both. * we also don't see any retries of the tenant deletion in the logs. Once it's done, it's done. So extend the regex to contain the tenant deletion message as well. One could wonder why the regex and why not use the API to check whether the timeline is just "gone". The issue with the API is that it doesn't allow one to distinguish between "deleted" and "has never existed", and the latter case might race with `pull_timeline`. I.e. the second cause of flakiness helped in the discovery of a real bug (no tenant tombstones), so the more precise check was helpful. Before, I could easily reproduce 2-9 occurrences of flakiness when running the test with an additional `range(128)` parameter (i.e. 128 times 4 times). With this patch, I ran it three times, not a single failure. Fixes neondatabase#11838
1 parent eadabed commit 831f2a4

File tree

2 files changed

+32
-6
lines changed

2 files changed

+32
-6
lines changed

safekeeper/src/timelines_global_map.rs

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ struct GlobalTimelinesState {
4444
// on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as
4545
// this map is dropped on restart.
4646
tombstones: HashMap<TenantTimelineId, Instant>,
47+
tenant_tombstones: HashMap<TenantId, Instant>,
4748

4849
conf: Arc<SafeKeeperConf>,
4950
broker_active_set: Arc<TimelinesSet>,
@@ -81,10 +82,25 @@ impl GlobalTimelinesState {
8182
}
8283
}
8384

85+
fn has_tombstone(&self, ttid: &TenantTimelineId) -> bool {
86+
self.tombstones.contains_key(ttid) || self.tenant_tombstones.contains_key(&ttid.tenant_id)
87+
}
88+
89+
/// Removes all blocking tombstones for the given timeline ID.
90+
/// Returns `true` if there have been actual changes.
91+
fn remove_tombstone(&mut self, ttid: &TenantTimelineId) -> bool {
92+
self.tombstones.remove(ttid).is_some()
93+
|| self.tenant_tombstones.remove(&ttid.tenant_id).is_some()
94+
}
95+
8496
fn delete(&mut self, ttid: TenantTimelineId) {
8597
self.timelines.remove(&ttid);
8698
self.tombstones.insert(ttid, Instant::now());
8799
}
100+
101+
fn add_tenant_tombstone(&mut self, tenant_id: TenantId) {
102+
self.tenant_tombstones.insert(tenant_id, Instant::now());
103+
}
88104
}
89105

90106
/// A struct used to manage access to the global timelines map.
@@ -99,6 +115,7 @@ impl GlobalTimelines {
99115
state: Mutex::new(GlobalTimelinesState {
100116
timelines: HashMap::new(),
101117
tombstones: HashMap::new(),
118+
tenant_tombstones: HashMap::new(),
102119
conf,
103120
broker_active_set: Arc::new(TimelinesSet::default()),
104121
global_rate_limiter: RateLimiter::new(1, 1),
@@ -245,7 +262,7 @@ impl GlobalTimelines {
245262
return Ok(timeline);
246263
}
247264

248-
if state.tombstones.contains_key(&ttid) {
265+
if state.has_tombstone(&ttid) {
249266
anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate");
250267
}
251268

@@ -295,13 +312,14 @@ impl GlobalTimelines {
295312
_ => {}
296313
}
297314
if check_tombstone {
298-
if state.tombstones.contains_key(&ttid) {
315+
if state.has_tombstone(&ttid) {
299316
anyhow::bail!("timeline {ttid} is deleted, refusing to recreate");
300317
}
301318
} else {
302319
// We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust
303320
// that the human doing this manual intervention knows what they are doing, and remove its tombstone.
304-
if state.tombstones.remove(&ttid).is_some() {
321+
// It's also possible that we enter this when the tenant has been deleted, even if the timeline itself has never existed.
322+
if state.remove_tombstone(&ttid) {
305323
warn!("un-deleted timeline {ttid}");
306324
}
307325
}
@@ -482,6 +500,7 @@ impl GlobalTimelines {
482500
let tli_res = {
483501
let state = self.state.lock().unwrap();
484502

503+
// Do NOT check tenant tombstones here: those were set earlier
485504
if state.tombstones.contains_key(ttid) {
486505
// Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
487506
info!("Timeline {ttid} was already deleted");
@@ -557,6 +576,10 @@ impl GlobalTimelines {
557576
action: DeleteOrExclude,
558577
) -> Result<HashMap<TenantTimelineId, TimelineDeleteResult>> {
559578
info!("deleting all timelines for tenant {}", tenant_id);
579+
580+
// Adding a tombstone before getting the timelines to prevent new timeline additions
581+
self.state.lock().unwrap().add_tenant_tombstone(*tenant_id);
582+
560583
let to_delete = self.get_all_for_tenant(*tenant_id);
561584

562585
let mut err = None;
@@ -600,6 +623,9 @@ impl GlobalTimelines {
600623
state
601624
.tombstones
602625
.retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
626+
state
627+
.tenant_tombstones
628+
.retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
603629
}
604630
}
605631

test_runner/regress/test_storage_controller.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4192,10 +4192,10 @@ def logged_contains_on_sk():
41924192
# ensure the safekeeper deleted the timeline
41934193
def timeline_deleted_on_active_sks():
41944194
env.safekeepers[0].assert_log_contains(
4195-
f"deleting timeline {tenant_id}/{child_timeline_id} from disk"
4195+
f"((deleting timeline|Timeline) {tenant_id}/{child_timeline_id} (from disk|was already deleted)|DELETE.*tenant/{tenant_id} .*status: 200 OK)"
41964196
)
41974197
env.safekeepers[2].assert_log_contains(
4198-
f"deleting timeline {tenant_id}/{child_timeline_id} from disk"
4198+
f"((deleting timeline|Timeline) {tenant_id}/{child_timeline_id} (from disk|was already deleted)|DELETE.*tenant/{tenant_id} .*status: 200 OK)"
41994199
)
42004200

42014201
wait_until(timeline_deleted_on_active_sks)
@@ -4210,7 +4210,7 @@ def timeline_deleted_on_active_sks():
42104210
# ensure that there is log msgs for the third safekeeper too
42114211
def timeline_deleted_on_sk():
42124212
env.safekeepers[1].assert_log_contains(
4213-
f"deleting timeline {tenant_id}/{child_timeline_id} from disk"
4213+
f"((deleting timeline|Timeline) {tenant_id}/{child_timeline_id} (from disk|was already deleted)|DELETE.*tenant/{tenant_id} .*status: 200 OK)"
42144214
)
42154215

42164216
wait_until(timeline_deleted_on_sk)

0 commit comments

Comments
 (0)