Commit 4a4a457
fix(pageserver): frozen->L0 flush failure causes data loss (neondatabase#12043)

This patch is a fixup for:
- neondatabase#6788

Background
----------
PR 6788 added artificial advancement of `disk_consistent_lsn` and `remote_consistent_lsn` for shards that weren't written to while other shards _were_ written to. See that PR's description for more context.

At the time of that PR, Pageserver shards were doing WAL filtering. Nowadays, the WAL filtering happens in Safekeepers, and shards learn about the WAL gaps via `InterpretedWalRecords::next_record_lsn`.

The Bug
-------
The artificial advancement code also runs if the flush failed. So we advance disk_consistent_lsn / remote_consistent_lsn without having added the corresponding L0 to `index_part.json`.

The frozen layer remains in the layer map until detach, so we continue to serve data correctly. We also don't advance the flush loop variable `flushed_to_lsn`, so subsequent flush requests will retry the flush and repair the situation if they succeed.

But if there are no successful retries, the tenant will eventually be detached, and when it is attached somewhere else, the `index_part.json`, and therefore the layer map,
1. ... does not contain the frozen layer that failed to flush, and
2. ... won't re-ingest that WAL either, because walreceiver starts up with the advanced disk_consistent_lsn/remote_consistent_lsn.

The result is that the read path has a gap in the reconstruct data for the keys whose modifications were lost, leading to
a) either a walredo failure,
b) or an incorrect page@lsn image if walredo doesn't error.

The Fix
-------
Only do the artificial advancement if `result.is_ok()`.

Misc
----
As an aside, I took some time to re-review the flush loop and its callers. I found one more error-handling bug, which I filed here:
- neondatabase#12025
1 parent e78d1e2 commit 4a4a457
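
To make the fix concrete, here is a minimal, self-contained Rust sketch of the guarded advancement. The `Lsn`, `FlushError`, and `FlushState` types are simplified stand-ins for illustration, not the pageserver's actual types, and the real flush loop tracks more state than shown here.

// Sketch: the artificial advancement of the consistent LSN must only happen
// when the flush actually succeeded.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Lsn(u64);

#[derive(Debug)]
struct FlushError(String);

struct FlushState {
    shard_count: u8,
    flushed_to_lsn: Lsn,
    disk_consistent_lsn: Lsn,
}

impl FlushState {
    /// After a flush attempt up to `frozen_to_lsn`, decide whether to artificially
    /// advance `disk_consistent_lsn` for a shard that saw no writes in the gap.
    fn maybe_advance(&mut self, frozen_to_lsn: Lsn, result: &Result<(), FlushError>) {
        // The fix: `result.is_ok()` guards the advancement. Before the fix, a failed
        // flush would still advance disk_consistent_lsn past data that never made it
        // into an L0 layer, which is what caused the data loss on re-attach.
        if self.flushed_to_lsn < frozen_to_lsn && self.shard_count > 1 && result.is_ok() {
            self.disk_consistent_lsn = frozen_to_lsn;
        }
    }
}

fn main() {
    let mut state = FlushState {
        shard_count: 4,
        flushed_to_lsn: Lsn(0x20),
        disk_consistent_lsn: Lsn(0x20),
    };

    // A failed flush must leave disk_consistent_lsn where it was.
    state.maybe_advance(Lsn(0x30), &Err(FlushError("failpoint".into())));
    assert_eq!(state.disk_consistent_lsn, Lsn(0x20));

    // A successful flush may advance it.
    state.maybe_advance(Lsn(0x30), &Ok(()));
    assert_eq!(state.disk_consistent_lsn, Lsn(0x30));
}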

File tree: 2 files changed, +83 −3 lines changed


pageserver/src/tenant.rs

Lines changed: 75 additions & 2 deletions
@@ -5832,6 +5832,7 @@ pub(crate) mod harness {
         pub conf: &'static PageServerConf,
         pub tenant_conf: pageserver_api::models::TenantConfig,
         pub tenant_shard_id: TenantShardId,
+        pub shard_identity: ShardIdentity,
         pub generation: Generation,
         pub shard: ShardIndex,
         pub remote_storage: GenericRemoteStorage,
@@ -5899,6 +5900,7 @@ pub(crate) mod harness {
             conf,
             tenant_conf,
             tenant_shard_id,
+            shard_identity,
             generation,
             shard,
             remote_storage,
@@ -5960,8 +5962,7 @@ pub(crate) mod harness {
                 &ShardParameters::default(),
             ))
             .unwrap(),
-            // This is a legacy/test code path: sharding isn't supported here.
-            ShardIdentity::unsharded(),
+            self.shard_identity,
             Some(walredo_mgr),
             self.tenant_shard_id,
             self.remote_storage.clone(),
@@ -6083,6 +6084,7 @@ mod tests {
     use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
     use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
     use utils::id::TenantId;
+    use utils::shard::{ShardCount, ShardNumber};

     use super::*;
     use crate::DEFAULT_PG_VERSION;
@@ -9418,6 +9420,77 @@ mod tests {
         Ok(())
     }

+    #[tokio::test]
+    async fn test_failed_flush_should_not_update_disk_consistent_lsn() -> anyhow::Result<()> {
+        //
+        // Setup
+        //
+        let harness = TenantHarness::create_custom(
+            "test_failed_flush_should_not_upload_disk_consistent_lsn",
+            pageserver_api::models::TenantConfig::default(),
+            TenantId::generate(),
+            ShardIdentity::new(ShardNumber(0), ShardCount(4), ShardStripeSize(128)).unwrap(),
+            Generation::new(1),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let timeline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        assert_eq!(timeline.get_shard_identity().count, ShardCount(4));
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                *TEST_KEY,
+                Lsn(0x20),
+                &Value::Image(test_img("foo at 0x20")),
+                &ctx,
+            )
+            .await?;
+        writer.finish_write(Lsn(0x20));
+        drop(writer);
+        timeline.freeze_and_flush().await.unwrap();
+
+        timeline.remote_client.wait_completion().await.unwrap();
+        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
+        let remote_consistent_lsn = timeline.get_remote_consistent_lsn_projected();
+        assert_eq!(Some(disk_consistent_lsn), remote_consistent_lsn);
+
+        //
+        // Test
+        //
+
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                *TEST_KEY,
+                Lsn(0x30),
+                &Value::Image(test_img("foo at 0x30")),
+                &ctx,
+            )
+            .await?;
+        writer.finish_write(Lsn(0x30));
+        drop(writer);
+
+        fail::cfg(
+            "flush-layer-before-update-remote-consistent-lsn",
+            "return()",
+        )
+        .unwrap();
+
+        let flush_res = timeline.freeze_and_flush().await;
+        // if flush failed, the disk/remote consistent LSN should not be updated
+        assert!(flush_res.is_err());
+        assert_eq!(disk_consistent_lsn, timeline.get_disk_consistent_lsn());
+        assert_eq!(
+            remote_consistent_lsn,
+            timeline.get_remote_consistent_lsn_projected()
+        );
+
+        Ok(())
+    }
+
     #[cfg(feature = "testing")]
     #[tokio::test]
     async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {

pageserver/src/tenant/timeline.rs

Lines changed: 8 additions & 1 deletion
@@ -4767,7 +4767,10 @@ impl Timeline {
                     || !flushed_to_lsn.is_valid()
             );

-            if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
+            if flushed_to_lsn < frozen_to_lsn
+                && self.shard_identity.count.count() > 1
+                && result.is_ok()
+            {
                 // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
                 // to us via layer_flush_start_rx, then advance it here.
                 //
@@ -4946,6 +4949,10 @@ impl Timeline {
            return Err(FlushLayerError::Cancelled);
        }

+        fail_point!("flush-layer-before-update-remote-consistent-lsn", |_| {
+            Err(FlushLayerError::Other(anyhow!("failpoint").into()))
+        });
+
        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);

        // The new on-disk layers are now in the layer map. We can remove the
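
To see how the test forces the failure, here is a minimal, self-contained sketch of the failpoint pattern used above, assuming the `fail` crate (the source of the `fail::cfg` / `fail_point!` calls in the diff) with its `failpoints` feature enabled. The `flush_step` function is a hypothetical stand-in, not pageserver code.

use fail::fail_point;

// A stand-in for the flush step that hosts the failpoint.
fn flush_step() -> Result<(), String> {
    // When the failpoint is armed with a "return" action, this closure runs and
    // its value is returned from `flush_step` early, simulating a flush failure.
    fail_point!("flush-layer-before-update-remote-consistent-lsn", |_| {
        Err("failpoint".to_string())
    });
    Ok(())
}

fn main() {
    // Not armed: the failpoint is a no-op and the flush "succeeds".
    assert!(flush_step().is_ok());

    // Arm it, as the test does with fail::cfg(..., "return()"): the flush now fails.
    fail::cfg("flush-layer-before-update-remote-consistent-lsn", "return()").unwrap();
    assert!(flush_step().is_err());

    // Disarm so later calls succeed again.
    fail::remove("flush-layer-before-update-remote-consistent-lsn");
    assert!(flush_step().is_ok());
}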
