Skip to content

Commit 67ddf1d

Browse files
authored
feat(pageserver): create image layers at L0-L1 boundary (neondatabase#12023)
## Problem

Previous attempt neondatabase#10548 caused some issues in staging and we reverted it. This is a re-attempt to address neondatabase#11063.

Currently we create image layers at the latest record LSN. We would create "future image layers" (i.e., image layers with an LSN larger than the disk consistent LSN) that need special handling at startup. We also waste a lot of read operations reconstructing from L0 layers, when we could have compacted all of the L0 layers and operated on a flat level of historic layers.

## Summary of changes

* Run repartition at the L0-L1 boundary.
* Roll out with feature flags.
* Piggyback a change that downgrades "image layer creating below gc_cutoff" to debug level.

---------

Signed-off-by: Alex Chi Z <[email protected]>
1 parent 541fcd8 commit 67ddf1d

File tree

4 files changed

+90
-15
lines changed

4 files changed

+90
-15
lines changed

pageserver/src/tenant.rs

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5315,6 +5315,7 @@ impl TenantShard {
53155315
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
53165316
l0_flush_global_state: self.l0_flush_global_state.clone(),
53175317
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
5318+
feature_resolver: self.feature_resolver.clone(),
53185319
}
53195320
}
53205321

@@ -8359,10 +8360,24 @@ mod tests {
83598360
}
83608361

83618362
tline.freeze_and_flush().await?;
8363+
// Force layers to L1
8364+
tline
8365+
.compact(
8366+
&cancel,
8367+
{
8368+
let mut flags = EnumSet::new();
8369+
flags.insert(CompactFlags::ForceL0Compaction);
8370+
flags
8371+
},
8372+
&ctx,
8373+
)
8374+
.await?;
83628375

83638376
if iter % 5 == 0 {
8377+
let scan_lsn = Lsn(lsn.0 + 1);
8378+
info!("scanning at {}", scan_lsn);
83648379
let (_, before_delta_file_accessed) =
8365-
scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
8380+
scan_with_statistics(&tline, &keyspace, scan_lsn, &ctx, io_concurrency.clone())
83668381
.await?;
83678382
tline
83688383
.compact(
@@ -8371,13 +8386,14 @@ mod tests {
83718386
let mut flags = EnumSet::new();
83728387
flags.insert(CompactFlags::ForceImageLayerCreation);
83738388
flags.insert(CompactFlags::ForceRepartition);
8389+
flags.insert(CompactFlags::ForceL0Compaction);
83748390
flags
83758391
},
83768392
&ctx,
83778393
)
83788394
.await?;
83798395
let (_, after_delta_file_accessed) =
8380-
scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
8396+
scan_with_statistics(&tline, &keyspace, scan_lsn, &ctx, io_concurrency.clone())
83818397
.await?;
83828398
assert!(
83838399
after_delta_file_accessed < before_delta_file_accessed,
@@ -8818,6 +8834,8 @@ mod tests {
88188834

88198835
let cancel = CancellationToken::new();
88208836

8837+
// Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
8838+
tline.force_set_disk_consistent_lsn(Lsn(0x40));
88218839
tline
88228840
.compact(
88238841
&cancel,
@@ -8831,8 +8849,7 @@ mod tests {
88318849
)
88328850
.await
88338851
.unwrap();
8834-
8835-
// Image layers are created at last_record_lsn
8852+
// Image layers are created at repartition LSN
88368853
let images = tline
88378854
.inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
88388855
.await

pageserver/src/tenant/timeline.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ use crate::context::{
103103
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
104104
};
105105
use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
106+
use crate::feature_resolver::FeatureResolver;
106107
use crate::keyspace::{KeyPartitioning, KeySpace};
107108
use crate::l0_flush::{self, L0FlushGlobalState};
108109
use crate::metrics::{
@@ -198,6 +199,7 @@ pub struct TimelineResources {
198199
pub l0_compaction_trigger: Arc<Notify>,
199200
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
200201
pub basebackup_prepare_sender: BasebackupPrepareSender,
202+
pub feature_resolver: FeatureResolver,
201203
}
202204

203205
pub struct Timeline {
@@ -444,6 +446,8 @@ pub struct Timeline {
444446

445447
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
446448
basebackup_prepare_sender: BasebackupPrepareSender,
449+
450+
feature_resolver: FeatureResolver,
447451
}
448452

449453
pub(crate) enum PreviousHeatmap {
@@ -3072,6 +3076,8 @@ impl Timeline {
30723076
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
30733077

30743078
basebackup_prepare_sender: resources.basebackup_prepare_sender,
3079+
3080+
feature_resolver: resources.feature_resolver,
30753081
};
30763082

30773083
result.repartition_threshold =
@@ -4906,6 +4912,7 @@ impl Timeline {
49064912
LastImageLayerCreationStatus::Initial,
49074913
false, // don't yield for L0, we're flushing L0
49084914
)
4915+
.instrument(info_span!("create_image_layers", mode = %ImageLayerCreationMode::Initial, partition_mode = "initial", lsn = %self.initdb_lsn))
49094916
.await?;
49104917
debug_assert!(
49114918
matches!(is_complete, LastImageLayerCreationStatus::Complete),
@@ -5462,7 +5469,8 @@ impl Timeline {
54625469

54635470
/// Returns the image layers generated and an enum indicating whether the process is fully completed.
54645471
/// true = we have generated all image layers, false = we preempted the process for L0 compaction.
5465-
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
5472+
///
5473+
/// `partition_mode` is only for logging purposes and is not used anywhere in this function.
54665474
async fn create_image_layers(
54675475
self: &Arc<Timeline>,
54685476
partitioning: &KeyPartitioning,

pageserver/src/tenant/timeline/compaction.rs

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,11 +1278,55 @@ impl Timeline {
12781278
}
12791279

12801280
let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
1281+
let l0_l1_boundary_lsn = {
1282+
// We do the repartition on the L0-L1 boundary. All data below the boundary
1283+
// are compacted by L0 with low read amplification, thus making the `repartition`
1284+
// function run fast.
1285+
let guard = self.layers.read().await;
1286+
guard
1287+
.all_persistent_layers()
1288+
.iter()
1289+
.map(|x| {
1290+
// Use the end LSN of delta layers OR the start LSN of image layers.
1291+
if x.is_delta {
1292+
x.lsn_range.end
1293+
} else {
1294+
x.lsn_range.start
1295+
}
1296+
})
1297+
.max()
1298+
};
1299+
1300+
let (partition_mode, partition_lsn) = if cfg!(test)
1301+
|| cfg!(feature = "testing")
1302+
|| self
1303+
.feature_resolver
1304+
.evaluate_boolean("image-compaction-boundary", self.tenant_shard_id.tenant_id)
1305+
.is_ok()
1306+
{
1307+
let last_repartition_lsn = self.partitioning.read().1;
1308+
let lsn = match l0_l1_boundary_lsn {
1309+
Some(boundary) => gc_cutoff
1310+
.max(boundary)
1311+
.max(last_repartition_lsn)
1312+
.max(self.initdb_lsn)
1313+
.max(self.ancestor_lsn),
1314+
None => self.get_last_record_lsn(),
1315+
};
1316+
if lsn <= self.initdb_lsn || lsn <= self.ancestor_lsn {
1317+
// Do not attempt to create image layers below the initdb or ancestor LSN -- no data below it
1318+
("l0_l1_boundary", self.get_last_record_lsn())
1319+
} else {
1320+
("l0_l1_boundary", lsn)
1321+
}
1322+
} else {
1323+
("latest_record", self.get_last_record_lsn())
1324+
};
12811325

12821326
// 2. Repartition and create image layers if necessary
12831327
match self
12841328
.repartition(
1285-
self.get_last_record_lsn(),
1329+
partition_lsn,
12861330
self.get_compaction_target_size(),
12871331
options.flags,
12881332
ctx,
@@ -1301,25 +1345,27 @@ impl Timeline {
13011345
.extend(sparse_partitioning.into_dense().parts);
13021346

13031347
// 3. Create new image layers for partitions that have been modified "enough".
1348+
let mode = if options
1349+
.flags
1350+
.contains(CompactFlags::ForceImageLayerCreation)
1351+
{
1352+
ImageLayerCreationMode::Force
1353+
} else {
1354+
ImageLayerCreationMode::Try
1355+
};
13041356
let (image_layers, outcome) = self
13051357
.create_image_layers(
13061358
&partitioning,
13071359
lsn,
1308-
if options
1309-
.flags
1310-
.contains(CompactFlags::ForceImageLayerCreation)
1311-
{
1312-
ImageLayerCreationMode::Force
1313-
} else {
1314-
ImageLayerCreationMode::Try
1315-
},
1360+
mode,
13161361
&image_ctx,
13171362
self.last_image_layer_creation_status
13181363
.load()
13191364
.as_ref()
13201365
.clone(),
13211366
options.flags.contains(CompactFlags::YieldForL0),
13221367
)
1368+
.instrument(info_span!("create_image_layers", mode = %mode, partition_mode = %partition_mode, lsn = %lsn))
13231369
.await
13241370
.inspect_err(|err| {
13251371
if let CreateImageLayersError::GetVectoredError(
@@ -1344,7 +1390,8 @@ impl Timeline {
13441390
}
13451391

13461392
Ok(_) => {
1347-
info!("skipping repartitioning due to image compaction LSN being below GC cutoff");
1393+
// This happens very frequently so we don't want to log it.
1394+
debug!("skipping repartitioning due to image compaction LSN being below GC cutoff");
13481395
}
13491396

13501397
// Suppress errors when cancelled.

test_runner/regress/test_layers_from_future.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
from fixtures.utils import query_scalar, wait_until
2121

2222

23+
@pytest.mark.skip(
24+
reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548"
25+
)
2326
@pytest.mark.parametrize(
2427
"attach_mode",
2528
["default_generation", "same_generation"],

0 commit comments

Comments
 (0)