#endif
/*
- * Deduplicate items on a leaf page. The page will have to be split by caller
- * if we cannot successfully free at least newitemsz (we also need space for
- * newitem's line pointer, which isn't included in caller's newitemsz).
+ * Perform a deduplication pass.
*
* The general approach taken here is to perform as much deduplication as
* possible to free as much space as possible. Note, however, that "single
* handle those if and when the anticipated right half page gets its own
* deduplication pass, following further inserts of duplicates.)
*
- * This function should be called during insertion, when the page doesn't have
- * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag
- * was set, caller should have removed any LP_DEAD items by calling
- * _bt_vacuum_one_page() before calling here. We may still have to kill
- * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
- * but that should be rare. Also, _bt_vacuum_one_page() won't unset the
- * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
- * deduplication pass will always clear it, just to keep things tidy.
+ * The page will have to be split if we cannot successfully free at least
+ * newitemsz (we also need space for newitem's line pointer, which isn't
+ * included in caller's newitemsz).
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
*/
void
-_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
- IndexTuple newitem, Size newitemsz, bool checkingunique)
+_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
+ Size newitemsz, bool checkingunique)
{
OffsetNumber offnum,
minoff,
maxoff;
Page page = BufferGetPage(buf);
- BTPageOpaque opaque;
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Page newpage;
- OffsetNumber deletable[MaxIndexTuplesPerPage];
BTDedupState state;
- int ndeletable = 0;
Size pagesaving = 0;
bool singlevalstrat = false;
int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
- /*
- * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
- * will clear the BTP_HAS_GARBAGE hint without reliably removing items
- * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
- * bits when deduplicating items. Allowing it would be correct, though
- * wasteful.
- */
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- minoff = P_FIRSTDATAKEY(opaque);
- maxoff = PageGetMaxOffsetNumber(page);
- for (offnum = minoff;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum))
- {
- ItemId itemid = PageGetItemId(page, offnum);
-
- if (ItemIdIsDead(itemid))
- deletable[ndeletable++] = offnum;
- }
-
- if (ndeletable > 0)
- {
- _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
-
- /*
- * Return when a split will be avoided. This is equivalent to
- * avoiding a split using the usual _bt_vacuum_one_page() path.
- */
- if (PageGetFreeSpace(page) >= newitemsz)
- return;
-
- /*
- * Reconsider number of items on page, in case _bt_delitems_delete()
- * managed to delete an item or two
- */
- minoff = P_FIRSTDATAKEY(opaque);
- maxoff = PageGetMaxOffsetNumber(page);
- }
-
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
newitemsz += sizeof(ItemIdData);
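+
+ /*
+ * For example, a 24 byte (already MAXALIGNED) newitem only fits if the
+ * pass frees at least 24 + sizeof(ItemIdData) = 28 bytes, since adding
+ * the tuple to the page also consumes a 4 byte line pointer.
+ */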
/*
- * By here, it's clear that deduplication will definitely be attempted.
* Initialize deduplication state.
*
* It would be possible for maxpostingsize (limit on posting list tuple
/* nintervals should be initialized to zero */
state->nintervals = 0;
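+
+ /* Caller has already deleted any LP_DEAD items (see header comments) */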
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
/* Determine if "single value" strategy should be used */
if (!checkingunique)
singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
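+
+ /*
+ * (Illustrative aside: the "single value" strategy anticipates that a
+ * page filled with duplicates of a single value will eventually be
+ * split. Roughly speaking, it lowers maxpostingsize for the last few
+ * posting list tuples formed, so that the eventual split can still
+ * leave behind some free space, much like a leaf fillfactor would.
+ * See _bt_do_singleval and _bt_singleval_fillfactor.)
+ */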
/*
* By here, it's clear that deduplication will definitely go ahead.
*
- * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
- * still falsely set, just to keep things tidy. (We can't rely on
- * _bt_vacuum_one_page() having done this already, and we can't rely on a
- * page split or VACUUM getting to it in the near future.)
+ * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
+ * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
+ * But clear the flag anyway, just to keep things tidy.
*/
if (P_HAS_GARBAGE(opaque))
{
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off, bool newfirstdataitem);
-static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
+static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool lpdeadonly, bool checkingunique,
+ bool uniquedup);
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
}
/*
- * If the target page is full, see if we can obtain enough space by
- * erasing LP_DEAD items. If that fails to free enough space, see if
- * we can avoid a page split by performing a deduplication pass over
- * the page.
- *
- * We only perform a deduplication pass for a checkingunique caller
- * when the incoming item is a duplicate of an existing item on the
- * leaf page. This heuristic avoids wasting cycles -- we only expect
- * to benefit from deduplicating a unique index page when most or all
- * recently added items are duplicates. See nbtree/README.
+ * If the target page is full, see if we can obtain enough space using
+ * one or more strategies (e.g. erasing LP_DEAD items, deduplication).
+ * Page splits are expensive, and should only go ahead when truly
+ * necessary.
*/
if (PageGetFreeSpace(page) < insertstate->itemsz)
- {
- if (P_HAS_GARBAGE(opaque))
- {
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
- insertstate->bounds_valid = false;
-
- /* Might as well assume duplicates (if checkingunique) */
- uniquedup = true;
- }
-
- if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
- (!checkingunique || uniquedup) &&
- PageGetFreeSpace(page) < insertstate->itemsz)
- {
- _bt_dedup_one_page(rel, insertstate->buf, heapRel,
- insertstate->itup, insertstate->itemsz,
- checkingunique);
- insertstate->bounds_valid = false;
- }
- }
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
+ checkingunique, uniquedup);
}
else
{
*/
if (P_HAS_GARBAGE(opaque))
{
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
- insertstate->bounds_valid = false;
+ /* Erase LP_DEAD items (won't deduplicate) */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ checkingunique, false);
if (PageGetFreeSpace(page) >= insertstate->itemsz)
break; /* OK, now we have enough space */
* performing a posting list split, so delete all LP_DEAD items early.
* This is the only case where LP_DEAD deletes happen even though
* there is space for newitem on the page.
+ *
+ * This can only erase LP_DEAD items (it won't deduplicate).
*/
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ checkingunique, false);
/*
* Do new binary search. New insert location cannot overlap with any
* posting list now.
*/
- insertstate->bounds_valid = false;
+ Assert(!insertstate->bounds_valid);
insertstate->postingoff = 0;
newitemoff = _bt_binsrch_insert(rel, insertstate);
Assert(insertstate->postingoff == 0);
}
/*
- * _bt_vacuum_one_page - vacuum just one index page.
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split by attempting
+ * a variety of operations.
+ *
+ * There are two operations performed here: deleting items already marked
+ * LP_DEAD, and deduplication. If both operations fail to free enough space
+ * for the incoming item then caller will go on to split the page. We always
+ * attempt our preferred strategy (deleting items whose LP_DEAD bits are
+ * set) first. If that doesn't free enough space we move on to deduplication.
+ *
+ * Caller's checkingunique and uniquedup arguments help us decide if we should
+ * perform deduplication, which is primarily useful with low cardinality data,
+ * but can sometimes absorb version churn.
+ *
+ * Callers that only want us to look for/delete LP_DEAD items can ask for
+ * that directly by passing a true 'lpdeadonly' argument.
*
- * Try to remove LP_DEAD items from the given page. The passed buffer
- * must be exclusive-locked, but unlike a real VACUUM, we don't need a
- * super-exclusive "cleanup" lock (see nbtree/README).
+ * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
+ * level flag was found set. The flag was useful back when there wasn't
+ * necessarily one single page for a duplicate tuple to go on (before heap TID
+ * became a part of the key space in version 4 indexes). But we don't
+ * actually look at the flag anymore (it's not a gating condition for our
+ * caller); using it as one would cause us to miss tuples that are safe to delete,
+ * without getting any benefit in return. We know that the alternative is to
+ * split the page; scanning the line pointer array in passing won't have
+ * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
+ * all this because !heapkeyspace indexes must still do a "getting tired"
+ * linear search, and so are likely to get some benefit from using it as a
+ * gating condition.)
*/
static void
-_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
+_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool lpdeadonly, bool checkingunique,
+ bool uniquedup)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
OffsetNumber offnum,
- minoff,
maxoff;
+ Buffer buffer = insertstate->buf;
+ BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(buffer);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
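+
+ /*
+ * Deduplication is only possible in heapkeyspace (version 4+) indexes,
+ * so a caller that hasn't limited us to LP_DEAD deletion must have a
+ * heapkeyspace index
+ */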
+ Assert(lpdeadonly || itup_key->heapkeyspace);
/*
* Scan over all items to see which ones need to be deleted according to
* LP_DEAD flags.
*/
- minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
- for (offnum = minoff;
+ for (offnum = P_FIRSTDATAKEY(opaque);
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
}
if (ndeletable > 0)
+ {
_bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
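+
+ /* Deleting items invalidates insertstate's cached binary search bounds */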
+ insertstate->bounds_valid = false;
+
+ /* Return when a page split has already been avoided */
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ return;
+
+ /* Might as well assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+
+ /*
+ * Some callers only want to delete LP_DEAD items. Return early for these
+ * callers.
+ *
+ * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
+ * return at this point (or when we go on to attempt deduplication and it
+ * also fails to free enough space). We do not bother expending a
+ * separate write to clear it, however. Caller will definitely clear it
+ * when it goes on to split the page (plus deduplication knows to clear
+ * the flag when it actually modifies the page).
+ */
+ if (lpdeadonly)
+ return;
+
+ /*
+ * We can get called in the checkingunique case when there is no reason to
+ * believe that there are any duplicates on the page; we should at least
+ * still check for LP_DEAD items, which has been done by now. If that
+ * didn't free enough space, give up and let caller split the page.
+ * Deduplication cannot be justified when there is no reason to think
+ * that there are duplicates.
+ */
+ if (checkingunique && !uniquedup)
+ return;
+
+ /* Assume bounds about to be invalidated (this is almost certain now) */
+ insertstate->bounds_valid = false;
/*
- * Note: if we didn't find any LP_DEAD items, then the page's
- * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
- * separate write to clear it, however. We will clear it when we split
- * the page, or when deduplication runs.
+ * Perform deduplication pass, though only when it is enabled for the
+ * index and known to be safe (it must be an allequalimage index).
*/
+ if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
+ insertstate->itemsz, checkingunique);
}
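+
+/*
+ * To summarize the above: the strategies are attempted in a fixed order,
+ * cheapest first, and we return as soon as enough space has been freed:
+ *
+ * 1. Delete any items already marked LP_DEAD.
+ * 2. Stop here if caller passed lpdeadonly=true, or if a checkingunique
+ *    caller gave us no reason to expect duplicates (uniquedup=false).
+ * 3. Perform a deduplication pass, provided deduplication is enabled for
+ *    the index and the index is allequalimage.
+ *
+ * If all applicable strategies fail to free enough space, caller splits
+ * the page.
+ */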
* array of offset numbers.
*
* PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
- * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
- * level flag, unsetting individual LP_DEAD bits should still be avoided.
+ * happens to already be set. It's important that we not interfere with
+ * _bt_delitems_delete().
*/
for (int i = 0; i < nupdatable; i++)
{
opaque->btpo_cycleid = 0;
/*
- * Mark the page as not containing any LP_DEAD items. This is not
- * certainly true (there might be some that have recently been marked, but
- * weren't targeted by VACUUM's heap scan), but it will be true often
- * enough. VACUUM does not delete items purely because they have their
- * LP_DEAD bit set, since doing so would necessitate explicitly logging a
- * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
+ * Clear the BTP_HAS_GARBAGE page flag.
*
- * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
- * limited, since we never falsely unset an LP_DEAD bit. Workloads that
- * are particularly dependent on LP_DEAD bits being set quickly will
- * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
- * again anyway. Furthermore, attempting a deduplication pass will remove
- * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
- * is set or not.
+ * This flag indicates the presence of LP_DEAD items on the page (though
+ * not reliably). Note that we only trust it with pg_upgrade'd
+ * !heapkeyspace indexes. That's why clearing it here won't usually
+ * interfere with _bt_delitems_delete().
*/
opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
/*
* Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
- * because this is not called by VACUUM. Just clear the BTP_HAS_GARBAGE
- * page flag, since we deleted all items with their LP_DEAD bit set.
+ * because this is not called by VACUUM.
*/
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Clear the BTP_HAS_GARBAGE page flag.
+ *
+ * This flag indicates the presence of LP_DEAD items on the page (though
+ * not reliably). Note that we only trust it with pg_upgrade'd
+ * !heapkeyspace indexes.
+ */
opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
MarkBufferDirty(buf);