Skip to content

Commit eebda74

Browse files
denis-chudov authored and sergey-chugunov-1985 committed
IGNITE-15227 Better diagnostic information for PDS corruption scenarios. Fixes apache#9292
Signed-off-by: Sergey Chugunov <[email protected]>
1 parent 5b051a2 commit eebda74

File tree

16 files changed

+642
-89
lines changed

16 files changed

+642
-89
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.ignite.internal.processors.cache.persistence;
18+
19+
import java.util.HashSet;
20+
import java.util.Set;
21+
import java.util.stream.Collectors;
22+
import org.apache.ignite.IgniteCheckedException;
23+
import org.apache.ignite.internal.pagemem.PageIdAllocator;
24+
import org.apache.ignite.internal.pagemem.PageIdUtils;
25+
import org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTreeRuntimeException;
26+
import org.apache.ignite.internal.util.typedef.T2;
27+
import org.apache.ignite.internal.util.typedef.X;
28+
import org.jetbrains.annotations.Nullable;
29+
30+
import static java.util.Arrays.asList;
31+
32+
/**
33+
* Abstract exception for exceptions related to persistence corruption.
34+
*/
35+
public abstract class AbstractCorruptedPersistenceException extends IgniteCheckedException implements CorruptedPersistenceException {
36+
/** */
37+
protected final T2<Integer, Long>[] pages;
38+
39+
/**
40+
* @param msg Message.
41+
* @param cause Cause.
42+
* @param pages (groupId, pageId) pairs for pages that might be corrupted.
43+
*/
44+
protected AbstractCorruptedPersistenceException(String msg, @Nullable Throwable cause, T2<Integer, Long>[] pages) {
45+
super(msg, cause);
46+
47+
this.pages = expandPagesArray(pages, cause);
48+
}
49+
50+
/**
51+
* @param grpId Group id.
52+
* @param pageIds Pages ids.
53+
* @return Pairs of (groupId, pageId).
54+
*/
55+
protected static T2<Integer, Long>[] toPagesArray(int grpId, long[] pageIds) {
56+
T2<Integer, Long>[] res = (T2<Integer, Long>[])new T2[pageIds.length];
57+
58+
for (int i = 0; i < pageIds.length; i++)
59+
res[i] = new T2<>(grpId, pageIds[i]);
60+
61+
return res;
62+
}
63+
64+
/**
65+
* Add partition meta pages and related pages.
66+
* @param pages Pages with group ids.
67+
* @param cause Cause exception.
68+
* @return Extended list of pages.
69+
*/
70+
protected T2<Integer, Long>[] expandPagesArray(T2<Integer, Long>[] pages, Throwable cause) {
71+
Set<T2<Integer, Long>> res = new HashSet<>(asList(pages));
72+
73+
BPlusTreeRuntimeException treeRuntimeException = X.cause(cause, BPlusTreeRuntimeException.class);
74+
75+
// Add root exception pages ids if we have.
76+
if (treeRuntimeException != null)
77+
res.addAll(treeRuntimeException.pages());
78+
79+
Set<T2<Integer, Long>> partMetaPages = partitionMetaPages(res);
80+
81+
// Add meta pages for all (group,partition) pairs.
82+
res.addAll(partMetaPages);
83+
84+
return (T2<Integer, Long>[])res.toArray(new T2[0]);
85+
}
86+
87+
/**
88+
* @param pages Pages with group ids.
89+
* @return Partition meta pages with group ids, for given pages.
90+
*/
91+
protected Set<T2<Integer, Long>> partitionMetaPages(Set<T2<Integer, Long>> pages) {
92+
return pages.stream().map(page -> {
93+
int grpId = page.get1();
94+
int partId = PageIdUtils.partId(page.get2());
95+
96+
final long partMetaPageId = PageIdUtils.pageId(partId, PageIdAllocator.FLAG_DATA, 0);
97+
98+
return new T2<>(grpId, partMetaPageId);
99+
}).collect(Collectors.toSet());
100+
}
101+
102+
/** {@inheritDoc} */
103+
@Override public T2<Integer, Long>[] pages() {
104+
return pages;
105+
}
106+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.ignite.internal.processors.cache.persistence;
18+
19+
import org.apache.ignite.internal.util.typedef.T2;
20+
import org.jetbrains.annotations.Nullable;
21+
22+
/**
23+
* Exception to distinguish partition meta page broken invariants.
24+
*/
25+
public class CorruptedPartitionMetaPageException extends AbstractCorruptedPersistenceException {
26+
/** */
27+
private static final long serialVersionUID = 0L;
28+
29+
/**
30+
* @param msg Message.
31+
* @param cause Cause.
32+
* @param grpId Group id.
33+
* @param pages Ids of pages that might be corrupted.
34+
*/
35+
protected CorruptedPartitionMetaPageException(String msg, @Nullable Throwable cause, int grpId, long... pages) {
36+
this(msg, cause, toPagesArray(grpId, pages));
37+
}
38+
39+
/**
40+
* @param msg Message.
41+
* @param cause Cause.
42+
* @param pages (groupId, pageId) pairs for pages that might be corrupted.
43+
*/
44+
protected CorruptedPartitionMetaPageException(String msg,
45+
@Nullable Throwable cause,
46+
T2<Integer, Long>[] pages
47+
) {
48+
super(msg, cause, pages);
49+
}
50+
}

modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/CorruptedPersistenceException.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,14 @@
1717

1818
package org.apache.ignite.internal.processors.cache.persistence;
1919

20+
import org.apache.ignite.internal.util.typedef.T2;
21+
2022
/**
21-
* Marker interface to distinguish exceptions that were caused by broken persistence datastructures invariants.
23+
* Interface to distinguish exceptions that were caused by broken persistence datastructures invariants.
2224
*/
2325
public interface CorruptedPersistenceException {
26+
/**
27+
* @return (groupId, pageId) pairs for pages that might be corrupted.
28+
*/
29+
public T2<Integer, Long>[] pages();
2430
}

modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/GridCacheOffheapManager.java

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.ignite.internal.processors.cache.persistence;
1919

20+
import java.util.ArrayList;
2021
import java.util.Arrays;
2122
import java.util.Collection;
2223
import java.util.Collections;
@@ -41,7 +42,6 @@
4142
import org.apache.ignite.IgniteSystemProperties;
4243
import org.apache.ignite.SystemProperty;
4344
import org.apache.ignite.failure.FailureContext;
44-
import org.apache.ignite.failure.FailureType;
4545
import org.apache.ignite.internal.managers.encryption.GridEncryptionManager;
4646
import org.apache.ignite.internal.managers.encryption.ReencryptStateUtils;
4747
import org.apache.ignite.internal.pagemem.FullPageId;
@@ -127,6 +127,7 @@
127127
import org.apache.ignite.lang.IgniteBiTuple;
128128
import org.jetbrains.annotations.Nullable;
129129

130+
import static org.apache.ignite.failure.FailureType.CRITICAL_ERROR;
130131
import static org.apache.ignite.internal.processors.cache.GridCacheTtlManager.DFLT_UNWIND_THROTTLING_TIMEOUT;
131132
import static org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.EVICTED;
132133
import static org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.MOVING;
@@ -598,6 +599,12 @@ else if (state == MOVING || state == RENTING) {
598599
encryptIdx,
599600
encryptCnt
600601
));
602+
603+
if (changed) {
604+
partStore.saveMetadata(grp.statisticsHolderData());
605+
606+
io.setPartitionMetaStoreReuseListRoot(partMetaPageAddr, partStore.metaPageId());
607+
}
601608
}
602609
finally {
603610
pageMem.writeUnlock(grpId, partMetaId, partMetaPage, null, changed);
@@ -2198,7 +2205,7 @@ private CacheDataStore init0(boolean checkExists) throws IgniteCheckedException
21982205
U.error(log, "Unhandled exception during page store initialization. All further operations will " +
21992206
"be failed and local node will be stopped.", ex);
22002207

2201-
ctx.kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, ex));
2208+
ctx.kernalContext().failure().process(new FailureContext(CRITICAL_ERROR, ex));
22022209

22032210
throw ex;
22042211
}
@@ -2227,6 +2234,7 @@ private CacheDataStore init0(boolean checkExists) throws IgniteCheckedException
22272234
*/
22282235
private Metas getOrAllocatePartitionMetas() throws IgniteCheckedException {
22292236
PageMemoryEx pageMem = (PageMemoryEx)grp.dataRegion().pageMemory();
2237+
22302238
IgniteWriteAheadLogManager wal = grp.shared().wal();
22312239

22322240
int grpId = grp.groupId();
@@ -2300,6 +2308,8 @@ private Metas getOrAllocatePartitionMetas() throws IgniteCheckedException {
23002308
pendingTreeAllocated = true;
23012309
}
23022310

2311+
checkGapsLinkAndPartMetaStorage(io, partMetaId, pageAddr, grpId, partId);
2312+
23032313
if ((partMetaStoreReuseListRoot = io.getPartitionMetaStoreReuseListRoot(pageAddr)) == 0) {
23042314
partMetaStoreReuseListRoot = pageMem.allocatePage(grpId, partId, PageMemory.FLAG_AUX);
23052315

@@ -2351,6 +2361,41 @@ && isWalDeltaRecordNeeded(pageMem, grpId, partMetaId, partMetaPage, wal, null))
23512361
}
23522362
}
23532363

2364+
/**
2365+
* Checks that links to counter data page and partition meta store are both present or both absent in partition.
2366+
*
2367+
* @param io Meta page io.
2368+
* @param pageAddr Meta page address.
2369+
* @param grpId Group id.
2370+
* @param partId Partition id.
2371+
*/
2372+
private void checkGapsLinkAndPartMetaStorage(PagePartitionMetaIOV3 io, long pageId, long pageAddr, int grpId, int partId) {
2373+
if (io.getPartitionMetaStoreReuseListRoot(pageAddr) == 0 && io.getGapsLink(pageAddr) != 0) {
2374+
String msg = "Partition meta page corruption: links to counter data page and partition " +
2375+
"meta store must both be present, or both be absent in partition [" +
2376+
"grpId=" + grpId +
2377+
", partId=" + partId +
2378+
", cntrUpdDataPageId=" + io.getGapsLink(pageAddr) +
2379+
", partitionMetaStoreReuseListRoot=" + io.getPartitionMetaStoreReuseListRoot(pageAddr) +
2380+
']';
2381+
2382+
List<Long> pages = new ArrayList<>();
2383+
2384+
pages.add(pageId);
2385+
2386+
if (io.getPartitionMetaStoreReuseListRoot(pageAddr) != 0)
2387+
pages.add(io.getPartitionMetaStoreReuseListRoot(pageAddr));
2388+
2389+
if (io.getGapsLink(pageAddr) != 0)
2390+
pages.add(io.getGapsLink(pageAddr));
2391+
2392+
CorruptedPartitionMetaPageException e =
2393+
new CorruptedPartitionMetaPageException(msg, null, grpId, pages.stream().mapToLong(l -> l).toArray());
2394+
2395+
grp.shared().kernalContext().failure().process(new FailureContext(CRITICAL_ERROR, e));
2396+
}
2397+
}
2398+
23542399
/** {@inheritDoc} */
23552400
@Override public CacheDataTree tree() {
23562401
return dataTree;

modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/defragmentation/CachePartitionDefragmentationManager.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,9 @@ private void copyCacheMetadata(
808808
long rmvId = oldPartMetaIo.getGlobalRemoveId(oldPartMetaPageAddr);
809809
newPartMetaIo.setGlobalRemoveId(newPartMetaPageAddr, rmvId);
810810

811+
long reuseListRoot = oldPartMetaIo.getPartitionMetaStoreReuseListRoot(oldPartMetaPageAddr);
812+
newPartMetaIo.setPartitionMetaStoreReuseListRoot(newPartMetaPageAddr, reuseListRoot);
813+
811814
// Copy cache sizes for shared cache group.
812815
long oldCountersPageId = oldPartMetaIo.getCountersPageId(oldPartMetaPageAddr);
813816
if (oldCountersPageId != 0L) {
@@ -838,6 +841,10 @@ private void copyCacheMetadata(
838841
partCtx.newCacheDataStore.partStorage().insertDataRow(gapsDataRow, IoStatisticsHolderNoOp.INSTANCE);
839842

840843
newPartMetaIo.setGapsLink(newPartMetaPageAddr, gapsDataRow.link());
844+
845+
newPartMetaIo.setPartitionMetaStoreReuseListRoot(newPartMetaPageAddr,
846+
oldPartMetaIo.getPartitionMetaStoreReuseListRoot(oldPartMetaPageAddr)
847+
);
841848
}
842849

843850
// Encryption stuff.

0 commit comments

Comments
 (0)