Skip to content

Commit 3edb8ad

Browse files
committed
HDFS-5840. Follow-up to HDFS-5138 to improve error handling during partial upgrade failures. Contributed by Aaron T. Myers, Suresh Srinivas, and Jing Zhao.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1581260 13f79535-47bb-0310-9956-ffa450edef68
1 parent a9b9bea commit 3edb8ad

File tree

12 files changed

+154
-49
lines changed

12 files changed

+154
-49
lines changed

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,9 @@ Release 2.4.0 - UNRELEASED
687687

688688
HDFS-5846. Assigning DEFAULT_RACK in resolveNetworkLocation method can break
689689
data resiliency. (Nikola Vujic via cnauroth)
690+
691+
HDFS-5840. Follow-up to HDFS-5138 to improve error handling during partial
692+
upgrade failures. (atm, jing9 and suresh via jing9)
690693

691694
BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
692695

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,23 +1680,19 @@ public static HttpServer2.Builder httpServerTemplateForNNAndJN(
16801680
*
16811681
* @param objects the collection of objects to check for equality.
16821682
*/
1683-
public static void assertAllResultsEqual(Collection<?> objects) {
1684-
Object[] resultsArray = objects.toArray();
1685-
1686-
if (resultsArray.length == 0)
1683+
public static void assertAllResultsEqual(Collection<?> objects)
1684+
throws AssertionError {
1685+
if (objects.size() == 0 || objects.size() == 1)
16871686
return;
16881687

1689-
for (int i = 0; i < resultsArray.length; i++) {
1690-
if (i == 0)
1691-
continue;
1692-
else {
1693-
Object currElement = resultsArray[i];
1694-
Object lastElement = resultsArray[i - 1];
1695-
if ((currElement == null && currElement != lastElement) ||
1696-
(currElement != null && !currElement.equals(lastElement))) {
1697-
throw new AssertionError("Not all elements match in results: " +
1698-
Arrays.toString(resultsArray));
1699-
}
1688+
Object[] resultsArray = objects.toArray();
1689+
for (int i = 1; i < resultsArray.length; i++) {
1690+
Object currElement = resultsArray[i];
1691+
Object lastElement = resultsArray[i - 1];
1692+
if ((currElement == null && currElement != lastElement) ||
1693+
(currElement != null && !currElement.equals(lastElement))) {
1694+
throw new AssertionError("Not all elements match in results: " +
1695+
Arrays.toString(resultsArray));
17001696
}
17011697
}
17021698
}

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumJournalManager.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,11 @@ public boolean canRollBack(StorageInfo storage, StorageInfo prevStorage,
571571

572572
// Either they all return the same thing or this call fails, so we can
573573
// just return the first result.
574-
DFSUtil.assertAllResultsEqual(call.getResults().values());
574+
try {
575+
DFSUtil.assertAllResultsEqual(call.getResults().values());
576+
} catch (AssertionError ae) {
577+
throw new IOException("Results differed for canRollBack", ae);
578+
}
575579
for (Boolean result : call.getResults().values()) {
576580
return result;
577581
}
@@ -636,7 +640,11 @@ public long getJournalCTime() throws IOException {
636640

637641
// Either they all return the same thing or this call fails, so we can
638642
// just return the first result.
639-
DFSUtil.assertAllResultsEqual(call.getResults().values());
643+
try {
644+
DFSUtil.assertAllResultsEqual(call.getResults().values());
645+
} catch (AssertionError ae) {
646+
throw new IOException("Results differed for getJournalCTime", ae);
647+
}
640648
for (Long result : call.getResults().values()) {
641649
return result;
642650
}

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JNStorage.java

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,15 @@ class JNStorage extends Storage {
6565
* @param errorReporter a callback to report errors
6666
* @throws IOException
6767
*/
68-
protected JNStorage(Configuration conf, File logDir,
68+
protected JNStorage(Configuration conf, File logDir, StartupOption startOpt,
6969
StorageErrorReporter errorReporter) throws IOException {
7070
super(NodeType.JOURNAL_NODE);
7171

7272
sd = new StorageDirectory(logDir);
7373
this.addStorageDir(sd);
7474
this.fjm = new FileJournalManager(conf, sd, errorReporter);
75-
76-
analyzeStorage();
75+
76+
analyzeAndRecoverStorage(startOpt);
7777
}
7878

7979
FileJournalManager getJournalManager() {
@@ -216,6 +216,18 @@ protected void setLayoutVersion(Properties props, StorageDirectory sd)
216216
layoutVersion = lv;
217217
}
218218

219+
void analyzeAndRecoverStorage(StartupOption startOpt) throws IOException {
220+
this.state = sd.analyzeStorage(startOpt, this);
221+
final boolean needRecover = state != StorageState.NORMAL
222+
&& state != StorageState.NON_EXISTENT
223+
&& state != StorageState.NOT_FORMATTED;
224+
if (state == StorageState.NORMAL && startOpt != StartupOption.ROLLBACK) {
225+
readProperties(sd);
226+
} else if (needRecover) {
227+
sd.doRecover(state);
228+
}
229+
}
230+
219231
void checkConsistentNamespace(NamespaceInfo nsInfo)
220232
throws IOException {
221233
if (nsInfo.getNamespaceID() != getNamespaceID()) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
4444
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
4545
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
46+
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
4647
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
4748
import org.apache.hadoop.hdfs.server.common.StorageInfo;
4849
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
@@ -138,8 +139,9 @@ public class Journal implements Closeable {
138139
private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000;
139140

140141
Journal(Configuration conf, File logDir, String journalId,
141-
StorageErrorReporter errorReporter) throws IOException {
142-
storage = new JNStorage(conf, logDir, errorReporter);
142+
StartupOption startOpt, StorageErrorReporter errorReporter)
143+
throws IOException {
144+
storage = new JNStorage(conf, logDir, startOpt, errorReporter);
143145
this.journalId = journalId;
144146

145147
refreshCachedData();

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNode.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.apache.hadoop.hdfs.DFSConfigKeys;
3535
import org.apache.hadoop.hdfs.HdfsConfiguration;
3636
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
37+
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
3738
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
3839
import org.apache.hadoop.hdfs.server.common.StorageInfo;
3940
import org.apache.hadoop.io.IOUtils;
@@ -77,19 +78,24 @@ public class JournalNode implements Tool, Configurable, JournalNodeMXBean {
7778
*/
7879
private int resultCode = 0;
7980

80-
synchronized Journal getOrCreateJournal(String jid) throws IOException {
81+
synchronized Journal getOrCreateJournal(String jid, StartupOption startOpt)
82+
throws IOException {
8183
QuorumJournalManager.checkJournalId(jid);
8284

8385
Journal journal = journalsById.get(jid);
8486
if (journal == null) {
8587
File logDir = getLogDir(jid);
8688
LOG.info("Initializing journal in directory " + logDir);
87-
journal = new Journal(conf, logDir, jid, new ErrorReporter());
89+
journal = new Journal(conf, logDir, jid, startOpt, new ErrorReporter());
8890
journalsById.put(jid, journal);
8991
}
9092

9193
return journal;
9294
}
95+
96+
Journal getOrCreateJournal(String jid) throws IOException {
97+
return getOrCreateJournal(jid, StartupOption.REGULAR);
98+
}
9399

94100
@Override
95101
public void setConf(Configuration conf) {
@@ -301,12 +307,12 @@ public void doFinalize(String journalId) throws IOException {
301307

302308
public Boolean canRollBack(String journalId, StorageInfo storage,
303309
StorageInfo prevStorage, int targetLayoutVersion) throws IOException {
304-
return getOrCreateJournal(journalId).canRollBack(storage, prevStorage,
305-
targetLayoutVersion);
310+
return getOrCreateJournal(journalId, StartupOption.ROLLBACK).canRollBack(
311+
storage, prevStorage, targetLayoutVersion);
306312
}
307313

308314
public void doRollback(String journalId) throws IOException {
309-
getOrCreateJournal(journalId).doRollback();
315+
getOrCreateJournal(journalId, StartupOption.ROLLBACK).doRollback();
310316
}
311317

312318
public void discardSegments(String journalId, long startTxId)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@
4444
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
4545
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
4646
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
47-
import org.apache.hadoop.hdfs.server.common.Storage;
4847
import org.apache.hadoop.hdfs.server.common.Storage.FormatConfirmable;
4948
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
49+
import org.apache.hadoop.hdfs.server.common.StorageInfo;
5050
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddBlockOp;
5151
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCacheDirectiveInfoOp;
5252
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCachePoolOp;
@@ -1411,7 +1411,7 @@ public synchronized void doFinalizeOfSharedLog() throws IOException {
14111411
}
14121412
}
14131413

1414-
public synchronized boolean canRollBackSharedLog(Storage prevStorage,
1414+
public synchronized boolean canRollBackSharedLog(StorageInfo prevStorage,
14151415
int targetLayoutVersion) throws IOException {
14161416
for (JournalAndStream jas : journalSet.getAllJournalStreams()) {
14171417
if (jas.isShared()) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,10 @@ void doUpgrade(FSNamesystem target) throws IOException {
393393

394394
saveFSImageInAllDirs(target, editLog.getLastWrittenTxId());
395395

396+
// upgrade shared edit storage first
397+
if (target.isHaEnabled()) {
398+
editLog.doUpgradeOfSharedLog();
399+
}
396400
for (Iterator<StorageDirectory> it = storage.dirIterator(false); it.hasNext();) {
397401
StorageDirectory sd = it.next();
398402
try {
@@ -402,9 +406,6 @@ void doUpgrade(FSNamesystem target) throws IOException {
402406
continue;
403407
}
404408
}
405-
if (target.isHaEnabled()) {
406-
editLog.doUpgradeOfSharedLog();
407-
}
408409
storage.reportErrorsOnDirectories(errorSDs);
409410

410411
isUpgradeFinalized = false;
@@ -430,14 +431,19 @@ void doRollback(FSNamesystem fsns) throws IOException {
430431
HdfsConstants.NAMENODE_LAYOUT_VERSION)) {
431432
continue;
432433
}
434+
LOG.info("Can perform rollback for " + sd);
433435
canRollback = true;
434436
}
435437

436438
if (fsns.isHaEnabled()) {
437439
// If HA is enabled, check if the shared log can be rolled back as well.
438440
editLog.initJournalsForWrite();
439-
canRollback |= editLog.canRollBackSharedLog(prevState.getStorage(),
440-
HdfsConstants.NAMENODE_LAYOUT_VERSION);
441+
boolean canRollBackSharedEditLog = editLog.canRollBackSharedLog(
442+
prevState.getStorage(), HdfsConstants.NAMENODE_LAYOUT_VERSION);
443+
if (canRollBackSharedEditLog) {
444+
LOG.info("Can perform rollback for shared edit log.");
445+
canRollback = true;
446+
}
441447
}
442448

443449
if (!canRollback)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNUpgradeUtil.java

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
2727
import org.apache.hadoop.hdfs.server.common.StorageInfo;
2828

29+
import com.google.common.base.Preconditions;
30+
2931
abstract class NNUpgradeUtil {
3032

3133
private static final Log LOG = LogFactory.getLog(NNUpgradeUtil.class);
@@ -82,7 +84,8 @@ static void doFinalize(StorageDirectory sd) throws IOException {
8284
return;
8385
}
8486
LOG.info("Finalizing upgrade of storage directory " + sd.getRoot());
85-
assert sd.getCurrentDir().exists() : "Current directory must exist.";
87+
Preconditions.checkState(sd.getCurrentDir().exists(),
88+
"Current directory must exist.");
8689
final File tmpDir = sd.getFinalizedTmp();
8790
// rename previous to tmp and remove
8891
NNStorage.rename(prevDir, tmpDir);
@@ -105,9 +108,14 @@ static void doPreUpgrade(StorageDirectory sd) throws IOException {
105108
File curDir = sd.getCurrentDir();
106109
File prevDir = sd.getPreviousDir();
107110
File tmpDir = sd.getPreviousTmp();
108-
assert curDir.exists() : "Current directory must exist.";
109-
assert !prevDir.exists() : "previous directory must not exist.";
110-
assert !tmpDir.exists() : "previous.tmp directory must not exist.";
111+
112+
Preconditions.checkState(curDir.exists(),
113+
"Current directory must exist for preupgrade.");
114+
Preconditions.checkState(!prevDir.exists(),
115+
"Previous directory must not exist for preupgrade.");
116+
Preconditions.checkState(!tmpDir.exists(),
117+
"Previous.tmp directory must not exist for preupgrade."
118+
+ " Consider restarting for recovery.");
111119

112120
// rename current to tmp
113121
NNStorage.rename(curDir, tmpDir);
@@ -136,6 +144,11 @@ static void doUpgrade(StorageDirectory sd, Storage storage) throws
136144

137145
File prevDir = sd.getPreviousDir();
138146
File tmpDir = sd.getPreviousTmp();
147+
Preconditions.checkState(!prevDir.exists(),
148+
"previous directory must not exist for upgrade.");
149+
Preconditions.checkState(tmpDir.exists(),
150+
"previous.tmp directory must exist for upgrade.");
151+
139152
// rename tmp to previous
140153
NNStorage.rename(tmpDir, prevDir);
141154
} catch (IOException ioe) {
@@ -154,14 +167,19 @@ static void doUpgrade(StorageDirectory sd, Storage storage) throws
154167
static void doRollBack(StorageDirectory sd)
155168
throws IOException {
156169
File prevDir = sd.getPreviousDir();
157-
if (!prevDir.exists())
170+
if (!prevDir.exists()) {
158171
return;
172+
}
159173

160174
File tmpDir = sd.getRemovedTmp();
161-
assert !tmpDir.exists() : "removed.tmp directory must not exist.";
175+
Preconditions.checkState(!tmpDir.exists(),
176+
"removed.tmp directory must not exist for rollback."
177+
+ " Consider restarting for recovery.");
162178
// rename current to tmp
163179
File curDir = sd.getCurrentDir();
164-
assert curDir.exists() : "Current directory must exist.";
180+
Preconditions.checkState(curDir.exists(),
181+
"Current directory must exist for rollback.");
182+
165183
NNStorage.rename(curDir, tmpDir);
166184
// rename previous to current
167185
NNStorage.rename(prevDir, curDir);

hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSHighAvailabilityWithQJM.apt.vm

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -780,14 +780,19 @@ digest:hdfs-zkfcs:vlUvLnd8MlacsE80rDuu6ONESbM=:rwcda
780780

781781
[[1]] Shut down all of the NNs as normal, and install the newer software.
782782

783-
[[2]] Start one of the NNs with the <<<'-upgrade'>>> flag.
783+
[[2]] Start up all of the JNs. Note that it is <<critical>> that all the
784+
JNs be running when performing the upgrade, rollback, or finalization
785+
operations. If any of the JNs are down at the time of running any of these
786+
operations, the operation will fail.
787+
788+
[[3]] Start one of the NNs with the <<<'-upgrade'>>> flag.
784789

785-
[[3]] On start, this NN will not enter the standby state as usual in an HA
790+
[[4]] On start, this NN will not enter the standby state as usual in an HA
786791
setup. Rather, this NN will immediately enter the active state, perform an
787792
upgrade of its local storage dirs, and also perform an upgrade of the shared
788793
edit log.
789794

790-
[[4]] At this point the other NN in the HA pair will be out of sync with
795+
[[5]] At this point the other NN in the HA pair will be out of sync with
791796
the upgraded NN. In order to bring it back in sync and once again have a highly
792797
available setup, you should re-bootstrap this NameNode by running the NN with
793798
the <<<'-bootstrapStandby'>>> flag. It is an error to start this second NN with

0 commit comments

Comments
 (0)