Skip to content

Commit c3fad88

Browse files
committed
HDFS-3026. HA: Handle failure during HA state transition. Contributed by Aaron T. Myers.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2.0.0-alpha@1337033 13f79535-47bb-0310-9956-ffa450edef68
1 parent 90a5522 commit c3fad88

File tree

3 files changed

+149
-18
lines changed

3 files changed

+149
-18
lines changed

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,8 @@ Release 2.0.0 - UNRELEASED
511511

512512
HDFS-3395. NN doesn't start with HA+security enabled and HTTP address set to 0.0.0.0. (atm)
513513

514+
HDFS-3026. HA: Handle failure during HA state transition. (atm)
515+
514516
BREAKDOWN OF HDFS-1623 SUBTASKS
515517

516518
HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

Lines changed: 67 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ public long getProtocolVersion(String protocol,
206206
private final boolean haEnabled;
207207
private final HAContext haContext;
208208
protected boolean allowStaleStandbyReads;
209+
private Runtime runtime = Runtime.getRuntime();
209210

210211

211212
/** httpServer */
@@ -481,11 +482,16 @@ private void stopCommonServices() {
481482
}
482483

483484
private void startTrashEmptier(Configuration conf) throws IOException {
484-
long trashInterval
485-
= conf.getLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
486-
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
487-
if(trashInterval == 0)
485+
long trashInterval = conf.getLong(
486+
CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
487+
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
488+
if (trashInterval == 0) {
488489
return;
490+
} else if (trashInterval < 0) {
491+
throw new IOException("Cannot start trash emptier with negative interval."
492+
+ " Set " + CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY + " to a"
493+
+ " positive value.");
494+
}
489495
this.emptier = new Thread(new Trash(conf).getEmptier(), "Trash Emptier");
490496
this.emptier.setDaemon(true);
491497
this.emptier.start();
@@ -1235,14 +1241,37 @@ synchronized HAServiceState getServiceState() {
12351241
}
12361242
return state.getServiceState();
12371243
}
1244+
1245+
@VisibleForTesting
1246+
public synchronized void setRuntimeForTesting(Runtime runtime) {
1247+
this.runtime = runtime;
1248+
}
12381249

12391250
/**
1240-
* Class used as expose {@link NameNode} as context to {@link HAState}
1251+
* Shut down the NN immediately in an ungraceful way. Used when it would be
1252+
* unsafe for the NN to continue operating, e.g. during a failed HA state
1253+
* transition.
12411254
*
1242-
* TODO(HA):
1243-
* When entering and exiting state, on failing to start services,
1244-
* appropriate action is needed todo either shutdown the node or recover
1245-
* from failure.
1255+
* @param t exception which warrants the shutdown. Printed to the NN log
1256+
* before exit.
1257+
* @throws ServiceFailedException reached only in tests, where the stubbed Runtime's exit() returns.
1258+
*/
1259+
private synchronized void doImmediateShutdown(Throwable t)
1260+
throws ServiceFailedException {
1261+
String message = "Error encountered requiring NN shutdown. " +
1262+
"Shutting down immediately.";
1263+
try {
1264+
LOG.fatal(message, t);
1265+
} catch (Throwable ignored) {
1266+
// This is unlikely to happen, but there's nothing we can do if it does.
1267+
}
1268+
runtime.exit(1);
1269+
// This code is only reached during testing, when runtime is stubbed out.
1270+
throw new ServiceFailedException(message, t);
1271+
}
1272+
1273+
/**
1274+
* Class used to expose {@link NameNode} as context to {@link HAState}
12461275
*/
12471276
protected class NameNodeHAContext implements HAContext {
12481277
@Override
@@ -1257,32 +1286,52 @@ public HAState getState() {
12571286

12581287
@Override
12591288
public void startActiveServices() throws IOException {
1260-
namesystem.startActiveServices();
1261-
startTrashEmptier(conf);
1289+
try {
1290+
namesystem.startActiveServices();
1291+
startTrashEmptier(conf);
1292+
} catch (Throwable t) {
1293+
doImmediateShutdown(t);
1294+
}
12621295
}
12631296

12641297
@Override
12651298
public void stopActiveServices() throws IOException {
1266-
if (namesystem != null) {
1267-
namesystem.stopActiveServices();
1299+
try {
1300+
if (namesystem != null) {
1301+
namesystem.stopActiveServices();
1302+
}
1303+
stopTrashEmptier();
1304+
} catch (Throwable t) {
1305+
doImmediateShutdown(t);
12681306
}
1269-
stopTrashEmptier();
12701307
}
12711308

12721309
@Override
12731310
public void startStandbyServices() throws IOException {
1274-
namesystem.startStandbyServices(conf);
1311+
try {
1312+
namesystem.startStandbyServices(conf);
1313+
} catch (Throwable t) {
1314+
doImmediateShutdown(t);
1315+
}
12751316
}
12761317

12771318
@Override
12781319
public void prepareToStopStandbyServices() throws ServiceFailedException {
1279-
namesystem.prepareToStopStandbyServices();
1320+
try {
1321+
namesystem.prepareToStopStandbyServices();
1322+
} catch (Throwable t) {
1323+
doImmediateShutdown(t);
1324+
}
12801325
}
12811326

12821327
@Override
12831328
public void stopStandbyServices() throws IOException {
1284-
if (namesystem != null) {
1285-
namesystem.stopStandbyServices();
1329+
try {
1330+
if (namesystem != null) {
1331+
namesystem.stopStandbyServices();
1332+
}
1333+
} catch (Throwable t) {
1334+
doImmediateShutdown(t);
12861335
}
12871336
}
12881337

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hdfs.server.namenode.ha;
19+
20+
import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
21+
import static org.junit.Assert.fail;
22+
import static org.mockito.Matchers.anyInt;
23+
import static org.mockito.Mockito.mock;
24+
import static org.mockito.Mockito.times;
25+
import static org.mockito.Mockito.verify;
26+
27+
import java.io.IOException;
28+
29+
import org.apache.commons.logging.Log;
30+
import org.apache.commons.logging.LogFactory;
31+
import org.apache.hadoop.conf.Configuration;
32+
import org.apache.hadoop.fs.CommonConfigurationKeys;
33+
import org.apache.hadoop.ha.ServiceFailedException;
34+
import org.apache.hadoop.hdfs.MiniDFSCluster;
35+
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
36+
import org.junit.Test;
37+
38+
/**
39+
* Tests to verify the behavior when a NameNode fails to fully transition between HA states.
40+
*/
41+
public class TestStateTransitionFailure {
42+
43+
public static final Log LOG = LogFactory.getLog(TestStateTransitionFailure.class);
44+
45+
/**
46+
* Ensure that a failure to fully transition to the active state causes a
47+
* shutdown of the NameNode.
48+
*/
49+
@Test
50+
public void testFailureToTransitionCausesShutdown() throws IOException {
51+
MiniDFSCluster cluster = null;
52+
try {
53+
Configuration conf = new Configuration();
54+
// Set an illegal value for the trash emptier interval. This will cause
55+
// the NN to fail to transition to the active state.
56+
conf.setLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, -1);
57+
cluster = new MiniDFSCluster.Builder(conf)
58+
.nnTopology(MiniDFSNNTopology.simpleHATopology())
59+
.numDataNodes(0)
60+
.build();
61+
cluster.waitActive();
62+
Runtime mockRuntime = mock(Runtime.class);
63+
cluster.getNameNode(0).setRuntimeForTesting(mockRuntime);
64+
verify(mockRuntime, times(0)).exit(anyInt());
65+
try {
66+
cluster.transitionToActive(0);
67+
fail("Transitioned to active but should not have been able to.");
68+
} catch (ServiceFailedException sfe) {
69+
assertExceptionContains("Error encountered requiring NN shutdown. " +
70+
"Shutting down immediately.", sfe);
71+
LOG.info("got expected exception", sfe);
72+
}
73+
verify(mockRuntime, times(1)).exit(anyInt());
74+
} finally {
75+
if (cluster != null) {
76+
cluster.shutdown();
77+
}
78+
}
79+
}
80+
}

0 commit comments

Comments
 (0)