Skip to content

Commit f439eb2

Browse files
committed
HDFS-6160. TestSafeMode occasionally fails. (Contributed by Arpit Agarwal)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1586007 13f79535-47bb-0310-9956-ffa450edef68
1 parent df3b245 commit f439eb2

File tree

4 files changed: 43 additions and 14 deletions.

4 files changed: 43 additions and 14 deletions.

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,8 @@ Release 2.5.0 - UNRELEASED
322322

323323
HDFS-6169. Move the address in WebImageViewer. (Akira Ajisaka via wheat9)
324324

325+
HDFS-6160. TestSafeMode occasionally fails. (Arpit Agarwal)
326+
325327
Release 2.4.1 - UNRELEASED
326328

327329
INCOMPATIBLE CHANGES

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,6 +1022,7 @@ public DatanodeCommand blockReport(DatanodeRegistration nodeReg,
10221022
for(StorageBlockReport r : reports) {
10231023
final BlockListAsLongs blocks = new BlockListAsLongs(r.getBlocks());
10241024
hasStaleStorages = bm.processReport(nodeReg, r.getStorage(), poolId, blocks);
1025+
metrics.incrStorageBlockReportOps();
10251026
}
10261027

10271028
if (nn.getFSImage().isUpgradeFinalized() &&

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ public class NameNodeMetrics {
7373
MutableCounterLong snapshotDiffReportOps;
7474
@Metric("Number of blockReceivedAndDeleted calls")
7575
MutableCounterLong blockReceivedAndDeletedOps;
76+
@Metric("Number of blockReports from individual storages")
77+
MutableCounterLong storageBlockReportOps;
7678

7779
@Metric("Journal transactions") MutableRate transactions;
7880
@Metric("Journal syncs") MutableRate syncs;
@@ -221,6 +223,10 @@ public void incrSnapshotDiffReportOps() {
221223
public void incrBlockReceivedAndDeletedOps() {
222224
blockReceivedAndDeletedOps.incr();
223225
}
226+
227+
public void incrStorageBlockReportOps() {
228+
storageBlockReportOps.incr();
229+
}
224230

225231
public void addTransaction(long latency) {
226232
transactions.add(latency);

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
package org.apache.hadoop.hdfs;
2020

21+
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
22+
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
2123
import static org.junit.Assert.assertEquals;
2224
import static org.junit.Assert.assertFalse;
2325
import static org.junit.Assert.assertTrue;
@@ -26,6 +28,8 @@
2628
import java.io.IOException;
2729
import java.util.List;
2830

31+
import org.apache.commons.logging.Log;
32+
import org.apache.commons.logging.LogFactory;
2933
import org.apache.hadoop.conf.Configuration;
3034
import org.apache.hadoop.fs.FSDataOutputStream;
3135
import org.apache.hadoop.fs.FileStatus;
@@ -55,12 +59,14 @@
5559
* Tests to verify safe mode correctness.
5660
*/
5761
public class TestSafeMode {
62+
public static final Log LOG = LogFactory.getLog(TestSafeMode.class);
5863
private static final Path TEST_PATH = new Path("/test");
5964
private static final int BLOCK_SIZE = 1024;
6065
Configuration conf;
6166
MiniDFSCluster cluster;
6267
FileSystem fs;
6368
DistributedFileSystem dfs;
69+
private static final String NN_METRICS = "NameNodeActivity";
6470

6571
@Before
6672
public void startUp() throws IOException {
@@ -158,16 +164,19 @@ public void testNoExtensionIfNoBlocks() throws IOException {
158164
*/
159165
@Test(timeout=45000)
160166
public void testInitializeReplQueuesEarly() throws Exception {
167+
LOG.info("Starting testInitializeReplQueuesEarly");
161168
// Spray the blocks around the cluster when we add DNs instead of
162169
// concentrating all blocks on the first node.
163170
BlockManagerTestUtil.setWritingPrefersLocalNode(
164171
cluster.getNamesystem().getBlockManager(), false);
165172

166173
cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null);
167174
cluster.waitActive();
175+
176+
LOG.info("Creating files");
168177
DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L);
169178

170-
179+
LOG.info("Stopping all DataNodes");
171180
List<DataNodeProperties> dnprops = Lists.newLinkedList();
172181
dnprops.add(cluster.stopDataNode(0));
173182
dnprops.add(cluster.stopDataNode(0));
@@ -176,6 +185,7 @@ public void testInitializeReplQueuesEarly() throws Exception {
176185
cluster.getConfiguration(0).setFloat(
177186
DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f);
178187

188+
LOG.info("Restarting NameNode");
179189
cluster.restartNameNode();
180190
final NameNode nn = cluster.getNameNode();
181191

@@ -189,27 +199,37 @@ public void testInitializeReplQueuesEarly() throws Exception {
189199
"until threshold is crossed",
190200
NameNodeAdapter.safeModeInitializedReplQueues(nn));
191201

202+
LOG.info("Restarting one DataNode");
192203
cluster.restartDataNode(dnprops.remove(0));
193204

194-
// Wait for the block report from the restarted DN to come in.
205+
// Wait for block reports from all attached storages of
206+
// the restarted DN to come in.
195207
GenericTestUtils.waitFor(new Supplier<Boolean>() {
196208
@Override
197209
public Boolean get() {
198-
return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0;
210+
return getLongCounter("StorageBlockReportOps", getMetrics(NN_METRICS)) ==
211+
MiniDFSCluster.DIRS_PER_DATANODE;
199212
}
200213
}, 10, 10000);
201-
// SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks
202-
// call is still going on at this point - wait until it's done by grabbing
203-
// the lock.
204-
nn.getNamesystem().writeLock();
205-
nn.getNamesystem().writeUnlock();
206-
int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn);
207-
assertTrue("Expected first block report to make some but not all blocks " +
208-
"safe. Got: " + safe, safe >= 1 && safe < 15);
209-
BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
210-
214+
215+
final int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn);
216+
assertTrue("Expected first block report to make some blocks safe.", safe > 0);
217+
assertTrue("Did not expect first block report to make all blocks safe.", safe < 15);
218+
211219
assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn));
212-
assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks());
220+
221+
// Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
222+
// blocks are processed asynchronously so this may take a few seconds.
223+
// Failure here will manifest as a test timeout.
224+
BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
225+
long underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks();
226+
while (underReplicatedBlocks != (15 - safe)) {
227+
LOG.info("UnderReplicatedBlocks expected=" + (15 - safe) +
228+
", actual=" + underReplicatedBlocks);
229+
Thread.sleep(100);
230+
BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
231+
underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks();
232+
}
213233

214234
cluster.restartDataNodes();
215235
}

0 commit comments

Comments
 (0)