1818
1919package org .apache .hadoop .hdfs ;
2020
21+ import static org .apache .hadoop .test .MetricsAsserts .getLongCounter ;
22+ import static org .apache .hadoop .test .MetricsAsserts .getMetrics ;
2123import static org .junit .Assert .assertEquals ;
2224import static org .junit .Assert .assertFalse ;
2325import static org .junit .Assert .assertTrue ;
2628import java .io .IOException ;
2729import java .util .List ;
2830
31+ import org .apache .commons .logging .Log ;
32+ import org .apache .commons .logging .LogFactory ;
2933import org .apache .hadoop .conf .Configuration ;
3034import org .apache .hadoop .fs .FSDataOutputStream ;
3135import org .apache .hadoop .fs .FileStatus ;
5559 * Tests to verify safe mode correctness.
5660 */
5761public class TestSafeMode {
62+ public static final Log LOG = LogFactory .getLog (TestSafeMode .class );
5863 private static final Path TEST_PATH = new Path ("/test" );
5964 private static final int BLOCK_SIZE = 1024 ;
6065 Configuration conf ;
6166 MiniDFSCluster cluster ;
6267 FileSystem fs ;
6368 DistributedFileSystem dfs ;
69+ private static final String NN_METRICS = "NameNodeActivity" ;
6470
6571 @ Before
6672 public void startUp () throws IOException {
@@ -158,16 +164,19 @@ public void testNoExtensionIfNoBlocks() throws IOException {
158164 */
159165 @ Test (timeout =45000 )
160166 public void testInitializeReplQueuesEarly () throws Exception {
167+ LOG .info ("Starting testInitializeReplQueuesEarly" );
161168 // Spray the blocks around the cluster when we add DNs instead of
162169 // concentrating all blocks on the first node.
163170 BlockManagerTestUtil .setWritingPrefersLocalNode (
164171 cluster .getNamesystem ().getBlockManager (), false );
165172
166173 cluster .startDataNodes (conf , 2 , true , StartupOption .REGULAR , null );
167174 cluster .waitActive ();
175+
176+ LOG .info ("Creating files" );
168177 DFSTestUtil .createFile (fs , TEST_PATH , 15 *BLOCK_SIZE , (short )1 , 1L );
169178
170-
179+ LOG . info ( "Stopping all DataNodes" );
171180 List <DataNodeProperties > dnprops = Lists .newLinkedList ();
172181 dnprops .add (cluster .stopDataNode (0 ));
173182 dnprops .add (cluster .stopDataNode (0 ));
@@ -176,6 +185,7 @@ public void testInitializeReplQueuesEarly() throws Exception {
176185 cluster .getConfiguration (0 ).setFloat (
177186 DFSConfigKeys .DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY , 1f /15f );
178187
188+ LOG .info ("Restarting NameNode" );
179189 cluster .restartNameNode ();
180190 final NameNode nn = cluster .getNameNode ();
181191
@@ -189,27 +199,37 @@ public void testInitializeReplQueuesEarly() throws Exception {
189199 "until threshold is crossed" ,
190200 NameNodeAdapter .safeModeInitializedReplQueues (nn ));
191201
202+ LOG .info ("Restarting one DataNode" );
192203 cluster .restartDataNode (dnprops .remove (0 ));
193204
194- // Wait for the block report from the restarted DN to come in.
205+ // Wait for block reports from all attached storages of
206+ // the restarted DN to come in.
195207 GenericTestUtils .waitFor (new Supplier <Boolean >() {
196208 @ Override
197209 public Boolean get () {
198- return NameNodeAdapter .getSafeModeSafeBlocks (nn ) > 0 ;
210+ return getLongCounter ("StorageBlockReportOps" , getMetrics (NN_METRICS )) ==
211+ MiniDFSCluster .DIRS_PER_DATANODE ;
199212 }
200213 }, 10 , 10000 );
201- // SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks
202- // call is still going on at this point - wait until it's done by grabbing
203- // the lock.
204- nn .getNamesystem ().writeLock ();
205- nn .getNamesystem ().writeUnlock ();
206- int safe = NameNodeAdapter .getSafeModeSafeBlocks (nn );
207- assertTrue ("Expected first block report to make some but not all blocks " +
208- "safe. Got: " + safe , safe >= 1 && safe < 15 );
209- BlockManagerTestUtil .updateState (nn .getNamesystem ().getBlockManager ());
210-
214+
215+ final int safe = NameNodeAdapter .getSafeModeSafeBlocks (nn );
216+ assertTrue ("Expected first block report to make some blocks safe." , safe > 0 );
217+ assertTrue ("Did not expect first block report to make all blocks safe." , safe < 15 );
218+
211219 assertTrue (NameNodeAdapter .safeModeInitializedReplQueues (nn ));
212- assertEquals (15 - safe , nn .getNamesystem ().getUnderReplicatedBlocks ());
220+
221+ // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
222+ // blocks are processed asynchronously so this may take a few seconds.
223+ // Failure here will manifest as a test timeout.
224+ BlockManagerTestUtil .updateState (nn .getNamesystem ().getBlockManager ());
225+ long underReplicatedBlocks = nn .getNamesystem ().getUnderReplicatedBlocks ();
226+ while (underReplicatedBlocks != (15 - safe )) {
227+ LOG .info ("UnderReplicatedBlocks expected=" + (15 - safe ) +
228+ ", actual=" + underReplicatedBlocks );
229+ Thread .sleep (100 );
230+ BlockManagerTestUtil .updateState (nn .getNamesystem ().getBlockManager ());
231+ underReplicatedBlocks = nn .getNamesystem ().getUnderReplicatedBlocks ();
232+ }
213233
214234 cluster .restartDataNodes ();
215235 }
0 commit comments