@@ -190,6 +190,8 @@ public class Balancer {
190190 */
191191 public static final int MAX_NUM_CONCURRENT_MOVES = 5 ;
192192 private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5 ;
193+ public static final long DELAY_AFTER_ERROR = 10 * 1000L ; //10 seconds
194+ public static final int BLOCK_MOVE_READ_TIMEOUT =20 *60 *1000 ; // 20 minutes
193195
194196 private static final String USAGE = "Usage: java "
195197 + Balancer .class .getSimpleName ()
@@ -337,7 +339,14 @@ private void dispatch() {
337339 sock .connect (
338340 NetUtils .createSocketAddr (target .datanode .getXferAddr ()),
339341 HdfsServerConstants .READ_TIMEOUT );
340- sock .setSoTimeout (HdfsServerConstants .READ_TIMEOUT );
342+ /* Unfortunately we don't have a good way to know if the Datanode is
343+ * taking a really long time to move a block, OR something has
344+ * gone wrong and it's never going to finish. To deal with this
345+ * scenario, we set a long timeout (20 minutes) to avoid hanging
346+ * the balancer indefinitely.
347+ */
348+ sock .setSoTimeout (BLOCK_MOVE_READ_TIMEOUT );
349+
341350 sock .setKeepAlive (true );
342351
343352 OutputStream unbufOut = sock .getOutputStream ();
@@ -360,6 +369,13 @@ private void dispatch() {
360369 LOG .info ("Successfully moved " + this );
361370 } catch (IOException e ) {
362371 LOG .warn ("Failed to move " + this + ": " + e .getMessage ());
372+ /* proxy or target may have an issue, insert a small delay
373+ * before using these nodes further. This avoids a potential storm
374+ * of "threads quota exceeded" Warnings when the balancer
375+ * gets out of sync with work going on in datanode.
376+ */
377+ proxySource .activateDelay (DELAY_AFTER_ERROR );
378+ target .activateDelay (DELAY_AFTER_ERROR );
363379 } finally {
364380 IOUtils .closeStream (out );
365381 IOUtils .closeStream (in );
@@ -497,6 +513,7 @@ private static class BalancerDatanode {
497513 final double utilization ;
498514 final long maxSize2Move ;
499515 private long scheduledSize = 0L ;
516+ protected long delayUntil = 0L ;
500517 // blocks being moved but not confirmed yet
501518 private final List <PendingBlockMove > pendingBlocks =
502519 new ArrayList <PendingBlockMove >(MAX_NUM_CONCURRENT_MOVES );
@@ -573,6 +590,18 @@ protected synchronized long getScheduledSize(){
573590 protected synchronized void setScheduledSize (long size ){
574591 scheduledSize = size ;
575592 }
593+
594+ synchronized private void activateDelay (long delta ) {
595+ delayUntil = Time .now () + delta ;
596+ }
597+
598+ synchronized private boolean isDelayActive () {
599+ if (delayUntil == 0 || Time .now () > delayUntil ){
600+ delayUntil = 0 ;
601+ return false ;
602+ }
603+ return true ;
604+ }
576605
577606 /* Check if the node can schedule more blocks to move */
578607 synchronized private boolean isPendingQNotFull () {
@@ -590,7 +619,7 @@ synchronized private boolean isPendingQEmpty() {
590619 /* Add a scheduled block move to the node */
591620 private synchronized boolean addPendingBlock (
592621 PendingBlockMove pendingBlock ) {
593- if (isPendingQNotFull ()) {
622+ if (! isDelayActive () && isPendingQNotFull ()) {
594623 return pendingBlocks .add (pendingBlock );
595624 }
596625 return false ;
0 commit comments