Skip to content

Commit 900fbfe

Browse files
committed
HDFS-6166. Change Balancer socket read timeout to 20 minutes and add 10 seconds delay after error. Contributed by Nathan Roberts
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1583018 13f79535-47bb-0310-9956-ffa450edef68
1 parent 212425b commit 900fbfe

File tree

2 files changed

+34
-2
lines changed

2 files changed

+34
-2
lines changed

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,9 @@ Release 2.4.0 - UNRELEASED
735735
HDFS-6163. Fix a minor bug in the HA upgrade document. (Fengdong Yu via
736736
jing9)
737737

738+
HDFS-6166. Change Balancer socket read timeout to 20 minutes and add
739+
10 seconds delay after error. (Nathan Roberts via szetszwo)
740+
738741
BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
739742

740743
HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,8 @@ public class Balancer {
190190
*/
191191
public static final int MAX_NUM_CONCURRENT_MOVES = 5;
192192
private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
193+
public static final long DELAY_AFTER_ERROR = 10 * 1000L; //10 seconds
194+
public static final int BLOCK_MOVE_READ_TIMEOUT=20*60*1000; // 20 minutes
193195

194196
private static final String USAGE = "Usage: java "
195197
+ Balancer.class.getSimpleName()
@@ -337,7 +339,14 @@ private void dispatch() {
337339
sock.connect(
338340
NetUtils.createSocketAddr(target.datanode.getXferAddr()),
339341
HdfsServerConstants.READ_TIMEOUT);
340-
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
342+
/* Unfortunately we don't have a good way to know if the Datanode is
343+
* taking a really long time to move a block, OR something has
344+
* gone wrong and it's never going to finish. To deal with this
345+
* scenario, we set a long timeout (20 minutes) to avoid hanging
346+
* the balancer indefinitely.
347+
*/
348+
sock.setSoTimeout(BLOCK_MOVE_READ_TIMEOUT);
349+
341350
sock.setKeepAlive(true);
342351

343352
OutputStream unbufOut = sock.getOutputStream();
@@ -360,6 +369,13 @@ private void dispatch() {
360369
LOG.info("Successfully moved " + this);
361370
} catch (IOException e) {
362371
LOG.warn("Failed to move " + this + ": " + e.getMessage());
372+
/* proxy or target may have an issue, insert a small delay
373+
* before using these nodes further. This avoids a potential storm
374+
* of "threads quota exceeded" Warnings when the balancer
375+
* gets out of sync with work going on in datanode.
376+
*/
377+
proxySource.activateDelay(DELAY_AFTER_ERROR);
378+
target.activateDelay(DELAY_AFTER_ERROR);
363379
} finally {
364380
IOUtils.closeStream(out);
365381
IOUtils.closeStream(in);
@@ -497,6 +513,7 @@ private static class BalancerDatanode {
497513
final double utilization;
498514
final long maxSize2Move;
499515
private long scheduledSize = 0L;
516+
protected long delayUntil = 0L;
500517
// blocks being moved but not confirmed yet
501518
private final List<PendingBlockMove> pendingBlocks =
502519
new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES);
@@ -573,6 +590,18 @@ protected synchronized long getScheduledSize(){
573590
protected synchronized void setScheduledSize(long size){
574591
scheduledSize = size;
575592
}
593+
594+
synchronized private void activateDelay(long delta) {
595+
delayUntil = Time.now() + delta;
596+
}
597+
598+
synchronized private boolean isDelayActive() {
599+
if (delayUntil == 0 || Time.now() > delayUntil){
600+
delayUntil = 0;
601+
return false;
602+
}
603+
return true;
604+
}
576605

577606
/* Check if the node can schedule more blocks to move */
578607
synchronized private boolean isPendingQNotFull() {
@@ -590,7 +619,7 @@ synchronized private boolean isPendingQEmpty() {
590619
/* Add a scheduled block move to the node */
591620
private synchronized boolean addPendingBlock(
592621
PendingBlockMove pendingBlock) {
593-
if (isPendingQNotFull()) {
622+
if (!isDelayActive() && isPendingQNotFull()) {
594623
return pendingBlocks.add(pendingBlock);
595624
}
596625
return false;

0 commit comments

Comments
 (0)