Skip to content

Commit cb7b2ce

Browse files
committed
YARN-2434. RM should not recover containers from previously failed attempt when AM restart is not enabled. Contributed by Jian He
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1619614 13f79535-47bb-0310-9956-ffa450edef68
1 parent abf276c commit cb7b2ce

File tree

3 files changed

+29
-0
lines changed

3 files changed

+29
-0
lines changed

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ Release 2.6.0 - UNRELEASED
231231
YARN-2424. LCE should support non-cgroups, non-secure mode (Chris Douglas
232232
via aw)
233233

234+
YARN-2434. RM should not recover containers from previously failed attempt
235+
when AM restart is not enabled (Jian He via jlowe)
236+
234237
Release 2.5.0 - 2014-08-11
235238

236239
INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,19 @@ public synchronized void recoverContainersOnNode(
273273
SchedulerApplicationAttempt schedulerAttempt =
274274
schedulerApp.getCurrentAppAttempt();
275275

276+
if (!rmApp.getApplicationSubmissionContext()
277+
.getKeepContainersAcrossApplicationAttempts()) {
278+
// Do not recover containers that belong to a stopped attempt or to a previous attempt.
279+
if (schedulerAttempt.isStopped()
280+
|| !schedulerAttempt.getApplicationAttemptId().equals(
281+
container.getContainerId().getApplicationAttemptId())) {
282+
LOG.info("Skip recovering container " + container
283+
+ " for already stopped attempt.");
284+
killOrphanContainerOnNode(nm, container);
285+
continue;
286+
}
287+
}
288+
276289
// create container
277290
RMContainer rmContainer = recoverAndCreateContainer(container, nm);
278291

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,19 @@ public void testAMfailedBetweenRMRestart() throws Exception {
513513
// just-recovered containers.
514514
assertNull(scheduler.getRMContainer(runningContainer.getContainerId()));
515515
assertNull(scheduler.getRMContainer(completedContainer.getContainerId()));
516+
517+
rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
518+
519+
MockNM nm2 =
520+
new MockNM("127.1.1.1:4321", 8192, rm2.getResourceTrackerService());
521+
NMContainerStatus previousAttemptContainer =
522+
TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 4,
523+
ContainerState.RUNNING);
524+
nm2.registerNode(Arrays.asList(previousAttemptContainer), null);
525+
// Wait for the RM to settle down after recovering containers.
526+
Thread.sleep(3000);
527+
// Containers from the previously failed attempt should not be recovered.
528+
assertNull(scheduler.getRMContainer(previousAttemptContainer.getContainerId()));
516529
}
517530

518531
// Apps already completed before RM restart. Restarted RM scheduler should not

0 commit comments

Comments
 (0)