Skip to content

Commit 5a79a63

Browse files
committed
YARN-1849. Fixed NPE in ResourceTrackerService#registerNodeManager for UAM. Contributed by Karthik Kambatla
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1580077 13f79535-47bb-0310-9956-ffa450edef68
1 parent f3bd13b commit 5a79a63

File tree

4 files changed

+107
-38
lines changed

4 files changed

+107
-38
lines changed

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,9 @@ Release 2.4.0 - UNRELEASED
539539
YARN-1670. Fixed a bug in log-aggregation that can cause the writer to write
540540
more log-data than the log-length that it records. (Mit Desai via vinodk)
541541

542+
YARN-1849. Fixed NPE in ResourceTrackerService#registerNodeManager for UAM
543+
(Karthik Kambatla via jianhe )
544+
542545
Release 2.3.1 - UNRELEASED
543546

544547
INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.apache.hadoop.service.AbstractService;
3232
import org.apache.hadoop.util.VersionUtil;
3333
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
34+
import org.apache.hadoop.yarn.api.records.Container;
3435
import org.apache.hadoop.yarn.api.records.ContainerId;
3536
import org.apache.hadoop.yarn.api.records.ContainerState;
3637
import org.apache.hadoop.yarn.api.records.ContainerStatus;
@@ -187,12 +188,51 @@ protected void serviceStop() throws Exception {
187188
super.serviceStop();
188189
}
189190

191+
/**
192+
* Helper method to handle received ContainerStatus. If this corresponds to
193+
* the completion of a master-container of a managed AM,
194+
* we call the handler for RMAppAttemptContainerFinishedEvent.
195+
*/
196+
@SuppressWarnings("unchecked")
197+
@VisibleForTesting
198+
void handleContainerStatus(ContainerStatus containerStatus) {
199+
ApplicationAttemptId appAttemptId =
200+
containerStatus.getContainerId().getApplicationAttemptId();
201+
RMApp rmApp =
202+
rmContext.getRMApps().get(appAttemptId.getApplicationId());
203+
if (rmApp == null) {
204+
LOG.error("Received finished container : "
205+
+ containerStatus.getContainerId()
206+
+ "for unknown application " + appAttemptId.getApplicationId()
207+
+ " Skipping.");
208+
return;
209+
}
210+
211+
if (rmApp.getApplicationSubmissionContext().getUnmanagedAM()) {
212+
if (LOG.isDebugEnabled()) {
213+
LOG.debug("Ignoring container completion status for unmanaged AM"
214+
+ rmApp.getApplicationId());
215+
}
216+
return;
217+
}
218+
219+
RMAppAttempt rmAppAttempt = rmApp.getRMAppAttempt(appAttemptId);
220+
Container masterContainer = rmAppAttempt.getMasterContainer();
221+
if (masterContainer.getId().equals(containerStatus.getContainerId())
222+
&& containerStatus.getState() == ContainerState.COMPLETE) {
223+
// sending master container finished event.
224+
RMAppAttemptContainerFinishedEvent evt =
225+
new RMAppAttemptContainerFinishedEvent(appAttemptId,
226+
containerStatus);
227+
rmContext.getDispatcher().getEventHandler().handle(evt);
228+
}
229+
}
230+
190231
@SuppressWarnings("unchecked")
191232
@Override
192233
public RegisterNodeManagerResponse registerNodeManager(
193234
RegisterNodeManagerRequest request) throws YarnException,
194235
IOException {
195-
196236
NodeId nodeId = request.getNodeId();
197237
String host = nodeId.getHost();
198238
int cmPort = nodeId.getPort();
@@ -204,29 +244,7 @@ public RegisterNodeManagerResponse registerNodeManager(
204244
LOG.info("received container statuses on node manager register :"
205245
+ request.getContainerStatuses());
206246
for (ContainerStatus containerStatus : request.getContainerStatuses()) {
207-
ApplicationAttemptId appAttemptId =
208-
containerStatus.getContainerId().getApplicationAttemptId();
209-
RMApp rmApp =
210-
rmContext.getRMApps().get(appAttemptId.getApplicationId());
211-
if (rmApp != null) {
212-
RMAppAttempt rmAppAttempt = rmApp.getRMAppAttempt(appAttemptId);
213-
if (rmAppAttempt != null) {
214-
if (rmAppAttempt.getMasterContainer().getId()
215-
.equals(containerStatus.getContainerId())
216-
&& containerStatus.getState() == ContainerState.COMPLETE) {
217-
// sending master container finished event.
218-
RMAppAttemptContainerFinishedEvent evt =
219-
new RMAppAttemptContainerFinishedEvent(appAttemptId,
220-
containerStatus);
221-
rmContext.getDispatcher().getEventHandler().handle(evt);
222-
}
223-
}
224-
} else {
225-
LOG.error("Received finished container :"
226-
+ containerStatus.getContainerId()
227-
+ " for non existing application :"
228-
+ appAttemptId.getApplicationId());
229-
}
247+
handleContainerStatus(containerStatus);
230248
}
231249
}
232250
RegisterNodeManagerResponse response = recordFactory

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@
3535

3636
import javax.crypto.SecretKey;
3737

38+
import com.google.common.annotations.VisibleForTesting;
3839
import org.apache.commons.lang.StringUtils;
3940
import org.apache.commons.logging.Log;
4041
import org.apache.commons.logging.LogFactory;
42+
import org.apache.hadoop.classification.InterfaceAudience;
4143
import org.apache.hadoop.conf.Configuration;
4244
import org.apache.hadoop.security.Credentials;
4345
import org.apache.hadoop.security.UserGroupInformation;
@@ -629,7 +631,9 @@ public Container getMasterContainer() {
629631
}
630632
}
631633

632-
private void setMasterContainer(Container container) {
634+
@InterfaceAudience.Private
635+
@VisibleForTesting
636+
public void setMasterContainer(Container container) {
633637
masterContainer = container;
634638
}
635639

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java

Lines changed: 57 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626
import java.util.HashMap;
2727
import java.util.List;
2828

29-
import org.junit.Assert;
30-
3129
import org.apache.hadoop.conf.Configuration;
3230
import org.apache.hadoop.io.IOUtils;
3331
import org.apache.hadoop.metrics2.MetricsSystem;
@@ -45,21 +43,29 @@
4543
import org.apache.hadoop.yarn.conf.YarnConfiguration;
4644
import org.apache.hadoop.yarn.event.Dispatcher;
4745
import org.apache.hadoop.yarn.event.DrainDispatcher;
46+
import org.apache.hadoop.yarn.event.Event;
4847
import org.apache.hadoop.yarn.event.EventHandler;
4948
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
5049
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
5150
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
5251
import org.apache.hadoop.yarn.server.api.records.NodeAction;
5352
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
53+
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
5454
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
5555
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
5656
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
5757
import org.apache.hadoop.yarn.util.Records;
5858
import org.apache.hadoop.yarn.util.YarnVersionInfo;
59+
5960
import org.junit.After;
61+
import org.junit.Assert;
6062
import org.junit.Test;
6163

6264
import static org.junit.Assert.assertEquals;
65+
import static org.mockito.Matchers.any;
66+
import static org.mockito.Mockito.never;
67+
import static org.mockito.Mockito.spy;
68+
import static org.mockito.Mockito.verify;
6369

6470
public class TestResourceTrackerService {
6571

@@ -468,26 +474,64 @@ private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health,
468474
ClusterMetrics.getMetrics().getUnhealthyNMs());
469475
}
470476

477+
@SuppressWarnings("unchecked")
471478
@Test
472-
public void testNodeRegistrationWithContainers() throws Exception {
473-
rm = new MockRM();
474-
rm.init(new YarnConfiguration());
479+
public void testHandleContainerStatusInvalidCompletions() throws Exception {
480+
rm = new MockRM(new YarnConfiguration());
475481
rm.start();
476-
RMApp app = rm.submitApp(1024);
477482

478-
MockNM nm = rm.registerNode("host1:1234", 8192);
479-
nm.nodeHeartbeat(true);
483+
EventHandler handler =
484+
spy(rm.getRMContext().getDispatcher().getEventHandler());
480485

481-
// Register node with some container statuses
486+
// Case 1: Unmanaged AM
487+
RMApp app = rm.submitApp(1024, true);
488+
489+
// Case 1.1: AppAttemptId is null
482490
ContainerStatus status = ContainerStatus.newInstance(
483491
ContainerId.newInstance(ApplicationAttemptId.newInstance(
484492
app.getApplicationId(), 2), 1),
485493
ContainerState.COMPLETE, "Dummy Completed", 0);
494+
rm.getResourceTrackerService().handleContainerStatus(status);
495+
verify(handler, never()).handle((Event) any());
496+
497+
// Case 1.2: Master container is null
498+
RMAppAttemptImpl currentAttempt =
499+
(RMAppAttemptImpl) app.getCurrentAppAttempt();
500+
currentAttempt.setMasterContainer(null);
501+
status = ContainerStatus.newInstance(
502+
ContainerId.newInstance(currentAttempt.getAppAttemptId(), 0),
503+
ContainerState.COMPLETE, "Dummy Completed", 0);
504+
rm.getResourceTrackerService().handleContainerStatus(status);
505+
verify(handler, never()).handle((Event)any());
486506

487-
// The following shouldn't throw NPE
488-
nm.registerNode(Collections.singletonList(status));
489-
assertEquals("Incorrect number of nodes", 1,
490-
rm.getRMContext().getRMNodes().size());
507+
// Case 2: Managed AM
508+
app = rm.submitApp(1024);
509+
510+
// Case 2.1: AppAttemptId is null
511+
status = ContainerStatus.newInstance(
512+
ContainerId.newInstance(ApplicationAttemptId.newInstance(
513+
app.getApplicationId(), 2), 1),
514+
ContainerState.COMPLETE, "Dummy Completed", 0);
515+
try {
516+
rm.getResourceTrackerService().handleContainerStatus(status);
517+
} catch (Exception e) {
518+
// expected - ignore
519+
}
520+
verify(handler, never()).handle((Event)any());
521+
522+
// Case 2.2: Master container is null
523+
currentAttempt =
524+
(RMAppAttemptImpl) app.getCurrentAppAttempt();
525+
currentAttempt.setMasterContainer(null);
526+
status = ContainerStatus.newInstance(
527+
ContainerId.newInstance(currentAttempt.getAppAttemptId(), 0),
528+
ContainerState.COMPLETE, "Dummy Completed", 0);
529+
try {
530+
rm.getResourceTrackerService().handleContainerStatus(status);
531+
} catch (Exception e) {
532+
// expected - ignore
533+
}
534+
verify(handler, never()).handle((Event)any());
491535
}
492536

493537
@Test

0 commit comments

Comments
 (0)