Skip to content

Commit 8f15f98

Browse files
committed
platform: Remove NCCL_NETDEVS_POLICY setting from P5en/P6-B200
Remove the NCCL_NETDEVS_POLICY=max:1 setting from the default environment variables for P5en/P6-B200 instances. This configuration requires additional testing and optimization before being included as a default setting.

This reverts commit 2474f73.

Signed-off-by: Mozar Huang <[email protected]>
1 parent 0ecdc99 commit 8f15f98

File tree

2 files changed, with 5 additions and 40 deletions.

src/platform-aws.cpp

Lines changed: 3 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -70,12 +70,6 @@
 70  70    *
 71  71    * The NVLSTree chunk size can not be larger than the NVLS chunk size,
 72  72    * so we ensure both are set to 512KiB.
 73     -  *
 74     -  * * NCCL v2.28.3 introduced NCCL_NETDEVS_POLICY to control how NET devices
 75     -  * are assigned to GPUs. In platforms having multiple GPUs and NICs per
 76     -  * PCIe switch, setting this policy will change the traffic distribution
 77     -  * across the NICs, depending on the collective type as well as other
 78     -  * factors. Thus, the best policy setting may vary per platform.
 79  73    */
 80  74   static struct ec2_platform_data platform_data_map[] = {
 81  75       {
@@ -145,41 +139,13 @@ static struct ec2_platform_data platform_data_map[] = {
145 139               { "NCCL_NET_FORCE_FLUSH", "0" },
146 140           },
147 141       },
148     -     {
149     -         .name = "p5en/p6-b200",
150     -         .regex = "^(p5en|p6-b200).*",
151     -         .topology = NULL,
152     -         .default_dup_conns = 0,
153     -         .latency = 35.0,
154     -         .gdr_required = true,
155     -         .default_protocol = PROTOCOL::RDMA,
156     -         .domain_per_thread = true,
157     -         /*
158     -          * Note: Based on empirical testing, setting the
159     -          * NCCL_NETDEVS_POLICY=max:1 gives optimal performance
160     -          * on platforms with 2 GPUs and 2 NICs per PCIe switch,
161     -          * such as P5en and P6-B200.
162     -          */
163     -         .env = {
164     -             { "NCCL_BUFFSIZE", "8388608" },
165     -             { "NCCL_P2P_NET_CHUNKSIZE", "524288" },
166     -             { "NCCL_NVLSTREE_MAX_CHUNKSIZE", "524288" },
167     -             { "NCCL_NVLS_CHUNKSIZE", "524288" },
168     -             { "NCCL_NET_FORCE_FLUSH", "0" },
169     -             { "NCCL_NETDEVS_POLICY", "max:1" },
170     -         },
171     -     },
172 142       {
173 143           .name = "p-series",
174 144           /*
175 145            * While the regex will match against P5 and later
176     -          * instance families, we expect this to apply to
177     -          * P6e-GB200 and later, due to previous entries to
178     -          * match P5, P5e, P5en, and P6-B200.
179     -          *
180     -          * Note: Need to revisit NCCL_NETDEVS_POLICY when
181     -          * platforms have different topology or major
182     -          * hardware changes.
    146 +          * instance families, we expect this to only apply
    147 +          * to P5en and later, due to previous entries to
    148 +          * match P5 and P5e.
183 149            */
184 150           .regex = "^p([5-9]|[0-9]{2,}).*",
185 151           .topology = NULL,

tests/unit/aws_platform_mapper.cpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -60,9 +60,8 @@ static int check_known_platforms(void)
 60  60       ret += check_value(platform_data_list, len, "p5.4xlarge", "p5.4xlarge");
 61  61       ret += check_value(platform_data_list, len, "p5.48xlarge", "p5/p5e");
 62  62       ret += check_value(platform_data_list, len, "p5e.48xlarge", "p5/p5e");
 63     -     ret += check_value(platform_data_list, len, "p5en.48xlarge", "p5en/p6-b200");
 64     -     ret += check_value(platform_data_list, len, "p6-b200.48xlarge", "p5en/p6-b200");
 65     -     ret += check_value(platform_data_list, len, "p6e-gb200.36xlarge", "p-series");
     63 +     ret += check_value(platform_data_list, len, "p5en.48xlarge", "p-series");
     64 +     ret += check_value(platform_data_list, len, "p6-b200.48xlarge", "p-series");
 66  65       ret += check_value(platform_data_list, len, "g5.48xlarge", "g5.48xlarge");
 67  66       ret += check_value(platform_data_list, len, "g6.16xlarge", NULL);
 68  67
0 commit comments

Comments
 (0)