|
70 | 70 | * |
71 | 71 | * The NVLSTree chunk size cannot be larger than the NVLS chunk size, |
72 | 72 | * so we ensure both are set to 512KiB. |
73 | | - * |
74 | | - * * NCCL v2.28.3 introduced NCCL_NETDEVS_POLICY to control how NET devices |
75 | | - * are assigned to GPUs. In platforms having multiple GPUs and NICs per |
76 | | - * PCIe switch, setting this policy will change the traffic distribution |
77 | | - * across the NICs, depending on the collective type as well as other |
78 | | - * factors. Thus, the best policy setting may vary per platform. |
79 | 73 | */ |
80 | 74 | static struct ec2_platform_data platform_data_map[] = { |
81 | 75 | { |
@@ -145,41 +139,13 @@ static struct ec2_platform_data platform_data_map[] = { |
145 | 139 | { "NCCL_NET_FORCE_FLUSH", "0" }, |
146 | 140 | }, |
147 | 141 | }, |
148 | | - { |
149 | | - .name = "p5en/p6-b200", |
150 | | - .regex = "^(p5en|p6-b200).*", |
151 | | - .topology = NULL, |
152 | | - .default_dup_conns = 0, |
153 | | - .latency = 35.0, |
154 | | - .gdr_required = true, |
155 | | - .default_protocol = PROTOCOL::RDMA, |
156 | | - .domain_per_thread = true, |
157 | | - /* |
158 | | - * Note: Based on empirical testing, setting the |
159 | | - * NCCL_NETDEVS_POLICY=max:1 gives optimal performance |
160 | | - * on platforms with 2 GPUs and 2 NICs per PCIe switch, |
161 | | - * such as P5en and P6-B200. |
162 | | - */ |
163 | | - .env = { |
164 | | - { "NCCL_BUFFSIZE", "8388608" }, |
165 | | - { "NCCL_P2P_NET_CHUNKSIZE", "524288" }, |
166 | | - { "NCCL_NVLSTREE_MAX_CHUNKSIZE", "524288" }, |
167 | | - { "NCCL_NVLS_CHUNKSIZE", "524288" }, |
168 | | - { "NCCL_NET_FORCE_FLUSH", "0" }, |
169 | | - { "NCCL_NETDEVS_POLICY", "max:1" }, |
170 | | - }, |
171 | | - }, |
172 | 142 | { |
173 | 143 | .name = "p-series", |
174 | 144 | /* |
175 | 145 | * While the regex will match against P5 and later |
176 | | - * instance families, we expect this to apply to |
177 | | - * P6e-GB200 and later, due to previous entries to |
178 | | - * match P5, P5e, P5en, and P6-B200. |
179 | | - * |
180 | | - * Note: Need to revisit NCCL_NETDEVS_POLICY when |
181 | | - * platforms have different topology or major |
182 | | - * hardware changes. |
| 146 | + * instance families, we expect this to apply only |
| 147 | + * to P5en and later, because earlier entries |
| 148 | + * already match P5 and P5e. |
183 | 149 | */ |
184 | 150 | .regex = "^p([5-9]|[0-9]{2,}).*", |
185 | 151 | .topology = NULL, |
|
0 commit comments