@@ -951,68 +951,76 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
951951
952952 nccl_ofi_tuner_point_t extended_tree_ll =
953953 extend_region ((nccl_ofi_tuner_point_t ){393216 , 16 },
954- (nccl_ofi_tuner_point_t ){393216 , 1024 },
955- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
954+ (nccl_ofi_tuner_point_t ){393216 , 1024 },
955+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
956956 nccl_ofi_tuner_point_t extended_tree_ll128 =
957- extend_region ((nccl_ofi_tuner_point_t ){131596288 , 1024 },
958- (nccl_ofi_tuner_point_t ){144179200 , 2048 },
959- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
957+ extend_region ((nccl_ofi_tuner_point_t ){96993280 , 1024 },
958+ (nccl_ofi_tuner_point_t ){106430464 , 2048 },
959+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
960960 nccl_ofi_tuner_point_t extended_nvlstree_simple =
961961 extend_region ((nccl_ofi_tuner_point_t ){10737418240 , 512 },
962- (nccl_ofi_tuner_point_t ){34359738368 , 1500 },
963- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
962+ (nccl_ofi_tuner_point_t ){34359738368 , 1500 },
963+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
964964
965965 const nccl_ofi_tuner_region_t regions[] = {
966- {.algorithm = NCCL_ALGO_TREE,
967- .protocol = NCCL_PROTO_LL,
968- .num_vertices = 4 ,
969- .vertices = {{0 , 16 }, {393216 , 16 }, extended_tree_ll, {0 , TUNER_MAX_RANKS}}},
970- {.algorithm = NCCL_ALGO_TREE,
971- .protocol = NCCL_PROTO_LL128,
972- .num_vertices = 9 ,
973- .vertices = {
966+ {.algorithm = NCCL_ALGO_TREE,
967+ .protocol = NCCL_PROTO_LL,
968+ .num_vertices = 4 ,
969+ .vertices = {{0 , 16 }, {393216 , 16 }, extended_tree_ll, {0 , TUNER_MAX_RANKS}}},
970+ {.algorithm = NCCL_ALGO_TREE,
971+ .protocol = NCCL_PROTO_LL128,
972+ .num_vertices = 10 ,
973+ .vertices = {
974974 extended_tree_ll,
975975 {393216 , 16 },
976- {12058624 , 16 },
977- {24641536 , 32 },
978- {59244544 , 64 },
979- {90701824 , 128 },
980- {131596288 , 1024 },
981- {144179200 , 2048 },
976+ {4718592 , 16 },
977+ {18350080 , 32 },
978+ {40370176 , 64 },
979+ {57147392 , 128 },
980+ {72876032 , 256 },
981+ {96993280 , 1024 },
982+ {106430464 , 2048 },
982983 extended_tree_ll128}},
983- {.algorithm = NCCL_ALGO_RING,
984- .protocol = NCCL_PROTO_LL128,
985- .num_vertices = 5 ,
986- .vertices = {
984+ {.algorithm = NCCL_ALGO_RING,
985+ .protocol = NCCL_PROTO_LL128,
986+ .num_vertices = 5 ,
987+ .vertices = {
987988 {90701824 , 128 },
988- {59244544 , 64 },
989- {24641536 , 32 },
989+ {50855936 , 64 },
990+ {18350080 , 32 },
990991 {133693440 , 32 },
991992 {120061952 , 64 }}},
992- {.algorithm = NCCL_ALGO_NVLS_TREE,
993- .protocol = NCCL_PROTO_SIMPLE,
994- .num_vertices = 13 ,
995- .vertices = {
993+ {.algorithm = NCCL_ALGO_NVLS_TREE,
994+ .protocol = NCCL_PROTO_SIMPLE,
995+ .num_vertices = 19 ,
996+ .vertices = {
996997 extended_tree_ll128,
997- {144179200 , 2048 },
998- {131596288 , 1024 },
998+ {106430464 , 2048 },
999+ {96993280 , 1024 },
1000+ {72876032 , 256 },
1001+ {57147392 , 128 },
1002+ {40370176 , 64 },
1003+ {18350080 , 32 },
1004+ {50855936 , 64 },
9991005 {90701824 , 128 },
10001006 {120061952 , 64 },
10011007 {133693440 , 32 },
1002- {24641536 , 32 },
1003- {12058624 , 16 },
1008+ {18350080 , 32 },
1009+ {4718592 , 16 },
10041010 {TUNER_MAX_SIZE, 16 },
1005- {468713472 , 32 },
1011+ {435159040 , 32 },
1012+ {1072693248 , 64 },
10061013 {10737418240 , 512 },
10071014 {34359738368 , 1500 },
10081015 extended_nvlstree_simple}},
1009- {.algorithm = NCCL_ALGO_RING,
1010- .protocol = NCCL_PROTO_SIMPLE,
1011- .num_vertices = 4 ,
1012- .vertices = {
1016+ {.algorithm = NCCL_ALGO_RING,
1017+ .protocol = NCCL_PROTO_SIMPLE,
1018+ .num_vertices = 5 ,
1019+ .vertices = {
10131020 extended_nvlstree_simple,
10141021 {10737418240 , 512 },
1015- {468713472 , 32 },
1022+ {1072693248 , 64 },
1023+ {435159040 , 32 },
10161024 {TUNER_MAX_SIZE, 16 }}}};
10171025 ret = set_regions (region_ctx, collType, sizeof (regions) / sizeof (regions[0 ]), regions);
10181026 if (ret != ncclSuccess) {
@@ -1024,43 +1032,43 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
10241032
10251033 nccl_ofi_tuner_point_t extended_ring_ll =
10261034 extend_region ((nccl_ofi_tuner_point_t ){74973184 , 1024 },
1027- (nccl_ofi_tuner_point_t ){209190912 , 2048 },
1028- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1035+ (nccl_ofi_tuner_point_t ){213385216 , 2048 },
1036+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10291037 nccl_ofi_tuner_point_t extended_ring_ll128 =
10301038 extend_region ((nccl_ofi_tuner_point_t ){8321499136 , 512 },
1031- (nccl_ofi_tuner_point_t ){32212254720 , 2048 },
1032- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1039+ (nccl_ofi_tuner_point_t ){32212254720 , 2048 },
1040+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10331041
10341042 const nccl_ofi_tuner_region_t regions[] = {
10351043 {.algorithm = NCCL_ALGO_RING,
10361044 .protocol = NCCL_PROTO_LL,
10371045 .num_vertices = 10 ,
10381046 .vertices = {
10391047 {0 , 16 },
1040- {196608 , 16 },
1041- {196608 , 32 },
1042- {393216 , 64 },
1043- {8912896 , 128 },
1048+ {786432 , 16 },
1049+ {1572864 , 32 },
1050+ {2621440 , 64 },
1051+ {4718592 , 128 },
10441052 {17301504 , 256 },
10451053 {74973184 , 1024 },
1046- {209190912 , 2048 },
1054+ {213385216 , 2048 },
10471055 extended_ring_ll,
10481056 {0 , TUNER_MAX_RANKS}}},
10491057 {.algorithm = NCCL_ALGO_RING,
10501058 .protocol = NCCL_PROTO_LL128,
10511059 .num_vertices = 14 ,
10521060 .vertices = {
10531061 extended_ring_ll,
1054- {209190912 , 2048 },
1062+ {213385216 , 2048 },
10551063 {74973184 , 1024 },
10561064 {17301504 , 256 },
1057- {8912896 , 128 },
1058- {393216 , 64 },
1059- {196608 , 32 },
1060- {196608 , 16 },
1061- {234356736 , 16 },
1062- {374341632 , 32 },
1063- {829423616 , 64 },
1065+ {4718592 , 128 },
1066+ {2621440 , 64 },
1067+ {1572864 , 32 },
1068+ {786432 , 16 },
1069+ {198705152 , 16 },
1070+ {456130560 , 32 },
1071+ {871366656 , 64 },
10641072 {8321499136 , 512 },
10651073 {32212254720 , 2048 },
10661074 extended_ring_ll128}},
@@ -1071,11 +1079,10 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
10711079 extended_ring_ll128,
10721080 {32212254720 , 2048 },
10731081 {8321499136 , 512 },
1074- {829423616 , 64 },
1075- {374341632 , 32 },
1076- {234356736 , 16 },
1082+ {871366656 , 64 },
1083+ {456130560 , 32 },
1084+ {198705152 , 16 },
10771085 {TUNER_MAX_SIZE, 16 }}}};
1078-
10791086 ret = set_regions (region_ctx, collType, sizeof (regions) / sizeof (regions[0 ]), regions);
10801087 if (ret != ncclSuccess) {
10811088 goto exit;
@@ -1085,44 +1092,46 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
10851092 collType = ncclFuncReduceScatter;
10861093
10871094 nccl_ofi_tuner_point_t extended_ring_ll =
1088- extend_region ((nccl_ofi_tuner_point_t ){78118912 , 1024 },
1089- (nccl_ofi_tuner_point_t ){217579520 , 2048 },
1090- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1095+ extend_region ((nccl_ofi_tuner_point_t ){73924608 , 1024 },
1096+ (nccl_ofi_tuner_point_t ){209190912 , 2048 },
1097+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10911098 nccl_ofi_tuner_point_t extended_ring_ll128 =
10921099 extend_region ((nccl_ofi_tuner_point_t ){4294967296 , 256 },
1093- (nccl_ofi_tuner_point_t ){8589934592 , 512 },
1094- (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1100+ (nccl_ofi_tuner_point_t ){8589934592 , 512 },
1101+ (nccl_ofi_tuner_point_t ){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10951102
10961103 const nccl_ofi_tuner_region_t regions[] = {
10971104 {.algorithm = NCCL_ALGO_RING,
10981105 .protocol = NCCL_PROTO_LL,
1099- .num_vertices = 10 ,
1106+ .num_vertices = 11 ,
11001107 .vertices = {
11011108 {0 , 16 },
1102- {393216 , 16 },
1103- {196608 , 32 },
1104- {393216 , 64 },
1105- {1572864 , 128 },
1109+ {786432 , 16 },
1110+ {1572864 , 32 },
1111+ {2621440 , 64 },
1112+ {4718592 , 128 },
11061113 {17301504 , 256 },
1107- {78118912 , 1024 },
1108- {217579520 , 2048 },
1114+ {35127296 , 512 },
1115+ {73924608 , 1024 },
1116+ {209190912 , 2048 },
11091117 extended_ring_ll,
11101118 {0 , TUNER_MAX_RANKS}}},
11111119 {.algorithm = NCCL_ALGO_RING,
11121120 .protocol = NCCL_PROTO_LL128,
1113- .num_vertices = 14 ,
1121+ .num_vertices = 15 ,
11141122 .vertices = {
11151123 extended_ring_ll,
1116- {217579520 , 2048 },
1117- {78118912 , 1024 },
1124+ {209190912 , 2048 },
1125+ {73924608 , 1024 },
1126+ {35127296 , 512 },
11181127 {17301504 , 256 },
1119- {1572864 , 128 },
1120- {393216 , 64 },
1121- {196608 , 32 },
1122- {393216 , 16 },
1123- {187170816 , 16 },
1124- {472907776 , 32 },
1125- {594542592 , 64 },
1128+ {4718592 , 128 },
1129+ {2621440 , 64 },
1130+ {1572864 , 32 },
1131+ {786432 , 16 },
1132+ {219676672 , 16 },
1133+ {508559360 , 32 },
1134+ {592445440 , 64 },
11261135 {4294967296 , 256 },
11271136 {8589934592 , 512 },
11281137 extended_ring_ll128}},
@@ -1132,11 +1141,10 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
11321141 .vertices = {
11331142 extended_ring_ll128,
11341143 {4294967296 , 256 },
1135- {594542592 , 64 },
1136- {472907776 , 32 },
1137- {187170816 , 16 },
1144+ {592445440 , 64 },
1145+ {508559360 , 32 },
1146+ {219676672 , 16 },
11381147 {TUNER_MAX_SIZE, 16 }}}};
1139-
11401148 ret = set_regions (region_ctx, collType, sizeof (regions) / sizeof (regions[0 ]), regions);
11411149 if (ret != ncclSuccess) {
11421150 goto exit;
0 commit comments