Skip to content

Commit 3ac9392

Browse files
committed
tuner: update all 0x0 p6 tuner regions.
This change updates only the tuner regions for 0x0 collectives on p6. Signed-off-by: Feng Ji <[email protected]>
1 parent 817a7a4 commit 3ac9392

File tree

1 file changed

+96
-88
lines changed

1 file changed

+96
-88
lines changed

src/tuner/nccl_ofi_regions.cpp

Lines changed: 96 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -951,68 +951,76 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
951951

952952
nccl_ofi_tuner_point_t extended_tree_ll =
953953
extend_region((nccl_ofi_tuner_point_t){393216, 16},
954-
(nccl_ofi_tuner_point_t){393216, 1024},
955-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
954+
(nccl_ofi_tuner_point_t){393216, 1024},
955+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
956956
nccl_ofi_tuner_point_t extended_tree_ll128 =
957-
extend_region((nccl_ofi_tuner_point_t){131596288, 1024},
958-
(nccl_ofi_tuner_point_t){144179200, 2048},
959-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
957+
extend_region((nccl_ofi_tuner_point_t){96993280, 1024},
958+
(nccl_ofi_tuner_point_t){106430464, 2048},
959+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
960960
nccl_ofi_tuner_point_t extended_nvlstree_simple =
961961
extend_region((nccl_ofi_tuner_point_t){10737418240, 512},
962-
(nccl_ofi_tuner_point_t){34359738368, 1500},
963-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
962+
(nccl_ofi_tuner_point_t){34359738368, 1500},
963+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
964964

965965
const nccl_ofi_tuner_region_t regions[] = {
966-
{.algorithm = NCCL_ALGO_TREE,
967-
.protocol = NCCL_PROTO_LL,
968-
.num_vertices = 4,
969-
.vertices = {{0, 16}, {393216, 16}, extended_tree_ll, {0, TUNER_MAX_RANKS}}},
970-
{.algorithm = NCCL_ALGO_TREE,
971-
.protocol = NCCL_PROTO_LL128,
972-
.num_vertices = 9,
973-
.vertices = {
966+
{.algorithm = NCCL_ALGO_TREE,
967+
.protocol = NCCL_PROTO_LL,
968+
.num_vertices = 4,
969+
.vertices = {{0, 16}, {393216, 16}, extended_tree_ll, {0, TUNER_MAX_RANKS}}},
970+
{.algorithm = NCCL_ALGO_TREE,
971+
.protocol = NCCL_PROTO_LL128,
972+
.num_vertices = 10,
973+
.vertices = {
974974
extended_tree_ll,
975975
{393216, 16},
976-
{12058624, 16},
977-
{24641536, 32},
978-
{59244544, 64},
979-
{90701824, 128},
980-
{131596288, 1024},
981-
{144179200, 2048},
976+
{4718592, 16},
977+
{18350080, 32},
978+
{40370176, 64},
979+
{57147392, 128},
980+
{72876032, 256},
981+
{96993280, 1024},
982+
{106430464, 2048},
982983
extended_tree_ll128}},
983-
{.algorithm = NCCL_ALGO_RING,
984-
.protocol = NCCL_PROTO_LL128,
985-
.num_vertices = 5,
986-
.vertices = {
984+
{.algorithm = NCCL_ALGO_RING,
985+
.protocol = NCCL_PROTO_LL128,
986+
.num_vertices = 5,
987+
.vertices = {
987988
{90701824, 128},
988-
{59244544, 64},
989-
{24641536, 32},
989+
{50855936, 64},
990+
{18350080, 32},
990991
{133693440, 32},
991992
{120061952, 64}}},
992-
{.algorithm = NCCL_ALGO_NVLS_TREE,
993-
.protocol = NCCL_PROTO_SIMPLE,
994-
.num_vertices = 13,
995-
.vertices = {
993+
{.algorithm = NCCL_ALGO_NVLS_TREE,
994+
.protocol = NCCL_PROTO_SIMPLE,
995+
.num_vertices = 19,
996+
.vertices = {
996997
extended_tree_ll128,
997-
{144179200, 2048},
998-
{131596288, 1024},
998+
{106430464, 2048},
999+
{96993280, 1024},
1000+
{72876032, 256},
1001+
{57147392, 128},
1002+
{40370176, 64},
1003+
{18350080, 32},
1004+
{50855936, 64},
9991005
{90701824, 128},
10001006
{120061952, 64},
10011007
{133693440, 32},
1002-
{24641536, 32},
1003-
{12058624, 16},
1008+
{18350080, 32},
1009+
{4718592, 16},
10041010
{TUNER_MAX_SIZE, 16},
1005-
{468713472, 32},
1011+
{435159040, 32},
1012+
{1072693248, 64},
10061013
{10737418240, 512},
10071014
{34359738368, 1500},
10081015
extended_nvlstree_simple}},
1009-
{.algorithm = NCCL_ALGO_RING,
1010-
.protocol = NCCL_PROTO_SIMPLE,
1011-
.num_vertices = 4,
1012-
.vertices = {
1016+
{.algorithm = NCCL_ALGO_RING,
1017+
.protocol = NCCL_PROTO_SIMPLE,
1018+
.num_vertices = 5,
1019+
.vertices = {
10131020
extended_nvlstree_simple,
10141021
{10737418240, 512},
1015-
{468713472, 32},
1022+
{1072693248, 64},
1023+
{435159040, 32},
10161024
{TUNER_MAX_SIZE, 16}}}};
10171025
ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
10181026
if (ret != ncclSuccess) {
@@ -1024,43 +1032,43 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
10241032

10251033
nccl_ofi_tuner_point_t extended_ring_ll =
10261034
extend_region((nccl_ofi_tuner_point_t){74973184, 1024},
1027-
(nccl_ofi_tuner_point_t){209190912, 2048},
1028-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1035+
(nccl_ofi_tuner_point_t){213385216, 2048},
1036+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10291037
nccl_ofi_tuner_point_t extended_ring_ll128 =
10301038
extend_region((nccl_ofi_tuner_point_t){8321499136, 512},
1031-
(nccl_ofi_tuner_point_t){32212254720, 2048},
1032-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1039+
(nccl_ofi_tuner_point_t){32212254720, 2048},
1040+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10331041

10341042
const nccl_ofi_tuner_region_t regions[] = {
10351043
{.algorithm = NCCL_ALGO_RING,
10361044
.protocol = NCCL_PROTO_LL,
10371045
.num_vertices = 10,
10381046
.vertices = {
10391047
{0, 16},
1040-
{196608, 16},
1041-
{196608, 32},
1042-
{393216, 64},
1043-
{8912896, 128},
1048+
{786432, 16},
1049+
{1572864, 32},
1050+
{2621440, 64},
1051+
{4718592, 128},
10441052
{17301504, 256},
10451053
{74973184, 1024},
1046-
{209190912, 2048},
1054+
{213385216, 2048},
10471055
extended_ring_ll,
10481056
{0, TUNER_MAX_RANKS}}},
10491057
{.algorithm = NCCL_ALGO_RING,
10501058
.protocol = NCCL_PROTO_LL128,
10511059
.num_vertices = 14,
10521060
.vertices = {
10531061
extended_ring_ll,
1054-
{209190912, 2048},
1062+
{213385216, 2048},
10551063
{74973184, 1024},
10561064
{17301504, 256},
1057-
{8912896, 128},
1058-
{393216, 64},
1059-
{196608, 32},
1060-
{196608, 16},
1061-
{234356736, 16},
1062-
{374341632, 32},
1063-
{829423616, 64},
1065+
{4718592, 128},
1066+
{2621440, 64},
1067+
{1572864, 32},
1068+
{786432, 16},
1069+
{198705152, 16},
1070+
{456130560, 32},
1071+
{871366656, 64},
10641072
{8321499136, 512},
10651073
{32212254720, 2048},
10661074
extended_ring_ll128}},
@@ -1071,11 +1079,10 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
10711079
extended_ring_ll128,
10721080
{32212254720, 2048},
10731081
{8321499136, 512},
1074-
{829423616, 64},
1075-
{374341632, 32},
1076-
{234356736, 16},
1082+
{871366656, 64},
1083+
{456130560, 32},
1084+
{198705152, 16},
10771085
{TUNER_MAX_SIZE, 16}}}};
1078-
10791086
ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
10801087
if (ret != ncclSuccess) {
10811088
goto exit;
@@ -1085,44 +1092,46 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
10851092
collType = ncclFuncReduceScatter;
10861093

10871094
nccl_ofi_tuner_point_t extended_ring_ll =
1088-
extend_region((nccl_ofi_tuner_point_t){78118912, 1024},
1089-
(nccl_ofi_tuner_point_t){217579520, 2048},
1090-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1095+
extend_region((nccl_ofi_tuner_point_t){73924608, 1024},
1096+
(nccl_ofi_tuner_point_t){209190912, 2048},
1097+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10911098
nccl_ofi_tuner_point_t extended_ring_ll128 =
10921099
extend_region((nccl_ofi_tuner_point_t){4294967296, 256},
1093-
(nccl_ofi_tuner_point_t){8589934592, 512},
1094-
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
1100+
(nccl_ofi_tuner_point_t){8589934592, 512},
1101+
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
10951102

10961103
const nccl_ofi_tuner_region_t regions[] = {
10971104
{.algorithm = NCCL_ALGO_RING,
10981105
.protocol = NCCL_PROTO_LL,
1099-
.num_vertices = 10,
1106+
.num_vertices = 11,
11001107
.vertices = {
11011108
{0, 16},
1102-
{393216, 16},
1103-
{196608, 32},
1104-
{393216, 64},
1105-
{1572864, 128},
1109+
{786432, 16},
1110+
{1572864, 32},
1111+
{2621440, 64},
1112+
{4718592, 128},
11061113
{17301504, 256},
1107-
{78118912, 1024},
1108-
{217579520, 2048},
1114+
{35127296, 512},
1115+
{73924608, 1024},
1116+
{209190912, 2048},
11091117
extended_ring_ll,
11101118
{0, TUNER_MAX_RANKS}}},
11111119
{.algorithm = NCCL_ALGO_RING,
11121120
.protocol = NCCL_PROTO_LL128,
1113-
.num_vertices = 14,
1121+
.num_vertices = 15,
11141122
.vertices = {
11151123
extended_ring_ll,
1116-
{217579520, 2048},
1117-
{78118912, 1024},
1124+
{209190912, 2048},
1125+
{73924608, 1024},
1126+
{35127296, 512},
11181127
{17301504, 256},
1119-
{1572864, 128},
1120-
{393216, 64},
1121-
{196608, 32},
1122-
{393216, 16},
1123-
{187170816, 16},
1124-
{472907776, 32},
1125-
{594542592, 64},
1128+
{4718592, 128},
1129+
{2621440, 64},
1130+
{1572864, 32},
1131+
{786432, 16},
1132+
{219676672, 16},
1133+
{508559360, 32},
1134+
{592445440, 64},
11261135
{4294967296, 256},
11271136
{8589934592, 512},
11281137
extended_ring_ll128}},
@@ -1132,11 +1141,10 @@ static ncclResult_t region_init_internal_p6(nccl_ofi_tuner_region_context_t *reg
11321141
.vertices = {
11331142
extended_ring_ll128,
11341143
{4294967296, 256},
1135-
{594542592, 64},
1136-
{472907776, 32},
1137-
{187170816, 16},
1144+
{592445440, 64},
1145+
{508559360, 32},
1146+
{219676672, 16},
11381147
{TUNER_MAX_SIZE, 16}}}};
1139-
11401148
ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
11411149
if (ret != ncclSuccess) {
11421150
goto exit;

0 commit comments

Comments
 (0)