use null tokenizer #13480

Merged (8 commits, May 12, 2025)
7 changes: 6 additions & 1 deletion scripts/performance/argument_parser.py
@@ -338,7 +338,6 @@ def list_of_strings(arg):
required=False,
default=None,
)

parser.add_argument(
"-cm",
"--custom_mounts",
@@ -347,5 +346,11 @@ def list_of_strings(arg):
required=False,
default=[],
)
parser.add_argument(
"--use_hf_tokenizer",
help="Use HuggingFace tokenizer. Disabled by default. Null tokenizer will be used if not provided.",
action="store_true",
required=False,
)

return parser
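
For reference, the new flag defaults to off, so the null-tokenizer path is taken unless the user opts in. A minimal standalone sketch of how the flag parses (plain argparse here, not the project's parse_cli_args helper):

import argparse

# Standalone sketch of the new flag; in the PR it is added inside the shared argument parser.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--use_hf_tokenizer",
    help="Use HuggingFace tokenizer. Disabled by default. Null tokenizer will be used if not provided.",
    action="store_true",
    required=False,
)

args = parser.parse_args([])                      # flag absent  -> False -> null tokenizer path
assert args.use_hf_tokenizer is False
args = parser.parse_args(["--use_hf_tokenizer"])  # flag present -> True  -> HuggingFace tokenizer path
assert args.use_hf_tokenizer is True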
8 changes: 7 additions & 1 deletion scripts/performance/llm/finetune_deepseek_v3.py
@@ -19,6 +19,7 @@
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.deepseek_v3 import finetune_recipe, model
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
@@ -108,7 +109,12 @@ def override_recipe_configs(
recipe.resume.restore_config = None

# data module configs
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=129280
)
recipe.model.tokenizer = recipe.data.tokenizer

if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
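
The same pattern repeats in the scripts below: when --use_hf_tokenizer is not passed, the recipe gets a null tokenizer built through get_nmt_tokenizer, with vocab_size pinned to the target model's vocabulary (129280 for DeepSeek V3) so the embedding table keeps its real size. A hedged sketch of what the run.Config above resolves to (assumes NeMo is installed; the reported vocab size may include extra reserved tokens added by the implementation):

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

# Same call the run.Config defers; useful for a quick local check.
null_tokenizer = get_nmt_tokenizer(
    library="null",
    model_name="NullTokenizer",
    vocab_size=129280,  # DeepSeek V3 value used in this PR
)
# The null tokenizer skips real text tokenization, which is fine for the
# synthetic/mock data used by these performance scripts.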
9 changes: 8 additions & 1 deletion scripts/performance/llm/finetune_llama31_405b.py
@@ -23,6 +23,7 @@
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -103,7 +104,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
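
Note that recipe.data.tokenizer is assigned a run.Config, not a constructed tokenizer: nemo_run defers the get_nmt_tokenizer call until the recipe is materialized, and the model is pointed at the same config object. A small sketch of that shared-config idea (nemo_run and NeMo assumed installed; nothing is instantiated here):

import nemo_run as run
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

# Deferred tokenizer construction; the factory runs only when the recipe is built.
tokenizer_cfg = run.Config(
    get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
# Assigning the same config to both keeps data and model tokenizers in sync,
# mirroring `recipe.model.tokenizer = recipe.data.tokenizer` in the diff.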
9 changes: 8 additions & 1 deletion scripts/performance/llm/finetune_llama3_70b.py
@@ -23,6 +23,7 @@
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -110,7 +111,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
9 changes: 8 additions & 1 deletion scripts/performance/llm/finetune_llama3_8b.py
@@ -18,6 +18,7 @@

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -96,7 +97,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
8 changes: 7 additions & 1 deletion scripts/performance/llm/pretrain_deepseek_v3.py
@@ -18,6 +18,7 @@
import nemo_run as run

from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
@@ -123,7 +124,12 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=129280
)
recipe.model.tokenizer = recipe.data.tokenizer

# compute dtype configs
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_gpt3_175b.py
@@ -25,6 +25,7 @@
userbuffers_fp8_b200_h12288_tp4_mbs1_seqlen2048,
userbuffers_fp8_h100_h12288_tp4_mbs1_seqlen2048,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -87,7 +88,13 @@ def override_recipe_configs(

gpu_type = args.gpu.lower()
# data module configs
recipe.data.tokenizer = hf_tokenizer("nvidia/megatron-gpt2-345m")
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer("nvidia/megatron-gpt2-345m")
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=51200
)
recipe.model.tokenizer = recipe.data.tokenizer

ub_cfg = {
"h100": {
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_llama31_405b.py
@@ -25,6 +25,7 @@
userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192,
userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -88,7 +89,13 @@ def override_recipe_configs(
gpu_type = args.gpu.lower()

# data module configs
recipe.data.tokenizer = hf_tokenizer("meta-llama/Llama-3.1-405B")
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer("meta-llama/Llama-3.1-405B")
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer

ub_cfg = {
"h100": {
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_llama3_70b.py
@@ -25,6 +25,7 @@
userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192,
userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -88,7 +89,13 @@ def override_recipe_configs(
gpu_type = args.gpu.lower()

# data module configs
recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-70B")
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-70B")
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer

ub_cfg = {
"h100": {
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_llama3_8b.py
@@ -17,6 +17,7 @@
import nemo_run as run

from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -72,7 +73,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-8B")
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-8B")
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer

return recipe

9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_llama4_e128.py
@@ -18,6 +18,7 @@

from nemo.collections.llm.recipes.llama4_e128 import pretrain_recipe
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -72,7 +73,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer('meta-llama/Llama-4-Scout-17B-16E-Instruct')
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer('meta-llama/Llama-4-Scout-17B-16E-Instruct')
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=202048
)
recipe.model.tokenizer = recipe.data.tokenizer

# compute dtype configs
if args.compute_dtype.lower() == "fp8":
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_llama4_e16.py
@@ -18,6 +18,7 @@

from nemo.collections.llm.recipes.llama4_e16 import pretrain_recipe
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -72,7 +73,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer('meta-llama/Llama-4-Scout-17B-16E-Instruct')
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer('meta-llama/Llama-4-Scout-17B-16E-Instruct')
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=202048
)
recipe.model.tokenizer = recipe.data.tokenizer

# compute dtype configs
if args.compute_dtype.lower() == "fp8":
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_mixtral_8x22b.py
@@ -18,6 +18,7 @@
import nemo_run as run

from nemo.collections.llm.recipes.mixtral_8x22b_64k import pretrain_recipe
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -80,7 +81,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer("mistralai/Mixtral-8x22B-v0.1")
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer("mistralai/Mixtral-8x22B-v0.1")
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=32000
)
recipe.model.tokenizer = recipe.data.tokenizer

# to mitigate the incorrect gradient_scaling_factor calculation in megatron.core
# under scenario average_in_collective=True and tp_size != etp_size, disabling average_in_collective.
9 changes: 8 additions & 1 deletion scripts/performance/llm/pretrain_mixtral_8x7b.py
@@ -18,6 +18,7 @@
import nemo_run as run

from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
@@ -74,7 +75,13 @@ def override_recipe_configs(
)

# data module configs
recipe.data.tokenizer = hf_tokenizer("mistralai/Mixtral-8x7B-v0.1")
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer("mistralai/Mixtral-8x7B-v0.1")
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=32000
)
recipe.model.tokenizer = recipe.data.tokenizer

return recipe

3 changes: 3 additions & 0 deletions scripts/performance/llm/pretrain_nemotron3_22b.py
@@ -24,6 +24,7 @@
from ..utils import (
args_sanity_check,
get_user_configs,
logging,
set_exp_logging_configs,
set_primary_perf_configs,
slurm_executor,
@@ -77,6 +78,8 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
logging.warning("HuggingFace tokenizer not supported for Nemotron3 22B. Using NullTokenizer.")
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000
)
4 changes: 4 additions & 0 deletions scripts/performance/llm/pretrain_nemotron3_8b.py
@@ -24,6 +24,7 @@
from ..utils import (
args_sanity_check,
get_user_configs,
logging,
set_exp_logging_configs,
set_primary_perf_configs,
slurm_executor,
@@ -71,9 +72,12 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
logging.warning("HuggingFace tokenizer not supported for Nemotron3 8B. Using NullTokenizer.")
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000
)
recipe.model.tokenizer = recipe.data.tokenizer

return recipe

4 changes: 4 additions & 0 deletions scripts/performance/llm/pretrain_nemotron4_15b.py
@@ -28,6 +28,7 @@
args_sanity_check,
get_comm_overlap_callback_idx,
get_user_configs,
logging,
set_exp_logging_configs,
set_primary_perf_configs,
slurm_executor,
@@ -77,9 +78,12 @@ def override_recipe_configs(
gpu_type = args.gpu.lower()

# data module configs
if args.use_hf_tokenizer:
logging.warning("HuggingFace tokenizer not supported for Nemotron4 15B. Using NullTokenizer.")
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000
)
recipe.model.tokenizer = recipe.data.tokenizer

comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
3 changes: 3 additions & 0 deletions scripts/performance/llm/pretrain_nemotron4_340b.py
@@ -31,6 +31,7 @@
args_sanity_check,
get_comm_overlap_callback_idx,
get_user_configs,
logging,
set_exp_logging_configs,
set_primary_perf_configs,
slurm_executor,
@@ -86,6 +87,8 @@ def override_recipe_configs(
gpu_type = args.gpu.lower()

# data module configs
if args.use_hf_tokenizer:
logging.warning("HuggingFace tokenizer not supported for Nemotron4 340B. Using NullTokenizer.")
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000
)
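
For quick reference, a summary of the null-tokenizer vocabulary sizes this PR wires into each recipe. This is a reference-only mapping collected from the diffs above; the scripts do not actually share such a table, and the dictionary name is hypothetical:

# Reference-only mapping; hypothetical helper, not part of the PR.
NULL_TOKENIZER_VOCAB_SIZES = {
    "deepseek_v3": 129280,
    "gpt3_175b": 51200,        # nvidia/megatron-gpt2-345m
    "llama3_8b": 128256,
    "llama3_70b": 128256,
    "llama3.1_405b": 128256,
    "llama4_e16": 202048,
    "llama4_e128": 202048,
    "mixtral_8x7b": 32000,
    "mixtral_8x22b": 32000,
    "nemotron3_8b": 256000,
    "nemotron3_22b": 256000,
    "nemotron4_15b": 256000,
    "nemotron4_340b": 256000,
}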