Adding more export tests #13410

Merged: 17 commits, May 6, 2025
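
This PR adds unit tests for the multimodal TensorRT-LLM engine builders (tests/export/multimodal/test_build.py) and for the SentencePieceTokenizer export wrapper (tests/export/test_sentencepiece_tokenizer.py).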
109 changes: 109 additions & 0 deletions tests/export/multimodal/test_build.py
@@ -0,0 +1,109 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import shutil
import tempfile
import unittest
from unittest.mock import MagicMock, patch

import pytest
import torch


@pytest.mark.run_only_on('GPU')
class TestBuild(unittest.TestCase):

@pytest.mark.run_only_on('GPU')
def setUp(self):
self.temp_dir = tempfile.mkdtemp()
self.mock_config = {
"mm_cfg": {
"vision_encoder": {
"from_pretrained": "test_model",
"hidden_size": 768,
},
"mm_mlp_adapter_type": "linear",
"hidden_size": 4096,
}
}
self.mock_weights = {
"model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector.weight": torch.randn(
4096, 768
),
"model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector.bias": torch.randn(4096),
}

@pytest.mark.run_only_on('GPU')
    def tearDown(self):
        # Remove the temporary directory and all of its contents
        shutil.rmtree(self.temp_dir, ignore_errors=True)

@pytest.mark.run_only_on('GPU')
@patch('nemo.export.multimodal.build.TensorRTLLM')
def test_build_trtllm_engine(self, mock_trtllm):
        # TensorRTLLM is patched above, so this checks the builder's call
        # wiring without constructing a real engine
mock_exporter = MagicMock()
mock_trtllm.return_value = mock_exporter

from nemo.export.multimodal.build import build_trtllm_engine

build_trtllm_engine(
model_dir=self.temp_dir,
visual_checkpoint_path="test_path",
model_type="neva",
tensor_parallelism_size=1,
max_input_len=256,
max_output_len=256,
max_batch_size=1,
max_multimodal_len=1024,
dtype="bfloat16",
)

mock_exporter.export.assert_called_once()

@pytest.mark.run_only_on('GPU')
@patch('nemo.export.multimodal.build.MLLaMAForCausalLM')
@patch('nemo.export.multimodal.build.build_trtllm')
def test_build_mllama_trtllm_engine(self, mock_build_trtllm, mock_mllama):
        # Both the HF model load and the TRT-LLM build call are mocked; this
        # checks the wiring between them
mock_model = MagicMock()
mock_mllama.from_hugging_face.return_value = mock_model
mock_build_trtllm.return_value = MagicMock()

from nemo.export.multimodal.build import build_mllama_trtllm_engine

build_mllama_trtllm_engine(
model_dir=self.temp_dir,
hf_model_path="test_path",
tensor_parallelism_size=1,
max_input_len=256,
max_output_len=256,
max_batch_size=1,
max_multimodal_len=1024,
dtype="bfloat16",
)

mock_mllama.from_hugging_face.assert_called_once()
mock_build_trtllm.assert_called_once()


if __name__ == '__main__':
unittest.main()
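
Both tests above follow the same pattern: patch the heavyweight dependency at the site where nemo.export.multimodal.build resolves it, then assert on the calls the mock recorded, so nothing expensive actually runs. A self-contained sketch of that pattern with toy names (Engine and build here are illustrative stand-ins, not symbols from this PR):

from unittest.mock import MagicMock, patch

class Engine:
    """Toy stand-in for a heavyweight dependency such as TensorRTLLM."""

    def export(self, out_dir):
        raise RuntimeError("would build a real engine")

def build(out_dir):
    """Toy stand-in for build_trtllm_engine: construct an engine and export it."""
    Engine().export(out_dir)

# Patch the class at the site where build() looks it up (this module's own
# namespace), so no real export runs, then assert on the recorded call.
with patch(f"{__name__}.Engine") as mock_engine_cls:
    mock_engine_cls.return_value = MagicMock()
    build("/tmp/out")
    mock_engine_cls.return_value.export.assert_called_once_with("/tmp/out")

Patching at the lookup site (the importing module, not the defining one) is what makes decorator targets like 'nemo.export.multimodal.build.TensorRTLLM' in the tests above take effect.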
192 changes: 192 additions & 0 deletions tests/export/test_sentencepiece_tokenizer.py
@@ -0,0 +1,192 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import shutil
import tempfile
import unittest
from unittest.mock import MagicMock

import numpy as np
import sentencepiece
import torch

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer


class TestSentencePieceTokenizer(unittest.TestCase):
@classmethod
def setUpClass(cls):
# Create a temporary directory for test files
cls.test_dir = tempfile.mkdtemp()

# Create a simple sentencepiece model for testing
with open(os.path.join(cls.test_dir, "test.txt"), "w") as f:
f.write("Hello world\nThis is a test\n")

# Train a simple sentencepiece model
sentencepiece.SentencePieceTrainer.Train(
f'--input={os.path.join(cls.test_dir, "test.txt")} '
f'--model_prefix={os.path.join(cls.test_dir, "test_model")} '
'--vocab_size=55 --model_type=bpe'
)

cls.model_path = os.path.join(cls.test_dir, "test_model.model")

@classmethod
def tearDownClass(cls):
        # Clean up temporary files
        shutil.rmtree(cls.test_dir)

def setUp(self):
self.tokenizer = SentencePieceTokenizer(model_path=self.model_path)

def test_initialization(self):
# Test initialization with model path
tokenizer = SentencePieceTokenizer(model_path=self.model_path)
self.assertIsNotNone(tokenizer.tokenizer)
self.assertEqual(tokenizer.original_vocab_size, tokenizer.vocab_size)

# Test initialization with invalid model path
with self.assertRaises(ValueError):
SentencePieceTokenizer(model_path="nonexistent.model")

# Test initialization with both model_path and tokenizer
mock_tokenizer = MagicMock()
with self.assertRaises(ValueError):
SentencePieceTokenizer(model_path=self.model_path, tokenizer=mock_tokenizer)

# Test initialization with neither model_path nor tokenizer
with self.assertRaises(ValueError):
SentencePieceTokenizer()

def test_text_to_tokens(self):
text = "Hello world"
tokens = self.tokenizer.text_to_tokens(text)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))

def test_encode(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
self.assertIsInstance(ids, list)
self.assertTrue(all(isinstance(i, int) for i in ids))

def test_tokens_to_text(self):
text = "Hello world"
tokens = self.tokenizer.text_to_tokens(text)
reconstructed_text = self.tokenizer.tokens_to_text(tokens)
self.assertIsInstance(reconstructed_text, str)
self.assertNotEqual(reconstructed_text, "") # Should not be empty

def test_batch_decode(self):
text = "Hello world"
ids = self.tokenizer.encode(text)

# Test with list
decoded_text = self.tokenizer.batch_decode(ids)
self.assertIsInstance(decoded_text, str)

# Test with numpy array
ids_np = np.array(ids)
decoded_text_np = self.tokenizer.batch_decode(ids_np)
self.assertIsInstance(decoded_text_np, str)

# Test with torch tensor
ids_torch = torch.tensor(ids)
decoded_text_torch = self.tokenizer.batch_decode(ids_torch)
self.assertIsInstance(decoded_text_torch, str)

def test_token_to_id(self):
text = "Hello"
tokens = self.tokenizer.text_to_tokens(text)
token_id = self.tokenizer.token_to_id(tokens[0])
self.assertIsInstance(token_id, int)

def test_ids_to_tokens(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
tokens = self.tokenizer.ids_to_tokens(ids)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))

def test_tokens_to_ids(self):
text = "Hello"
tokens = self.tokenizer.text_to_tokens(text)
ids = self.tokenizer.tokens_to_ids(tokens)
self.assertIsInstance(ids, list)
self.assertTrue(all(isinstance(i, int) for i in ids))

def test_legacy_mode(self):
special_tokens = ["[PAD]", "[BOS]", "[EOS]"]
tokenizer = SentencePieceTokenizer(model_path=self.model_path, special_tokens=special_tokens, legacy=True)

# Test adding special tokens
self.assertGreater(tokenizer.vocab_size, tokenizer.original_vocab_size)

# Test special token encoding
text = "Hello [PAD] world"
tokens = tokenizer.text_to_tokens(text)
self.assertIn("[PAD]", tokens)

# Test special token decoding
ids = tokenizer.encode(text)
decoded_text = tokenizer.batch_decode(ids)
self.assertIn("[PAD]", decoded_text)

def test_properties(self):
# Test pad_id property
self.assertIsInstance(self.tokenizer.pad_id, int)

# Test bos_token_id property
self.assertIsInstance(self.tokenizer.bos_token_id, int)

# Test eos_token_id property
self.assertIsInstance(self.tokenizer.eos_token_id, int)

# Test unk_id property
self.assertIsInstance(self.tokenizer.unk_id, int)

def test_vocab_property(self):
vocab = self.tokenizer.vocab
self.assertIsInstance(vocab, list)
self.assertTrue(all(isinstance(t, str) for t in vocab))

def test_convert_ids_to_tokens(self):
text = "Hello world"
ids = self.tokenizer.encode(text)
tokens = self.tokenizer.convert_ids_to_tokens(ids)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))

def test_convert_tokens_to_string(self):
text = "Hello world"
tokens = self.tokenizer.text_to_tokens(text)
string = self.tokenizer.convert_tokens_to_string(tokens)
self.assertIsInstance(string, str)

def test_len(self):
self.assertEqual(len(self.tokenizer), self.tokenizer.vocab_size)

def test_is_fast(self):
self.assertTrue(self.tokenizer.is_fast)

def test_get_added_vocab(self):
self.assertIsNone(self.tokenizer.get_added_vocab())


if __name__ == '__main__':
unittest.main()
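
For reference, the API surface these tests pin down reduces to a short round trip. A minimal usage sketch, assuming a trained SentencePiece model file on disk (the path below is a placeholder):

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer

# Placeholder path; any trained SentencePiece .model file works here.
tokenizer = SentencePieceTokenizer(model_path="test_model.model")

ids = tokenizer.encode("Hello world")    # list of int token ids
tokens = tokenizer.ids_to_tokens(ids)    # list of str subword pieces
text = tokenizer.batch_decode(ids)       # decoded back to a plain string
assert isinstance(text, str)

# Legacy mode extends the vocabulary with user-defined special tokens,
# as exercised by test_legacy_mode above.
special = SentencePieceTokenizer(
    model_path="test_model.model",
    special_tokens=["[PAD]", "[BOS]", "[EOS]"],
    legacy=True,
)
assert special.vocab_size > special.original_vocab_size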