From caebf619be0e496cca8e8742d4346b2c3ea740c7 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Thu, 20 Feb 2025 17:08:53 -0800 Subject: [PATCH 1/2] Support O3 OpenAI model mapping --- src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs | 2 ++ test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 28e272e267..54d4daa9e3 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1026,6 +1026,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo [ // chat ( "o1-", ModelEncoding.O200kBase ), // e.g. o1-mini + ( "o3-", ModelEncoding.O200kBase ), // e.g. o3-mini ( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13 ( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k ( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc. @@ -1038,6 +1039,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo // chat { "gpt-4o", ModelEncoding.O200kBase }, { "o1", ModelEncoding.O200kBase }, + { "o3", ModelEncoding.O200kBase }, { "gpt-4", ModelEncoding.Cl100kBase }, { "gpt-3.5-turbo", ModelEncoding.Cl100kBase }, { "gpt-3.5-turbo-16k", ModelEncoding.Cl100kBase }, diff --git a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 1e7cad6890..0d09a66b05 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -394,6 +394,10 @@ public void TestEncodeR50kBase() [Theory] [InlineData("o1")] [InlineData("o1-")] + [InlineData("o1-mimi")] + [InlineData("o3")] + [InlineData("o3-")] + [InlineData("o3-mini")] [InlineData("gpt-4o")] [InlineData("gpt-4o-")] [InlineData("gpt-4")] @@ -496,6 +500,7 @@ public void TestEncodingNamesNegativeCases() [InlineData("gpt-4")] [InlineData("gpt-4o")] [InlineData("o1")] + [InlineData("o3")] [InlineData("text-davinci-003")] [InlineData("text-curie-001")] [InlineData("text-davinci-edit-001")] From 7fd7d0765fd692d114eaea38dfc146b2840e4ffd Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed <10833894+tarekgh@users.noreply.github.com> Date: Thu, 20 Feb 2025 17:14:04 -0800 Subject: [PATCH 2/2] Update test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 0d09a66b05..6333723e7d 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -394,7 +394,7 @@ public void TestEncodeR50kBase() [Theory] [InlineData("o1")] [InlineData("o1-")] - [InlineData("o1-mimi")] + [InlineData("o1-mini")] [InlineData("o3")] [InlineData("o3-")] [InlineData("o3-mini")]