Skip to content

Phi3Tokenizer produces garbled Chinese characters in streamed output #7440

Open
@williamlzw

Description

@williamlzw

Microsoft.ML.GenAI.Phi 0.23.0-preview.1.25127.4
Microsoft.ML.Tokenizers 2.0.0-preview.1.25127.4
Microsoft.ML.Tokenizers.Data.Cl100kBase 2.0.0-preview.1.25127.4

/// <summary>
/// Reproduction: loads a local Phi-3 model and tokenizer, sends a Chinese
/// system prompt plus a Chinese user question, and writes the streamed
/// response text to the console, followed by a blank line and "End!".
/// </summary>
/// <remarks>
/// NOTE(review): this is <c>async void</c>, so callers cannot await it or observe
/// its exceptions; the signature is kept exactly as reported in the issue.
/// </remarks>
public static async void Test2()
{
    const string device = "cuda";
    var weightFolder = @"D:\model\Phi-3-mini-128k-instruct";

    // Model is loaded before the tokenizer, same order as the original repro.
    // NOTE(review): layersOnTargetDevice: -1 presumably means "all layers" — confirm.
    var phi3 = Phi3ForCasualLM.FromPretrained(
        weightFolder,
        "config.json",
        layersOnTargetDevice: -1,
        quantizeToInt4: true,
        targetDevice: device);

    var tokenizer = Phi3TokenizerHelper.FromPretrained(Path.Join(weightFolder, "tokenizer.model"));

    var chatClient = new Phi3CausalLMChatClient(
        new CausalLMPipeline<Tokenizer, Phi3ForCasualLM>(tokenizer, phi3, device));

    // User question, kept as the same raw string literal used in the report.
    var question = """
        你能讲一个有趣的笑话吗?
        """;

    // Conversation: system instruction first, then the user question.
    List<ChatMessage> history =
    [
        new ChatMessage(ChatRole.System, "你是一个助手,用中文回答用户的问题"),
        new ChatMessage(ChatRole.User, question),
    ];

    var chatOptions = new ChatOptions
    {
        StopSequences = ["<|end_of_text|>"],//phi3
        AdditionalProperties = new() { { "max_length", 2048 } },
    };

    // Print each streamed update as it arrives, without line breaks in between.
    await foreach (var update in chatClient.GetStreamingResponseAsync(history, chatOptions))
    {
        Console.Write(update.Text);
    }

    Console.WriteLine();
    Console.WriteLine("End!");
}

output:
当然可以。有一位老???问一个小???:“???子们

Metadata

Metadata

Labels

untriaged: New issue has not been triaged

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions