Skip to content

Commit 52e1ef5

Browse files
sayanshaw24 (Sayan Shaw) and Sayan Shaw authored
Update Tokenizer and Detokenizer for Chat Template Tokenization (#957)
* update tokenizer and detokenizer for chat template tokenization
* undo certain decoder changes
---------
Co-authored-by: Sayan Shaw <[email protected]>
1 parent 6eb73f9 commit 52e1ef5

File tree

6 files changed

+202081
-12
lines changed

6 files changed

+202081
-12
lines changed

operators/tokenizer/bpe_streaming.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
192192
for (size_t tok_idx = 0; tok_idx < seq_len; ++tok_idx) {
193193
const auto id = ort_extensions::narrow<extTokenId_t>(*(p_ids + tok_idx));
194194
std::string decoded_token;
195+
195196
auto status = spm_model_
196197
? SpmId2Token(id, decoded_token, f_special_last)
197198
: Id2Token(id, decoded_token, true, f_special_last);

operators/tokenizer/bpe_tokenizer_model.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,6 @@ class BpeModel {
360360
pos--;
361361
}
362362
auto stripped_token = token.substr(0, pos);
363-
final_result.back().first = stripped_token;
364363
final_result.emplace_back(token, id);
365364
continue;
366365
}

shared/api/tokenizer_impl.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,9 @@ OrtxStatus TokenizerImpl::BatchDecode(const std::vector<span<extTokenId_t const>
120120
std::transform(s.begin(), s.end(), ids.begin(), [](extTokenId_t v) { return static_cast<int64_t>(v); });
121121
ortc::Tensor<int64_t> ts_input(std::vector<int64_t>{1, static_cast<int64_t>(ids.size())}, (void*)ids.data());
122122
ortc::Tensor<std::string> ts_output;
123+
124+
// Note: currently the detokenizer Compute is called with skip_special_tokens = true
125+
// by default, but this parameter should be exposed to GenAI in the future.
123126
OrtxStatus status = std::visit([&](auto& detokenizer) {
124127
return detokenizer->Compute(ts_input, ts_output); }, detokenizer_);
125128

0 commit comments

Comments
 (0)