Description
When using the main CLI program to transcribe audio containing non-ASCII characters, with full JSON output (--output-json-full) and DTW token timestamps enabled, the transcription.tokens[*].text field sometimes contains garbled characters (rendered as �). The main transcription.text field, however, appears correct.
For example:
"text": "えー、こんばんは。岸高野、岸大輔です。本日は心を入れ替えまして、TBSラジオのスタジオから収録しております。どういうことなんだよ、それ。心を入れ替えてすることじゃねえだろ。本来そっちなんだよ。",
"tokens": [
{
"text": "[_BEG_]",
"timestamps": {
"from": "00:00:00,000",
"to": "00:00:00,000"
},
"offsets": {
"from": 0,
"to": 0
},
"id": 50365,
"p": 0.813801,
"t_dtw": -1
},
{
"text": "えー",
"timestamps": {
"from": "00:00:00,000",
"to": "00:00:00,470"
},
"offsets": {
"from": 0,
"to": 470
},
"id": 36697,
"p": 0.0519727,
"t_dtw": 158
},
{
"text": "、",
"timestamps": {
"from": "00:00:00,470",
"to": "00:00:00,700"
},
"offsets": {
"from": 470,
"to": 700
},
"id": 1231,
"p": 0.824364,
"t_dtw": 196
},
{
"text": "こん",
"timestamps": {
"from": "00:00:00,930",
"to": "00:00:01,170"
},
"offsets": {
"from": 930,
"to": 1170
},
"id": 31172,
"p": 0.990851,
"t_dtw": 210
},
{
"text": "ば",
"timestamps": {
"from": "00:00:01,400",
"to": "00:00:01,400"
},
"offsets": {
"from": 1400,
"to": 1400
},
"id": 13349,
"p": 0.998904,
"t_dtw": 220
},
{
"text": "ん",
"timestamps": {
"from": "00:00:01,420",
"to": "00:00:01,630"
},
"offsets": {
"from": 1420,
"to": 1630
},
"id": 3225,
"p": 0.99433,
"t_dtw": 228
},
{
"text": "は",
"timestamps": {
"from": "00:00:01,630",
"to": "00:00:01,860"
},
"offsets": {
"from": 1630,
"to": 1860
},
"id": 3065,
"p": 0.993292,
"t_dtw": 236
},
{
"text": "。",
"timestamps": {
"from": "00:00:01,890",
"to": "00:00:02,090"
},
"offsets": {
"from": 1890,
"to": 2090
},
"id": 1543,
"p": 0.638299,
"t_dtw": 280
},
{
"text": "�", # E5 B2
"timestamps": {
"from": "00:00:02,090",
"to": "00:00:02,240"
},
"offsets": {
"from": 2090,
"to": 2240
},
"id": 23182,
"p": 0.940747,
"t_dtw": 318
},
{
"text": "�", # B8
"timestamps": {
"from": "00:00:02,240",
"to": "00:00:02,310"
},
"offsets": {
"from": 2240,
"to": 2310
},
"id": 116,
"p": 0.999584,
"t_dtw": 326
},
{
"text": "高",
"timestamps": {
"from": "00:00:02,310",
"to": "00:00:02,350"
},
"offsets": {
"from": 2310,
"to": 2350
},
"id": 12979,
"p": 0.375837,
"t_dtw": 342
},
{
"text": "野",
"timestamps": {
"from": "00:00:02,540",
"to": "00:00:02,770"
},
"offsets": {
"from": 2540,
"to": 2770
},
"id": 37178,
"p": 0.900068,
"t_dtw": 358
},
....
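For anyone checking their own output for this, a quick scan of the full JSON can flag the affected tokens. The file name is hypothetical, and this assumes the layout above, with transcription as a list of segments each carrying a tokens array:

    import json

    # Reading with errors='replace' turns the invalid bytes into U+FFFD ("�"),
    # so json.load succeeds and the garbled tokens can be spotted.
    with open('output.json', encoding='utf-8', errors='replace') as f:
        data = json.load(f)

    for segment in data['transcription']:
        for token in segment['tokens']:
            if '\ufffd' in token['text']:
                print(token['id'], repr(token['text']), token['offsets'])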
Note that the single character 岸 is shown correctly in the text field but split into 2 tokens; the UTF-8 encoding of 岸 is E5 B2 B8, which corresponds to the text fields of the 2 tokens: E5 B2 and B8.
IMO the root cause is that the tokenizer sometimes splits a single non-ASCII character across multiple tokens, so each affected token's text holds an incomplete UTF-8 byte sequence.
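A quick byte-level check confirms this:

    # 岸 is three UTF-8 bytes, E5 B2 B8; the two garbled tokens carry E5 B2 and
    # B8 respectively, and concatenating them recovers the character.
    frag_a = b'\xe5\xb2'  # bytes behind the first "�" token (id 23182)
    frag_b = b'\xb8'      # bytes behind the second "�" token (id 116)
    assert frag_a + frag_b == '岸'.encode('utf-8')
    print((frag_a + frag_b).decode('utf-8'))  # -> 岸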
I found a workaround for decoding this in Python, simply by concatenating consecutive tokens that are garbled until their bytes form valid UTF-8. I'm not sure if this will/should be solved on the whisper.cpp side.
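For context, Token and TokenList.merge_tokens are small helpers not shown here; a minimal stand-in, assuming merging simply concatenates the text fields, would be:

    import json
    import re
    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Token:
        text: str
        # timestamps/offsets/id/p/t_dtw omitted; only text matters for the merge

    class TokenList:
        @staticmethod
        def merge_tokens(tokens: List[Token]) -> Token:
            # Assumed behavior: concatenate the text of consecutive tokens.
            return Token(text=''.join(t.text for t in tokens))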
    def merge_token_by_utf8(inputs: List[Token]) -> List[Token]:
        # Tokens whose text was backslash-escaped by read() contain a literal '\';
        # keep pulling in the following garbled tokens until the accumulated
        # escapes decode as valid UTF-8.
        out = []
        i = 0
        while i < len(inputs):
            if '\\' in inputs[i].text:
                to_merge = [inputs[i]]
                j = 0
                merged = None
                while i + j + 1 < len(inputs):
                    j += 1
                    if '\\' not in inputs[i + j].text:
                        raise Exception(f"failed to merge {inputs[i + j].text}")
                    to_merge.append(inputs[i + j])
                    try:
                        merged = TokenList.merge_tokens(to_merge)
                        matches = list(re.finditer(r'(\\x[0-9a-fA-F]{2})', merged.text))
                        prefix, suffix = '', ''
                        if matches:
                            if matches[0].start() == 0:
                                # Escapes at the start: keep any trailing plain text as a suffix.
                                split_point = matches[-1].end()
                                suffix = merged.text[split_point:]  # take the suffix before truncating
                                merged.text = merged.text[:split_point]
                            else:
                                # Escapes at the end: keep the leading plain text as a prefix.
                                split_point = matches[0].start()
                                prefix = merged.text[:split_point]
                                merged.text = merged.text[split_point:]
                        # Turn the literal '\xNN' escapes back into raw bytes, then decode as UTF-8.
                        merged.text = merged.text.encode('latin1').decode('unicode_escape').encode('latin1').decode('utf-8')
                        merged.text = prefix + merged.text + suffix
                        break
                    except Exception:
                        continue  # incomplete byte sequence: pull in the next token and retry
                if merged is None:
                    raise Exception("ran out of tokens while merging")
                print(f'merged: [{merged.text}]')
                out.append(merged)
                i += j + 1  # skip every token consumed by the merge
            else:
                out.append(inputs[i])
                i += 1
        return out
    def read() -> List[Token]:
        # 'backslashreplace' keeps the raw invalid bytes as literal '\xNN' text,
        # and doubling every backslash keeps those escapes intact through json.loads.
        raw = open(json_file_path, errors='backslashreplace').read().replace('\\', '\\\\')
        raw = json.loads(raw)
        # (constructing Token objects from the parsed dicts is omitted here)
        return merge_token_by_utf8(raw)
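The double encode/decode chain in merge_token_by_utf8 can be sanity-checked on its own; the literal escape text produced by backslashreplace round-trips back to the original character:

    # The token text at this point holds literal '\xNN' escapes (plain
    # characters, not bytes); latin1 + unicode_escape + latin1 turns them back
    # into raw bytes, which then decode as UTF-8.
    s = r'\xe5\xb2\xb8'
    decoded = s.encode('latin1').decode('unicode_escape').encode('latin1').decode('utf-8')
    assert decoded == '岸'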