cleanup whisper a little (ml-explore#639)

awni · web-flow · commit 78c431dc25bf · 2024-03-30T13:13:58.000-07:00
diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
@@ -239,12 +239,13 @@ def generate(
         ),
         range(max_tokens),
     ):
-        if token == tokenizer.eos_token_id:
-            break
+        token = token.item()
         if n == 0:
             prompt_time = time.perf_counter() - tic
             tic = time.perf_counter()
-        tokens.append(token.item())
+        if token == tokenizer.eos_token_id:
+            break
+        tokens.append(token)
 
         if verbose:
             s = tokenizer.decode(tokens)
diff --git a/whisper/convert.py b/whisper/convert.py
@@ -91,7 +91,8 @@ def _download(url: str, root: str) -> str:
                 output.write(buffer)
                 loop.update(len(buffer))
 
-    model_bytes = open(download_target, "rb").read()
+    with open(download_target, "rb") as fid:
+        model_bytes = fid.read()
     if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
         raise RuntimeError(
             "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
diff --git a/whisper/test.py b/whisper/test.py
@@ -297,7 +297,7 @@ def test_transcribe_alice(self):
             "temperature": 0.0,
             "avg_logprob": -0.1350895343440594,
             "compression_ratio": 1.6208333333333333,
-            "no_speech_prob": 0.002246702555567026,
+            "no_speech_prob": 0.009053784422576427,
         }
 
         def check_segment(seg, expected):
diff --git a/whisper/whisper/audio.py b/whisper/whisper/audio.py
@@ -58,7 +58,7 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
     except CalledProcessError as e:
         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
 
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+    return mx.array(np.frombuffer(out, np.int16)).flatten().astype(mx.float32) / 32768.0
 
 
 def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
@@ -73,8 +73,7 @@ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
     if array.shape[axis] < length:
         pad_widths = [(0, 0)] * array.ndim
         pad_widths[axis] = (0, length - array.shape[axis])
-        pad_fn = mx.pad if isinstance(array, mx.array) else np.pad
-        array = pad_fn(array, pad_widths)
+        array = mx.pad(array, pad_widths)
 
     return array
 
@@ -154,9 +153,9 @@ def log_mel_spectrogram(
     """
     device = mx.default_device()
     mx.set_default_device(mx.cpu)
-    if not isinstance(audio, mx.array):
-        if isinstance(audio, str):
-            audio = load_audio(audio)
+    if isinstance(audio, str):
+        audio = load_audio(audio)
+    elif not isinstance(audio, mx.array):
         audio = mx.array(audio)
 
     if padding > 0:
diff --git a/whisper/whisper/transcribe.py b/whisper/whisper/transcribe.py
diff --git a/whisper/whisper/whisper.py b/whisper/whisper/whisper.py

Original file line number	Diff line number	Diff line change
`@@ -297,7 +297,7 @@ def test_transcribe_alice(self):`
`297`	`297`	`"temperature": 0.0,`
`298`	`298`	`"avg_logprob": -0.1350895343440594,`
`299`	`299`	`"compression_ratio": 1.6208333333333333,`
`300`		`- "no_speech_prob": 0.002246702555567026,`
	`300`	`+ "no_speech_prob": 0.009053784422576427,`
`301`	`301`	`}`
`302`	`302`
`303`	`303`	`def check_segment(seg, expected):`