Commit ca43f4c

Merge pull request alexrudall#200 from Clemalfroy/whisper
Add Whisper endpoints
2 parents 73cc980 + 8210cf3 commit ca43f4c

File tree

9 files changed: +1305 −2 lines changed

- CHANGELOG.md
- Gemfile.lock
- README.md
- lib/openai/client.rb
- lib/openai/version.rb
- spec/fixtures/cassettes/whisper-1_transcribe.yml
- spec/fixtures/cassettes/whisper-1_translate.yml
- spec/fixtures/files/audio_sample.mp3
- spec/openai/client/audio_spec.rb

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.5.0] - 2023-03-02
+
+### Added
+
+- Add Client#transcribe and Client#translate endpoints - Whisper over the wire! Thanks to [@Clemalfroy](https://github.com/Clemalfroy)
+
 ## [3.4.0] - 2023-03-01
 
 ### Added
```

Gemfile.lock

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ruby-openai (3.4.0)
+    ruby-openai (3.5.0)
       httparty (>= 0.18.1)
 
 GEM
```

README.md

Lines changed: 32 additions & 0 deletions
````diff
@@ -267,6 +267,38 @@ Pass a string to check if it violates OpenAI's Content Policy:
 => 5.505014632944949e-05
 ```
 
+### Whisper
+
+Whisper is a speech-to-text model that can be used to generate text based on an audio file:
+
+#### Translate
+
+The translations API takes as input the audio file in any of the supported languages and transcribes the audio into English.
+
+```ruby
+response = client.translate(
+    parameters: {
+        model: "whisper-1",
+        file: File.open('path_to_file'),
+    })
+puts response.parsed_response['text']
+=> "Translation of the text"
+```
+
+#### Transcribe
+
+The transcriptions API takes as input the audio file you want to transcribe and returns the text in the desired output file format.
+
+```ruby
+response = client.transcribe(
+    parameters: {
+        model: "whisper-1",
+        file: File.open('path_to_file'),
+    })
+puts response.parsed_response['text']
+=> "Transcription of the text"
+```
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. You can run `bin/console` for an interactive prompt that will allow you to experiment.
````
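Not part of this diff, but worth noting alongside the README additions: OpenAI's transcription endpoint also accepts a `response_format` parameter (per OpenAI's audio API documentation), which is how "the desired output file format" mentioned above is selected. A minimal sketch, assuming a configured client and a placeholder file path:

```ruby
# Sketch only: `response_format` comes from OpenAI's audio API documentation,
# not from this commit; the file path below is a placeholder.
client = OpenAI::Client.new

response = client.transcribe(
    parameters: {
        model: "whisper-1",
        file: File.open("path_to_file"),
        response_format: "srt" # other documented options: "json", "text", "vtt", "verbose_json"
    })
puts response.parsed_response # with "srt", the body is subtitle text rather than JSON
```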

lib/openai/client.rb

Lines changed: 8 additions & 0 deletions
```diff
@@ -43,6 +43,14 @@ def moderations(parameters: {})
       OpenAI::Client.json_post(path: "/moderations", parameters: parameters)
     end
 
+    def transcribe(parameters: {})
+      OpenAI::Client.multipart_post(path: "/audio/transcriptions", parameters: parameters)
+    end
+
+    def translate(parameters: {})
+      OpenAI::Client.multipart_post(path: "/audio/translations", parameters: parameters)
+    end
+
     def self.get(path:)
       HTTParty.get(
         uri(path: path),
```
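Both new methods delegate to `OpenAI::Client.multipart_post`, which already exists in the gem and is not shown in this diff. A minimal sketch of what such a helper could look like, assuming the gem's existing `uri` and `headers` class helpers and the HTTParty calls used elsewhere in the client:

```ruby
# Illustrative sketch, not code added by this commit: audio uploads need
# multipart/form-data so the File object in `parameters` is sent as a file part.
def self.multipart_post(path:, parameters: nil)
  HTTParty.post(
    uri(path: path),
    headers: headers.merge({ "Content-Type" => "multipart/form-data" }),
    body: parameters
  )
end
```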

lib/openai/version.rb

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 module OpenAI
-  VERSION = "3.4.0".freeze
+  VERSION = "3.5.0".freeze
 end
```

spec/fixtures/cassettes/whisper-1_transcribe.yml

Lines changed: 599 additions & 0 deletions
Some generated files are not rendered by default.

spec/fixtures/cassettes/whisper-1_translate.yml

Lines changed: 599 additions & 0 deletions
Some generated files are not rendered by default.

spec/fixtures/files/audio_sample.mp3

119 KB
Binary file not shown.

spec/openai/client/audio_spec.rb

Lines changed: 59 additions & 0 deletions
```diff
@@ -0,0 +1,59 @@
+RSpec.describe OpenAI::Client do
+  describe "#transcribe" do
+    context "with audio", :vcr do
+      let(:filename) { "audio_sample.mp3" }
+      let(:audio) { File.join(RSPEC_ROOT, "fixtures/files", filename) }
+
+      let(:response) do
+        OpenAI::Client.new.transcribe(
+          parameters: {
+            model: model,
+            file: File.open(audio, 'r:iso-8859-1')
+          }
+        )
+      end
+      let(:content) { response.parsed_response["text"] }
+      let(:cassette) { "#{model} transcribe".downcase }
+
+      context "with model: whisper-1" do
+        let(:model) { "whisper-1" }
+
+        it "succeeds" do
+
+          VCR.use_cassette(cassette) do
+            expect(content.empty?).to eq(false)
+          end
+        end
+      end
+    end
+  end
+
+  describe "#translate" do
+    context "with audio", :vcr do
+      let(:filename) { "audio_sample.mp3" }
+      let(:audio) { File.join(RSPEC_ROOT, "fixtures/files", filename) }
+
+      let(:response) do
+        OpenAI::Client.new.translate(
+          parameters: {
+            model: model,
+            file: File.open(audio, 'r:iso-8859-1')
+          }
+        )
+      end
+      let(:content) { response.parsed_response["text"] }
+      let(:cassette) { "#{model} translate".downcase }
+
+      context "with model: whisper-1" do
+        let(:model) { "whisper-1" }
+
+        it "succeeds" do
+
+          VCR.use_cassette(cassette) do
+            expect(content.empty?).to eq(false)
+          end
+        end
+      end
+    end
+  end
+end
```
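The specs replay HTTP traffic from the two new cassette files instead of hitting the live API on every run. The VCR configuration itself lives in the spec helper rather than in this diff; a plausible setup matching the cassette directory used above, assuming WebMock and an `OPENAI_ACCESS_TOKEN` environment variable, might look like:

```ruby
# Illustrative only: the repo's actual spec helper is not part of this commit.
require "vcr"
require "webmock/rspec"

VCR.configure do |c|
  c.cassette_library_dir = "spec/fixtures/cassettes"      # where whisper-1_transcribe.yml is stored
  c.hook_into :webmock                                    # intercept the HTTP requests HTTParty makes
  c.default_cassette_options = { record: :new_episodes }  # record on the first run, replay afterwards
  c.filter_sensitive_data("<OPENAI_ACCESS_TOKEN>") { ENV["OPENAI_ACCESS_TOKEN"] } # keep the key out of the YAML
end
```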
