Mercurial > repos > bgruening > whisper

Binary file test-data/english.wav has changed
Binary file test-data/german.wav has changed
Binary file test-data/german_english.mp3 has changed
Binary file test-data/persian.wav has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe.txt	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,2 @@
+21-year-old Jesus joined Manchester City last year in January 2017 from Brazilian club Paul Marius
+for a reported fee of £27 million.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe_english.json	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,1 @@
+{"text": " 21-year-old Jesus joined Manchester City last year in January 2017 from Brazilian club Paul Marius for a reported fee of \u00a327 million.", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 9.8, "text": " 21-year-old Jesus joined Manchester City last year in January 2017 from Brazilian club Paul Marius", "tokens": [50364, 5080, 12, 5294, 12, 2641, 2705, 6869, 27180, 4392, 1036, 1064, 294, 7061, 6591, 490, 23435, 6482, 4552, 2039, 4872, 50854], "temperature": 0.0, "avg_logprob": -0.44505851409014535, "compression_ratio": 1.1166666666666667, "no_speech_prob": 0.045594051480293274}, {"id": 1, "seek": 0, "start": 9.8, "end": 12.4, "text": " for a reported fee of \u00a327 million.", "tokens": [50854, 337, 257, 7055, 12054, 295, 14378, 10076, 2459, 13, 50984], "temperature": 0.0, "avg_logprob": -0.44505851409014535, "compression_ratio": 1.1166666666666667, "no_speech_prob": 0.045594051480293274}], "language": "English"}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe_english.srt	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,8 @@
+1
+00:00:00,000 --> 00:00:09,800
+21-year-old Jesus joined Manchester City last year in January 2017 from Brazilian club Paul Marius
+
+2
+00:00:09,800 --> 00:00:12,400
+for a reported fee of £27 million.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe_english.tsv	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,3 @@
+start	end	text
+0	9800	21-year-old Jesus joined Manchester City last year in January 2017 from Brazilian club Paul Marius
+9800	12400	for a reported fee of £27 million.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe_german.txt	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,2 @@
+Reitschafte es New Zealand's A1 GP Auto Black Beauty sieben Mal mit Geschwindigkeiten von
+über 160 kmh über die Brücke zu fahren.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe_german_english.txt	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,6 @@
+Der gewünschte Gesprächspartner ist zurzeit nicht erreichbar,
+wird aber per SMS über Ihren Anruf informiert.
+Vielen Dank.
+The person you have called is currently not available.
+However, he will be informed about your call attempt via SMS.
+Thank you for calling.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcribe_persian.json	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,1 @@
+{"text": " \u0628\u0631\u062e\u06cc \u0627\u0632 \u0627\u06cc\u0646 \u0633\u0646\u06af \u0647\u0627 \u06a9\u0647 \u062f\u0631 \u06a9\u0648\u0631\u06cc \u0632\u0645\u06cc\u0646 \u062e\u06cc\u0644\u06cc \u0646\u0627\u062f\u0631 \u0647\u0633\u062a\u0646\u062f \u0628\u0627 \u0642\u06cc\u0645\u062a\u06cc \u0627\u0632 \u06cc\u0627\u0632\u062f\u0647 \u0647\u0632\u0627\u0631 \u062f\u0648\u0644\u0627\u0631 \u062a\u0627 \u0628\u06cc\u0633\u062a\u0648 \u062f\u0648 \u0647\u0632\u0627\u0631 \u0648 \u067e\u0646\u0633\u062a\u062f \u062f\u0648\u0644\u0627\u0631 \u062f\u0631 \u0647\u0631 \u0627\u0648\u0646\u0633 \u0641\u0631\u0648\u062e\u062a \u0645\u06cc \u0634\u0628\u0646\u062f \u06a9\u0647 \u0686\u06cc\u0632 \u062d\u062f\u0648\u062f \u062f\u0647 \u0628\u0631\u0627\u0628\u0631 \u0642\u06cc\u0645\u062a \u062a\u0644\u0627\u0633\u062a.", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 13.42, "text": " \u0628\u0631\u062e\u06cc \u0627\u0632 \u0627\u06cc\u0646 \u0633\u0646\u06af \u0647\u0627 \u06a9\u0647 \u062f\u0631 \u06a9\u0648\u0631\u06cc \u0632\u0645\u06cc\u0646 \u062e\u06cc\u0644\u06cc \u0646\u0627\u062f\u0631 \u0647\u0633\u062a\u0646\u062f \u0628\u0627 \u0642\u06cc\u0645\u062a\u06cc \u0627\u0632 \u06cc\u0627\u0632\u062f\u0647 \u0647\u0632\u0627\u0631 \u062f\u0648\u0644\u0627\u0631 \u062a\u0627 \u0628\u06cc\u0633\u062a\u0648 \u062f\u0648 \u0647\u0632\u0627\u0631 \u0648 \u067e\u0646\u0633\u062a\u062f \u062f\u0648\u0644\u0627\u0631 \u062f\u0631 \u0647\u0631 \u0627\u0648\u0646\u0633 \u0641\u0631\u0648\u062e\u062a \u0645\u06cc \u0634\u0628\u0646\u062f \u06a9\u0647 \u0686\u06cc\u0632 \u062d\u062f\u0648\u062f \u062f\u0647 \u0628\u0631\u0627\u0628\u0631 \u0642\u06cc\u0645\u062a \u062a\u0644\u0627\u0633\u062a.", "tokens": [50364, 4724, 2288, 9778, 4135, 1975, 11622, 1975, 32151, 8608, 1863, 16761, 8032, 995, 7565, 3224, 11778, 2288, 7565, 13063, 4135, 30767, 2304, 32151, 16490, 4135, 1211, 4135, 8717, 18513, 2288, 8032, 14851, 41260, 4724, 995, 12174, 4135, 2304, 2655, 4135, 1975, 11622, 25429, 31377, 3215, 3224, 8032, 11622, 9640, 11778, 12610, 9640, 6055, 995, 4724, 4135, 14851, 2407, 11778, 2407, 8032, 11622, 9640, 4032, 21453, 1863, 14851, 3215, 11778, 12610, 9640, 11778, 2288, 8032, 2288, 1975, 11536, 3794, 6156, 32887, 46456, 48478, 13412, 3555, 41260, 7565, 3224, 34766, 4135, 11622, 11331, 3215, 23328, 11778, 3224, 4724, 2288, 16758, 2288, 12174, 4135, 2304, 2655, 6055, 15040, 14851, 13, 51064], "temperature": 0.0, "avg_logprob": -0.19529172723943536, "compression_ratio": 1.7426900584795322, "no_speech_prob": 0.08913763612508774, "words": [{"word": " \u0628\u0631\u062e\u06cc", "start": 0.0, "end": 1.42, "probability": 0.9033132642507553}, {"word": " \u0627\u0632", "start": 1.42, "end": 1.62, "probability": 0.9889343976974487}, {"word": " \u0627\u06cc\u0646", "start": 1.62, "end": 1.74, "probability": 0.9703238010406494}, {"word": " \u0633\u0646\u06af", "start": 1.74, "end": 2.12, "probability": 0.9526320497194926}, {"word": " \u0647\u0627", "start": 2.12, "end": 2.3, "probability": 0.7875397503376007}, {"word": " \u06a9\u0647", "start": 2.3, "end": 2.42, "probability": 0.9241614937782288}, {"word": " \u062f\u0631", "start": 2.42, "end": 2.64, "probability": 0.9900502860546112}, {"word": " \u06a9\u0648\u0631\u06cc", "start": 2.64, "end": 2.9, "probability": 0.6278652548789978}, {"word": " \u0632\u0645\u06cc\u0646", "start": 2.9, "end": 3.28, "probability": 0.9571676055590311}, {"word": " \u062e\u06cc\u0644\u06cc", "start": 3.28, "end": 3.66, "probability": 0.9876178354024887}, {"word": " \u0646\u0627\u062f\u0631", "start": 3.66, "end": 4.02, "probability": 0.9114052851994833}, {"word": " \u0647\u0633\u062a\u0646\u062f", "start": 4.02, "end": 4.5, "probability": 0.7521352767944336}, {"word": " \u0628\u0627", "start": 4.5, "end": 4.74, "probability": 0.7534557282924652}, {"word": " \u0642\u06cc\u0645\u062a\u06cc", "start": 4.74, "end": 5.26, "probability": 0.8726431250572204}, {"word": " \u0627\u0632", "start": 5.26, "end": 5.42, "probability": 0.984766036272049}, {"word": " \u06cc\u0627\u0632\u062f\u0647", "start": 5.42, "end": 5.92, "probability": 0.7151797562837601}, {"word": " \u0647\u0632\u0627\u0631", "start": 5.92, "end": 6.3, "probability": 0.9623414278030396}, {"word": " \u062f\u0648\u0644\u0627\u0631", "start": 6.3, "end": 6.9, "probability": 0.9146908124287924}, {"word": " \u062a\u0627", "start": 6.9, "end": 7.2, "probability": 0.9085398316383362}, {"word": " \u0628\u06cc\u0633\u062a\u0648", "start": 7.2, "end": 7.6, "probability": 0.8677552342414856}, {"word": " \u062f\u0648", "start": 7.6, "end": 7.7, "probability": 0.9638055264949799}, {"word": " \u0647\u0632\u0627\u0631", "start": 7.7, "end": 7.98, "probability": 0.9253919919331869}, {"word": " \u0648", "start": 7.98, "end": 8.12, "probability": 0.7168136835098267}, {"word": " \u067e\u0646\u0633\u062a\u062f", "start": 8.12, "end": 8.54, "probability": 0.7954745143651962}, {"word": " \u062f\u0648\u0644\u0627\u0631", "start": 8.54, "end": 8.92, "probability": 0.9934115409851074}, {"word": " \u062f\u0631", "start": 8.92, "end": 9.2, "probability": 0.9842988848686218}, {"word": " \u0647\u0631", "start": 9.2, "end": 9.34, "probability": 0.9502557218074799}, {"word": " \u0627\u0648\u0646\u0633", "start": 9.34, "end": 9.62, "probability": 0.6762789487838745}, {"word": " \u0641\u0631\u0648\u062e\u062a", "start": 9.62, "end": 9.98, "probability": 0.9117981791496277}, {"word": " \u0645\u06cc", "start": 9.98, "end": 10.14, "probability": 0.7245240807533264}, {"word": " \u0634\u0628\u0646\u062f", "start": 10.14, "end": 10.7, "probability": 0.6922649443149567}, {"word": " \u06a9\u0647", "start": 10.7, "end": 11.04, "probability": 0.899426281452179}, {"word": " \u0686\u06cc\u0632", "start": 11.04, "end": 11.32, "probability": 0.9809092283248901}, {"word": " \u062d\u062f\u0648\u062f", "start": 11.32, "end": 11.68, "probability": 0.8215810457865397}, {"word": " \u062f\u0647", "start": 11.68, "end": 11.98, "probability": 0.889119029045105}, {"word": " \u0628\u0631\u0627\u0628\u0631", "start": 11.98, "end": 12.4, "probability": 0.9024366587400436}, {"word": " \u0642\u06cc\u0645\u062a", "start": 12.4, "end": 12.84, "probability": 0.9782368242740631}, {"word": " \u062a\u0644\u0627\u0633\u062a.", "start": 12.84, "end": 13.42, "probability": 0.6221214731534322}]}], "language": "Persian"}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/whisper.xml	Wed Apr 24 22:10:03 2024 +0000
@@ -0,0 +1,358 @@
+<tool id="whisper" name="Speach to Text" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>
+        Transcribe audio or video files to text using the OpenAI Whisper
+    </description>
+    <macros>
+        <token name="@TOOL_VERSION@">20231117</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <requirements>
+        <container type="docker">quay.io/galaxy/whisper:20231117</container>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        mkdir -p ./outs ./models &&
+        ln -s '$infile' ./input.${infile.ext} &&
+        whisper ./input.${infile.ext}
+        --model $model
+        --output_dir ./outs
+        --threads \${GALAXY_SLOTS:-2}
+        --task transcribe
+        --output_format all
+        --verbose False
+        --model_dir \${OPENAI_WHISPER_MODEL_DIR:-./models}
+        #if str($language).strip():
+            --language '$language'
+        #end if
+        #if $advanced.temperature:
+            --temperature '$temperature'
+        #end if
+        #if $advanced.best_of:
+            --best_of '$advanced.best_of'
+        #end if
+        #if $advanced.beam_size:
+            --beam_size '$advanced.beam_size'
+        #end if
+        #if $advanced.patience:
+            --patience '$advanced.patience'
+        #end if
+        #if $advanced.length_penalty:
+            --length_penalty '$advanced.length_penalty'
+        #end if
+        #if $advanced.suppress_tokens:
+            --suppress_tokens '$advanced.suppress_tokens'
+        #end if
+        #if str($advanced.initial_prompt).strip():
+            --initial_prompt '$advanced.initial_prompt'
+        #end if
+        #if $advanced.condition_on_previous_text:
+            --condition_on_previous_text '$advanced.condition_on_previous_text'
+        #end if
+        #if $advanced.temperature_increment_on_fallback:
+            --temperature_increment_on_fallback '$advanced.temperature_increment_on_fallback'
+        #end if
+        #if $advanced.compression_ratio_threshold:
+            --compression_ratio_threshold '$advanced.compression_ratio_threshold'
+        #end if
+        #if $advanced.logprob_threshold:
+            --logprob_threshold '$advanced.logprob_threshold'
+        #end if
+        #if $advanced.no_speech_threshold:
+            --no_speech_threshold '$advanced.no_speech_threshold'
+        #end if
+        #if $advanced.word_timestamps.word_timestamps == "True":
+            --word_timestamps '$advanced.word_timestamps.word_timestamps'
+            #if $advanced.word_timestamps.highlight_words:
+                --highlight_words '$advanced.word_timestamps.highlight_words'
+            #end if
+            #if $advanced.word_timestamps.max_line_width:
+                --max_line_width '$advanced.word_timestamps.max_line_width'
+            #end if
+            #if $advanced.word_timestamps.max_line_count:
+                --max_line_count '$advanced.word_timestamps.max_line_count'
+            #end if
+            #if $advanced.word_timestamps.max_words_per_line:
+                --max_words_per_line '$advanced.word_timestamps.max_words_per_line'
+            #end if
+        #end if
+        ]]>
+    </command>
+    <environment_variables>
+        <!-- we will disable the progress bar which is printed to stderr -->
+        <environment_variable name="TQDM_DISABLE">1</environment_variable>
+    </environment_variables>
+    <inputs>
+        <param name="infile" type="data" format="wav,mp3,mkv,flv,mpg,ogg,wma,mp4" label="Select audio or video file" />
+        <param argument="--model" type="select" label="Speech to Text Model">
+            <option value="tiny">Tiny (~32x faster than the large model)</option>
+            <option value="base">Base (~16x faster than the large model)</option>
+            <option value="small" selected="true">Small (~6x faster than the large model)</option>
+            <option value="medium">Medium (~2x faster than the large model)</option>
+            <option value="large">Large</option>
+        </param>
+        <param argument="--language" type="select" label="Language">
+            <option value="">Auto (detect language)</option>
+            <option value="Afrikaans">Afrikaans</option>
+            <option value="Albanian">Albanian</option>
+            <option value="Amharic">Amharic</option>
+            <option value="Arabic">Arabic</option>
+            <option value="Armenian">Armenian</option>
+            <option value="Assamese">Assamese</option>
+            <option value="Azerbaijani">Azerbaijani</option>
+            <option value="Bashkir">Bashkir</option>
+            <option value="Basque">Basque</option>
+            <option value="Belarusian">Belarusian</option>
+            <option value="Bengali">Bengali</option>
+            <option value="Bosnian">Bosnian</option>
+            <option value="Breton">Breton</option>
+            <option value="Bulgarian">Bulgarian</option>
+            <option value="Burmese">Burmese</option>
+            <option value="Cantonese">Cantonese</option>
+            <option value="Castilian">Castilian</option>
+            <option value="Catalan">Catalan</option>
+            <option value="Chinese">Chinese</option>
+            <option value="Croatian">Croatian</option>
+            <option value="Czech">Czech</option>
+            <option value="Danish">Danish</option>
+            <option value="Dutch">Dutch</option>
+            <option value="English">English</option>
+            <option value="Estonian">Estonian</option>
+            <option value="Faroese">Faroese</option>
+            <option value="Finnish">Finnish</option>
+            <option value="Flemish">Flemish</option>
+            <option value="French">French</option>
+            <option value="Galician">Galician</option>
+            <option value="Georgian">Georgian</option>
+            <option value="German">German</option>
+            <option value="Greek">Greek</option>
+            <option value="Gujarati">Gujarati</option>
+            <option value="Haitian">Haitian</option>
+            <option value="Haitian Creole">Haitian Creole</option>
+            <option value="Hausa">Hausa</option>
+            <option value="Hawaiian">Hawaiian</option>
+            <option value="Hebrew">Hebrew</option>
+            <option value="Hindi">Hindi</option>
+            <option value="Hungarian">Hungarian</option>
+            <option value="Icelandic">Icelandic</option>
+            <option value="Indonesian">Indonesian</option>
+            <option value="Italian">Italian</option>
+            <option value="Japanese">Japanese</option>
+            <option value="Javanese">Javanese</option>
+            <option value="Kannada">Kannada</option>
+            <option value="Kazakh">Kazakh</option>
+            <option value="Khmer">Khmer</option>
+            <option value="Korean">Korean</option>
+            <option value="Lao">Lao</option>
+            <option value="Latin">Latin</option>
+            <option value="Latvian">Latvian</option>
+            <option value="Letzeburgesch">Letzeburgesch</option>
+            <option value="Lingala">Lingala</option>
+            <option value="Lithuanian">Lithuanian</option>
+            <option value="Luxembourgish">Luxembourgish</option>
+            <option value="Macedonian">Macedonian</option>
+            <option value="Malagasy">Malagasy</option>
+            <option value="Malay">Malay</option>
+            <option value="Malayalam">Malayalam</option>
+            <option value="Maltese">Maltese</option>
+            <option value="Mandarin">Mandarin</option>
+            <option value="Maori">Maori</option>
+            <option value="Marathi">Marathi</option>
+            <option value="Moldavian">Moldavian</option>
+            <option value="Moldovan">Moldovan</option>
+            <option value="Mongolian">Mongolian</option>
+            <option value="Myanmar">Myanmar</option>
+            <option value="Nepali">Nepali</option>
+            <option value="Norwegian">Norwegian</option>
+            <option value="Nynorsk">Nynorsk</option>
+            <option value="Occitan">Occitan</option>
+            <option value="Panjabi">Panjabi</option>
+            <option value="Pashto">Pashto</option>
+            <option value="Persian">Persian</option>
+            <option value="Polish">Polish</option>
+            <option value="Portuguese">Portuguese</option>
+            <option value="Punjabi">Punjabi</option>
+            <option value="Pushto">Pushto</option>
+            <option value="Romanian">Romanian</option>
+            <option value="Russian">Russian</option>
+            <option value="Sanskrit">Sanskrit</option>
+            <option value="Serbian">Serbian</option>
+            <option value="Shona">Shona</option>
+            <option value="Sindhi">Sindhi</option>
+            <option value="Sinhala">Sinhala</option>
+            <option value="Sinhalese">Sinhalese</option>
+            <option value="Slovak">Slovak</option>
+            <option value="Slovenian">Slovenian</option>
+            <option value="Somali">Somali</option>
+            <option value="Spanish">Spanish</option>
+            <option value="Sundanese">Sundanese</option>
+            <option value="Swahili">Swahili</option>
+            <option value="Swedish">Swedish</option>
+            <option value="Tagalog">Tagalog</option>
+            <option value="Tajik">Tajik</option>
+            <option value="Tamil">Tamil</option>
+            <option value="Tatar">Tatar</option>
+            <option value="Telugu">Telugu</option>
+            <option value="Thai">Thai</option>
+            <option value="Tibetan">Tibetan</option>
+            <option value="Turkish">Turkish</option>
+            <option value="Turkmen">Turkmen</option>
+            <option value="Ukrainian">Ukrainian</option>
+            <option value="Urdu">Urdu</option>
+            <option value="Uzbek">Uzbek</option>
+            <option value="Valencian">Valencian</option>
+            <option value="Vietnamese">Vietnamese</option>
+            <option value="Welsh">Welsh</option>
+            <option value="Yiddish">Yiddish</option>
+            <option value="Yoruba">Yoruba</option>
+        </param>
+        <param argument="--output_format" type="select" label="Output Format" multiple="true">
+            <option value="txt" selected="true">Text</option>
+            <option value="json">JSON</option>
+            <option value="srt">SubRip</option>
+            <option value="vtt">WebVTT</option>
+            <option value="tsv">Tab-separated values</option>
+        </param>
+        <section name="advanced" title="Advanced Options">
+            <param argument="--temperature" type="integer" value="0" optional="true" label="Temperature" help="Temperature to use for sampling" />
+            <param argument="--best_of" type="integer" value="5" optional="true" label="Best of" help="Number of candidates when sampling with non-zero temperature" />
+            <param argument="--beam_size" type="integer" value="5" optional="true" label="Beam size" help="Number of beams in beam search, only applicable when temperature is zero" />
+            <param argument="--patience" type="float" value="" optional="true" label="Optional patience value to use in beam decoding" help="As in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search" />
+            <param argument="--length_penalty" type="float" value="" optional="true" label="Optional token length penalty coefficient (alpha)" help="As in https://arxiv.org/abs/1609.08144, uses simple length normalization by default" />
+            <param argument="--suppress_tokens" type="integer" value="-1" optional="true" label="Suppress tokens" help="Comma-separated list of token ids to suppress during sampling; -1 will suppress most special characters except common punctuations" />
+            <param argument="--initial_prompt" type="text" value="" optional="true" label="Initial prompt" help="Optional text to provide as a prompt for the first window" />
+            <param argument="--condition_on_previous_text" type="boolean" truevalue="True" falsevalue="False" checked="true" optional="true" label="Condition on previous text" help="If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop" />
+            <param argument="--temperature_increment_on_fallback" type="float" value="0.2" optional="true" label="Temperature increment on fallback" help="Temperature to increase when falling back when the decoding fails to meet either of the thresholds below" />
+            <param argument="--compression_ratio_threshold" type="float" value="2.4" optional="true" label="Compression ratio threshold" help="If the gzip compression ratio is higher than this value, treat the decoding as failed" />
+            <param argument="--logprob_threshold" type="float" value="-1.0" optional="true" label="Logprob threshold" help="If the average log probability is lower than this value, treat the decoding as failed" />
+            <param argument="--no_speech_threshold" type="float" value="0.6" optional="true" label="No speech threshold" help="If the probability of the |nospeech| token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence" />
+            <conditional name="word_timestamps">
+                <param argument="--word_timestamps" type="select" label="Extract word timestamps?" help="(experimental) Extract word-level timestamps and refine the results based on them">
+                    <option value="False">False</option>
+                    <option value="True">True</option>
+                </param>
+                <when value="True">
+                    <param argument="--highlight_words" type="boolean" truevalue="True" falsevalue="False" value="False" optional="true" label="Highlight words" help="Underline each word as it is spoken in srt and vtt" />
+                    <param argument="--max_line_width" type="integer" value="" optional="true" label="Max line width" help="The maximum number of characters in a line before breaking the line" />
+                    <param argument="--max_line_count" type="integer" value="" optional="true" label="Max line count" help="The maximum number of lines in a segment" />
+                    <param argument="--max_words_per_line" type="integer" value="" optional="true" label="Max words per line" help="No effect with --max_line_width. the maximum number of words in a segment" />
+                </when>
+                <when value="False">
+                </when>
+            </conditional>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="output_txt" format="txt" from_work_dir="./outs/input.txt" label="${tool.name} on ${on_string}.txt">
+            <filter>'txt' in output_format</filter>
+        </data>
+        <data name="output_json" format="json" from_work_dir="./outs/input.json" label="${tool.name} on ${on_string}.json">
+            <filter>'json' in output_format</filter>
+        </data>
+        <data name="output_srt" format="txt" from_work_dir="./outs/input.srt" label="${tool.name} on ${on_string}.srt">
+            <filter>'srt' in output_format</filter>
+        </data>
+        <data name="output_vtt" format="txt" from_work_dir="./outs/input.vtt" label="${tool.name} on ${on_string}.vtt">
+            <filter>'vtt' in output_format</filter>
+        </data>
+        <data name="output_tsv" format="tabular" from_work_dir="./outs/input.tsv" label="${tool.name} on ${on_string}.tsv">
+            <filter>'tsv' in output_format</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="infile" value="english.wav" ftype="wav"/>
+            <param name="model" value="tiny"/>
+            <param name="language" value="English"/>
+            <param name="output_format" value="txt"/>
+            <output name="output_txt" file="transcribe.txt"/>
+        </test>
+        <test expect_num_outputs="3">
+            <param name="infile" value="english.wav" ftype="wav"/>
+            <param name="model" value="tiny"/>
+            <param name="language" value="English"/>
+            <param name="output_format" value="srt,tsv,json"/>
+            <output name="output_srt" file="transcribe_english.srt"/>
+            <output name="output_tsv" file="transcribe_english.tsv"/>
+            <output name="output_json">
+                <assert_contents>
+                    <has_text text="21-year-old Jesus joined Manchester City last year in January 2017 from"/>
+                    <has_text text="temperature"/>
+                    <has_text text="no_speech_prob"/>
+                    <has_text text="English"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="infile" value="german.wav" ftype="wav"/>
+            <param name="model" value="small"/>
+            <param name="language" value="German"/>
+            <param name="output_format" value="txt"/>
+            <output name="output_txt" file="transcribe_german.txt"/>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="infile" value="german_english.mp3" ftype="mp3"/>
+            <param name="model" value="medium"/>
+            <param name="output_format" value="txt"/>
+            <output name="output_txt" file="transcribe_german_english.txt"/>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="infile" value="persian.wav" ftype="wav"/>
+            <param name="model" value="medium"/>
+            <param name="language" value="Persian"/>
+            <param name="output_format" value="srt,json"/>
+            <section name="advanced">
+                <param name="condition_on_previous_text" value="False"/>
+                <conditional name="word_timestamps">
+                    <param name="word_timestamps" value="True"/>
+                    <param name="max_words_per_line" value="5"/>
+                </conditional>
+            </section>
+            <output name="output_srt">
+                <assert_contents>
+                    <has_n_lines n="32"/>
+                </assert_contents>
+            </output>
+            <output name="output_json">
+                <assert_contents>
+                    <has_text text="\u0628\u0631\u062e\u06cc \u0627\u0632 \u0627\u06cc\u0646"/>
+                    <has_text text="temperature"/>
+                    <has_text text="no_speech_prob"/>
+                    <has_text text="Persian"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+Transcribe audio or video files to text using the `Whisper from OpenAI <https://github.com/openai/whisper>`_.
+
+Usage
+.....
+
+
+**Input**
+Audio or video file to transcribe in one of wav, mp3, mkv, flv, mpg, ogg, wma, or mp4.
+
+
+**Output**
+Transcribed text in the selected format. The output can be in text, JSON, SubRip, WebVTT, or tab-separated values (tabular) format.
+    ]]></help>
+    <creator>
+        <person givenName="Alireza" familyName="Heidari" url="http://github.com/itisalirh"/>
+    </creator>
+    <citations>
+        <citation type="bibtex">
+            @misc{openai2022whisper,
+                title={Whisper},
+                author={OpenAI},
+                year={2022},
+                url={https://github.com/openai/whisper}
+            }
+        </citation>
+    </citations>
+</tool>