diff tesseract.xml @ 4:e36d806cc4c7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/tesseract commit da3b71e3076fb7475d1bb9c96293a0c83da8a62e
author iuc
date Wed, 14 Jan 2026 16:15:36 +0000
parents 521bcbb5aa16
children a3c79d2b1041
line wrap: on
line diff
--- a/tesseract.xml	Tue Nov 04 13:42:50 2025 +0000
+++ b/tesseract.xml	Wed Jan 14 16:15:36 2026 +0000
@@ -13,10 +13,22 @@
         #else
             echo '$input_file' > img_paths &&
         #end if
-        tesseract img_paths output
-        --tessdata-dir '${tessdata.fields.path}'
-        #if $language:
-            -l ${ str($language).replace(",","+") }
+        #if str($models.models_select) == 'own'
+            ## Own models: link every dataset as <lang>.traineddata (the file name tesseract resolves for -l <lang>); values are quoted because dataset identifiers are user-controlled
+            mkdir tessdata_dir &&
+            #set languages = ""
+            #for $model in $models.tessdata
+                #set lang = str($model.element_identifier).replace(".traineddata", "")
+                ln -s '${model}' 'tessdata_dir/${lang}.traineddata' &&
+                #set languages = languages + ("+" if languages else "") + lang
+            #end for
+            tesseract img_paths output --tessdata-dir tessdata_dir -l '${languages}'
+        #else
+            ## Official models: the directory is read from the path column of the selected tessdata data table entry
+            tesseract img_paths output --tessdata-dir '${models.tessdata.fields.path}'
+            #if $models.language:
+                -l ${ str($models.language).replace(",","+") }
+            #end if
+        #end if
         --psm $psm
         #if $dpi:
@@ -34,142 +46,153 @@
     ]]></command>
     <inputs>
         <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp,pdf" label="Image file(s)" multiple="false"/>
-        <param name="tessdata" type="select" label="Tessdata" help="Language data models">
-            <options from_data_table="tessdata">
-                <column name="value" index="0"/>
-                <column name="name" index="1"/>
-                <column name="version" index="2"/>
-                <column name="path" index="3"/>
-                <filter type="sort_by" column="1"/>
-            </options>
-            <validator type="no_options" message="A built-in tesseract model is not available. Please ask the Galaxy admins to install one on the server."/>
-        </param>
+        <conditional name="models">
+            <param name="models_select" type="select" label="Choose between official or own models">
+                <option value="official">Tesseract official models</option>
+                <option value="own">Own models</option>
+            </param>
+            <when value="official">
+                <param name="tessdata" type="select" label="Tessdata" help="Language data models">
+                    <options from_data_table="tessdata">
+                        <column name="value" index="0"/>
+                        <column name="name" index="1"/>
+                        <column name="version" index="2"/>
+                        <column name="path" index="3"/>
+                        <filter type="sort_by" column="1"/>
+                    </options>
+                    <validator type="no_options" message="A built-in tesseract model is not available. Please ask the Galaxy admins to install one on the server."/>
+                </param>
+                <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of multilingual image(s), more than one language can be selected" optional="false">
+                    <option value="afr">Afrikaans</option>
+                    <option value="amh">Amharic</option>
+                    <option value="ara">Arabic</option>
+                    <option value="asm">Assamese</option>
+                    <option value="aze">Azerbaijani</option>
+                    <option value="aze_cyrl">Azerbaijani - Cyrillic</option>
+                    <option value="bel">Belarusian</option>
+                    <option value="ben">Bengali</option>
+                    <option value="bod">Tibetan</option>
+                    <option value="bos">Bosnian</option>
+                    <option value="bre">Breton</option>
+                    <option value="bul">Bulgarian</option>
+                    <option value="cat">Catalan; Valencian</option>
+                    <option value="ceb">Cebuano</option>
+                    <option value="ces">Czech</option>
+                    <option value="chi_sim">Chinese simplified</option>
+                    <option value="chi_tra">Chinese traditional</option>
+                    <option value="chr">Cherokee</option>
+                    <option value="cos">Corsican</option>
+                    <option value="cym">Welsh</option>
+                    <option value="dan">Danish</option>
+                    <option value="deu">German</option>
+                    <option value="deu_latf">German Fraktur Latin</option>
+                    <option value="div">Dhivehi</option>
+                    <option value="dzo">Dzongkha</option>
+                    <option value="ell">Greek, Modern, 1453-</option>
+                    <option value="eng" selected="true">English</option>
+                    <option value="enm">English, Middle, 1100-1500</option>
+                    <option value="epo">Esperanto</option>
+                    <option value="equ">Math / equation detection module</option>
+                    <option value="est">Estonian</option>
+                    <option value="eus">Basque</option>
+                    <option value="fas">Persian</option>
+                    <option value="fao">Faroese</option>
+                    <option value="fil">Filipino</option>
+                    <option value="fin">Finnish</option>
+                    <option value="fra">French</option>
+                    <option value="frm">French, Middle, ca.1400-1600</option>
+                    <option value="fry">West Frisian</option>
+                    <option value="gla">Scottish Gaelic</option>
+                    <option value="gle">Irish</option>
+                    <option value="glg">Galician</option>
+                    <option value="grc">Greek, Ancient, to 1453</option>
+                    <option value="guj">Gujarati</option>
+                    <option value="hat">Haitian; Haitian Creole</option>
+                    <option value="heb">Hebrew</option>
+                    <option value="hin">Hindi</option>
+                    <option value="hrv">Croatian</option>
+                    <option value="hun">Hungarian</option>
+                    <option value="hye">Armenian</option>
+                    <option value="iku">Inuktitut</option>
+                    <option value="ind">Indonesian</option>
+                    <option value="isl">Icelandic</option>
+                    <option value="ita">Italian</option>
+                    <option value="ita_old">Italian - Old</option>
+                    <option value="jav">Javanese</option>
+                    <option value="jpn">Japanese</option>
+                    <option value="kan">Kannada</option>
+                    <option value="kat">Georgian</option>
+                    <option value="kat_old">Georgian - Old</option>
+                    <option value="kaz">Kazakh</option>
+                    <option value="khm">Central Khmer</option>
+                    <option value="kir">Kirghiz; Kyrgyz</option>
+                    <option value="kmr">Kurdish Kurmanji</option>
+                    <option value="kor">Korean</option>
+                    <option value="kor_vert">Korean vertical</option>
+                    <option value="lao">Lao</option>
+                    <option value="lat">Latin</option>
+                    <option value="lav">Latvian</option>
+                    <option value="lit">Lithuanian</option>
+                    <option value="ltz">Luxembourgish</option>
+                    <option value="mal">Malayalam</option>
+                    <option value="mar">Marathi</option>
+                    <option value="mkd">Macedonian</option>
+                    <option value="mlt">Maltese</option>
+                    <option value="mon">Mongolian</option>
+                    <option value="mri">Maori</option>
+                    <option value="msa">Malay</option>
+                    <option value="mya">Burmese</option>
+                    <option value="nep">Nepali</option>
+                    <option value="nld">Dutch; Flemish</option>
+                    <option value="nor">Norwegian</option>
+                    <option value="oci">Occitan post 1500</option>
+                    <option value="ori">Oriya</option>
+                    <option value="osd">Orientation and script detection module</option>
+                    <option value="pan">Panjabi; Punjabi</option>
+                    <option value="pol">Polish</option>
+                    <option value="por">Portuguese</option>
+                    <option value="pus">Pushto; Pashto</option>
+                    <option value="que">Quechua</option>
+                    <option value="ron">Romanian; Moldavian; Moldovan</option>
+                    <option value="rus">Russian</option>
+                    <option value="san">Sanskrit</option>
+                    <option value="sin">Sinhala; Sinhalese</option>
+                    <option value="slk">Slovak</option>
+                    <option value="slv">Slovenian</option>
+                    <option value="snd">Sindhi</option>
+                    <option value="spa">Spanish; Castilian</option>
+                    <option value="spa_old">Spanish; Castilian - Old</option>
+                    <option value="sqi">Albanian</option>
+                    <option value="srp">Serbian</option>
+                    <option value="srp_latn">Serbian - Latin</option>
+                    <option value="sun">Sundanese</option>
+                    <option value="swa">Swahili</option>
+                    <option value="swe">Swedish</option>
+                    <option value="syr">Syriac</option>
+                    <option value="tam">Tamil</option>
+                    <option value="tat">Tatar</option>
+                    <option value="tel">Telugu</option>
+                    <option value="tgk">Tajik</option>
+                    <option value="tha">Thai</option>
+                    <option value="tir">Tigrinya</option>
+                    <option value="ton">Tonga</option>
+                    <option value="tur">Turkish</option>
+                    <option value="uig">Uighur; Uyghur</option>
+                    <option value="ukr">Ukrainian</option>
+                    <option value="urd">Urdu</option>
+                    <option value="uzb">Uzbek</option>
+                    <option value="uzb_cyrl">Uzbek - Cyrillic</option>
+                    <option value="vie">Vietnamese</option>
+                    <option value="yid">Yiddish</option>
+                    <option value="yor">Yoruba</option>
+                </param>
+            </when>
+            <when value="own">
+                <param name="tessdata" type="data" format="binary" label="Tessdata" help="Own language data models (.traineddata files). Name each dataset after its language, e.g. chr.traineddata: the dataset name without the .traineddata suffix is passed to tesseract as the language, and several models are combined with +" multiple="true"/>
+            </when>
+        </conditional>
         <param name="user_words" type="data" format="txt" label="User words file" optional="true" help="The user words file allows you to specify a list of words that Tesseract should treat as known words. One word per line"/>
         <param name="user_patterns" type="data" format="txt" label="User patterns file" optional="true" help="One pattern per line in UTF-8 format. For more information please visit the tesseract docs about patterns linked in the help section"/>
-        <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected" optional="false">
-            <option value="afr">Afrikaans</option>
-            <option value="amh">Amharic</option>
-            <option value="ara">Arabic</option>
-            <option value="asm">Assamese</option>
-            <option value="aze">Azerbaijani</option>
-            <option value="aze_cyrl">Azerbaijani - Cyrilic</option>
-            <option value="bel">Belarusian</option>
-            <option value="ben">Bengali</option>
-            <option value="bod">Tibetan</option>
-            <option value="bos">Bosnian</option>
-            <option value="bre">Breton</option>
-            <option value="bul">Bulgarian</option>
-            <option value="cat">Catalan; Valencian</option>
-            <option value="ceb">Cebuano</option>
-            <option value="ces">Czech</option>
-            <option value="chi_sim">Chinese simplified</option>
-            <option value="chi_tra">Chinese traditional</option>
-            <option value="chr">Cherokee</option>
-            <option value="cos">Corsican</option>
-            <option value="cym">Welsh</option>
-            <option value="dan">Danish</option>
-            <option value="deu">German</option>
-            <option value="deu_latf">German Fraktur Latin</option>
-            <option value="div">Dhivehi</option>
-            <option value="dzo">Dzongkha</option>
-            <option value="ell">Greek, Modern, 1453-</option>
-            <option value="eng" selected="true">English</option>
-            <option value="enm">English, Middle, 1100-1500</option>
-            <option value="epo">Esperanto</option>
-            <option value="equ">Math / equation detection module</option>
-            <option value="est">Estonian</option>
-            <option value="eus">Basque</option>
-            <option value="fas">Persian</option>
-            <option value="fao">Faroese</option>
-            <option value="fil">Filipino</option>
-            <option value="fin">Finnish</option>
-            <option value="fra">French</option>
-            <option value="frm">French, Middle, ca.1400-1600</option>
-            <option value="fry">West Frisian</option>
-            <option value="gla">Scottish Gaelic</option>
-            <option value="gle">Irish</option>
-            <option value="glg">Galician</option>
-            <option value="grc">Greek, Ancient, to 1453</option>
-            <option value="guj">Gujarati</option>
-            <option value="hat">Haitian; Haitian Creole</option>
-            <option value="heb">Hebrew</option>
-            <option value="hin">Hindi</option>
-            <option value="hrv">Croatian</option>
-            <option value="hun">Hungarian</option>
-            <option value="hye">Armenian</option>
-            <option value="iku">Inuktitut</option>
-            <option value="ind">Indonesian</option>
-            <option value="isl">Icelandic</option>
-            <option value="ita">Italian</option>
-            <option value="ita_old">Italian - Old</option>
-            <option value="jav">Javanese</option>
-            <option value="jpn">Japanese</option>
-            <option value="kan">Kannada</option>
-            <option value="kat">Georgian</option>
-            <option value="kat_old">Georgian - Old</option>
-            <option value="kaz">Kazakh</option>
-            <option value="khm">Central Khmer</option>
-            <option value="kir">Kirghiz; Kyrgyz</option>
-            <option value="kmr">Kurdish Kurmanji</option>
-            <option value="kor">Korean</option>
-            <option value="kor_vert">Korean vertical</option>
-            <option value="lao">Lao</option>
-            <option value="lat">Latin</option>
-            <option value="lav">Latvian</option>
-            <option value="lit">Lithuanian</option>
-            <option value="ltz">Luxembourgish</option>
-            <option value="mal">Malayalam</option>
-            <option value="mar">Marathi</option>
-            <option value="mkd">Macedonian</option>
-            <option value="mlt">Maltese</option>
-            <option value="mon">Mongolian</option>
-            <option value="mri">Maori</option>
-            <option value="msa">Malay</option>
-            <option value="mya">Burmese</option>
-            <option value="nep">Nepali</option>
-            <option value="nld">Dutch; Flemish</option>
-            <option value="nor">Norwegian</option>
-            <option value="oci">Occitan post 1500</option>
-            <option value="ori">Oriya</option>
-            <option value="osd">Orientation and script detection module</option>
-            <option value="pan">Panjabi; Punjabi</option>
-            <option value="pol">Polish</option>
-            <option value="por">Portuguese</option>
-            <option value="pus">Pushto; Pashto</option>
-            <option value="que">Quechua</option>
-            <option value="ron">Romanian; Moldavian; Moldovan</option>
-            <option value="rus">Russian</option>
-            <option value="san">Sanskrit</option>
-            <option value="sin">Sinhala; Sinhalese</option>
-            <option value="slk">Slovak</option>
-            <option value="slv">Slovenian</option>
-            <option value="snd">Sindhi</option>
-            <option value="spa">Spanish; Castilian</option>
-            <option value="spa_old">Spanish; Castilian - Old</option>
-            <option value="sqi">Albanian</option>
-            <option value="srp">Serbian</option>
-            <option value="srp_latn">Serbian - Latin</option>
-            <option value="sun">Sundanese</option>
-            <option value="swa">Swahili</option>
-            <option value="swe">Swedish</option>
-            <option value="syr">Syriac</option>
-            <option value="tam">Tamil</option>
-            <option value="tat">Tatar</option>
-            <option value="tel">Telugu</option>
-            <option value="tgk">Tajik</option>
-            <option value="tha">Thai</option>
-            <option value="tir">Tigrinya</option>
-            <option value="ton">Tonga</option>
-            <option value="tur">Turkish</option>
-            <option value="uig">Uighur; Uyghur</option>
-            <option value="ukr">Ukrainian</option>
-            <option value="urd">Urdu</option>
-            <option value="uzb">Uzbek</option>
-            <option value="uzb_cyrl">Uzbek - Cyrilic</option>
-            <option value="vie">Vietnamese</option>
-            <option value="yid">Yiddish</option>
-            <option value="yor">Yoruba</option>
-        </param>
         <param name="output_formats" type="select" label="Output format(s)" multiple="true" optional="false">
             <option value="tessedit_create_txt" selected="true">Text</option>
             <option value="tessedit_create_pdf">PDF</option>
@@ -210,20 +233,26 @@
     </outputs>
     <tests>
         <test expect_num_outputs="2">
+            <conditional name="models">
+                <param name="models_select" value="official"/>
+                <param name="tessdata" value="test_tessdata"/>
+                <param name="language" value="chr"/>
+            </conditional>
             <param name="input_file" value="eurotext.png"/>
-            <param name="tessdata" value="test_tessdata"/>
+            <param name="user_patterns" value="eng.user-patterns"/>
             <param name="user_words" value="eng.user-words"/>
-            <param name="user_patterns" value="eng.user-patterns"/>
-            <param name="language" value="chr"/>
             <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf"/>
             <param name="psm" value="3"/>
             <output name="output_text" file="image_output.txt"/>
             <output name="output_pdf" file="image_output.pdf"/>
         </test>
         <test expect_num_outputs="2">
+            <conditional name="models">
+                <param name="models_select" value="official"/>
+                <param name="tessdata" value="test_tessdata"/>
+                <param name="language" value="chr"/>
+            </conditional>
             <param name="input_file" value="test_image_cherokee.png"/>
-            <param name="tessdata" value="test_tessdata"/>
-            <param name="language" value="chr"/>
             <param name="output_formats" value="tessedit_create_hocr,tessedit_create_tsv"/>
             <param name="psm" value="11"/>
             <output name="output_hocr">
@@ -236,9 +265,12 @@
             <output name="output_tsv" file="image_output.tsv"/>
         </test>
         <test expect_num_outputs="4">
+            <conditional name="models">
+                <param name="models_select" value="official"/>
+                <param name="tessdata" value="test_tessdata"/>
+                <param name="language" value="chr"/>
+            </conditional>
             <param name="input_file" value="test_input.pdf"/>
-            <param name="tessdata" value="test_tessdata"/>
-            <param name="language" value="chr"/>
             <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf,tessedit_create_hocr,tessedit_create_tsv"/>
             <param name="psm" value="11"/>
             <output name="output_hocr">
@@ -251,6 +283,25 @@
             <output name="output_tsv" file="pdf_output.tsv"/>
             <output name="output_text" file="pdf_output.txt"/>
             <output name="output_pdf" file="pdf_output.pdf"/>
+        </test>     
+        <test expect_num_outputs="4">
+            <conditional name="models">
+                <param name="models_select" value="own"/>
+                <param name="tessdata" value="tessdata/chr.traineddata"/>
+            </conditional>
+            <param name="input_file" value="test_input.pdf"/>
+            <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf,tessedit_create_hocr,tessedit_create_tsv"/>
+            <param name="psm" value="11"/>
+            <output name="output_hocr">
+                <assert_contents>
+                    <has_text text="Ꮳ"/>
+                    <has_text text="ᏌᎠᏯᏙᏣᎠ"/>
+                    <has_size value="13185" delta="10"/>
+                </assert_contents>
+            </output>
+            <output name="output_tsv" file="pdf_output.tsv"/>
+            <output name="output_text" file="pdf_output.txt"/>
+            <output name="output_pdf" file="pdf_output.pdf"/>
+        </test>
     </tests>
     <help><![CDATA[