Mercurial > repos > iuc > tesseract
diff tesseract.xml @ 4:e36d806cc4c7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/tesseract commit da3b71e3076fb7475d1bb9c96293a0c83da8a62e
| author | iuc |
|---|---|
| date | Wed, 14 Jan 2026 16:15:36 +0000 |
| parents | 521bcbb5aa16 |
| children | a3c79d2b1041 |
line wrap: on
line diff
--- a/tesseract.xml Tue Nov 04 13:42:50 2025 +0000
+++ b/tesseract.xml Wed Jan 14 16:15:36 2026 +0000
@@ -13,10 +13,22 @@
         #else
             echo '$input_file' > img_paths &&
         #end if
-        tesseract img_paths output
-        --tessdata-dir '${tessdata.fields.path}'
-        #if $language:
-            -l ${ str($language).replace(",","+") }
+        #if str($models.models_select) == 'own'
+            mkdir tessdata_dir && ## stage the user-supplied models in a private tessdata directory
+            #set languages = ""
+            #for $model in $models.tessdata
+                ln -s '${model}' 'tessdata_dir/${ str($model.element_identifier).replace(".traineddata", "") }.traineddata' && ## link is always named <lang>.traineddata, quoted because the name is user-controlled
+                #set languages = languages + ("+" if languages else "") + $model.element_identifier.replace(".traineddata", "")
+            #end for
+            tesseract img_paths output
+            --tessdata-dir tessdata_dir
+            -l '$languages'
+        #else
+            tesseract img_paths output
+            --tessdata-dir '${models.tessdata.fields.path}'
+            #if $models.language:
+                -l ${ str($models.language).replace(",","+") }
+            #end if
         #end if
         --psm $psm
         #if $dpi:
@@ -34,142 +46,153 @@
     ]]></command>
     <inputs>
         <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp,pdf" label="Image file(s)" multiple="false"/>
-        <param name="tessdata" type="select" label="Tessdata" help="Language data models">
-            <options from_data_table="tessdata">
-                <column name="value" index="0"/>
-                <column name="name" index="1"/>
-                <column name="version" index="2"/>
-                <column name="path" index="3"/>
-                <filter type="sort_by" column="1"/>
-            </options>
-            <validator type="no_options" message="A built-in tesseract model is not available.
Please ask the Galaxy admins to install one on the server."/>
-        </param>
+        <conditional name="models">
+            <param name="models_select" type="select" label="Choose between official or own models">
+                <option value="official">Tesseract official models</option>
+                <option value="own">Own models</option>
+            </param>
+            <when value="official">
+                <param name="tessdata" type="select" label="Tessdata" help="Language data models">
+                    <options from_data_table="tessdata">
+                        <column name="value" index="0"/>
+                        <column name="name" index="1"/>
+                        <column name="version" index="2"/>
+                        <column name="path" index="3"/>
+                        <filter type="sort_by" column="1"/>
+                    </options>
+                    <validator type="no_options" message="A built-in tesseract model is not available. Please ask the Galaxy admins to install one on the server."/>
+                </param>
+                <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of multilingual image(s), more than one language can be selected" optional="false">
+                    <option value="afr">Afrikaans</option>
+                    <option value="amh">Amharic</option>
+                    <option value="ara">Arabic</option>
+                    <option value="asm">Assamese</option>
+                    <option value="aze">Azerbaijani</option>
+                    <option value="aze_cyrl">Azerbaijani - Cyrillic</option>
+                    <option value="bel">Belarusian</option>
+                    <option value="ben">Bengali</option>
+                    <option value="bod">Tibetan</option>
+                    <option value="bos">Bosnian</option>
+                    <option value="bre">Breton</option>
+                    <option value="bul">Bulgarian</option>
+                    <option value="cat">Catalan; Valencian</option>
+                    <option value="ceb">Cebuano</option>
+                    <option value="ces">Czech</option>
+                    <option value="chi_sim">Chinese simplified</option>
+                    <option value="chi_tra">Chinese traditional</option>
+                    <option value="chr">Cherokee</option>
+                    <option value="cos">Corsican</option>
+                    <option value="cym">Welsh</option>
+                    <option value="dan">Danish</option>
+                    <option value="deu">German</option>
+                    <option value="deu_latf">German Fraktur Latin</option>
+                    <option
value="div">Dhivehi</option> + <option value="dzo">Dzongkha</option> + <option value="ell">Greek, Modern, 1453-</option> + <option value="eng" selected="true">English</option> + <option value="enm">English, Middle, 1100-1500</option> + <option value="epo">Esperanto</option> + <option value="equ">Math / equation detection module</option> + <option value="est">Estonian</option> + <option value="eus">Basque</option> + <option value="fas">Persian</option> + <option value="fao">Faroese</option> + <option value="fil">Filipino</option> + <option value="fin">Finnish</option> + <option value="fra">French</option> + <option value="frm">French, Middle, ca.1400-1600</option> + <option value="fry">West Frisian</option> + <option value="gla">Scottish Gaelic</option> + <option value="gle">Irish</option> + <option value="glg">Galician</option> + <option value="grc">Greek, Ancient, to 1453</option> + <option value="guj">Gujarati</option> + <option value="hat">Haitian; Haitian Creole</option> + <option value="heb">Hebrew</option> + <option value="hin">Hindi</option> + <option value="hrv">Croatian</option> + <option value="hun">Hungarian</option> + <option value="hye">Armenian</option> + <option value="iku">Inuktitut</option> + <option value="ind">Indonesian</option> + <option value="isl">Icelandic</option> + <option value="ita">Italian</option> + <option value="ita_old">Italian - Old</option> + <option value="jav">Javanese</option> + <option value="jpn">Japanese</option> + <option value="kan">Kannada</option> + <option value="kat">Georgian</option> + <option value="kat_old">Georgian - Old</option> + <option value="kaz">Kazakh</option> + <option value="khm">Central Khmer</option> + <option value="kir">Kirghiz; Kyrgyz</option> + <option value="kmr">Kurdish Kurmanji</option> + <option value="kor">Korean</option> + <option value="kor_vert">Korean vertical</option> + <option value="lao">Lao</option> + <option value="lat">Latin</option> + <option value="lav">Latvian</option> + <option 
value="lit">Lithuanian</option> + <option value="ltz">Luxembourgish</option> + <option value="mal">Malayalam</option> + <option value="mar">Marathi</option> + <option value="mkd">Macedonian</option> + <option value="mlt">Maltese</option> + <option value="mon">Mongolian</option> + <option value="mri">Maori</option> + <option value="msa">Malay</option> + <option value="mya">Burmese</option> + <option value="nep">Nepali</option> + <option value="nld">Dutch; Flemish</option> + <option value="nor">Norwegian</option> + <option value="oci">Occitan post 1500</option> + <option value="ori">Oriya</option> + <option value="osd">Orientation and script detection module</option> + <option value="pan">Panjabi; Punjabi</option> + <option value="pol">Polish</option> + <option value="por">Portuguese</option> + <option value="pus">Pushto; Pashto</option> + <option value="que">Quechua</option> + <option value="ron">Romanian; Moldavian; Moldovan</option> + <option value="rus">Russian</option> + <option value="san">Sanskrit</option> + <option value="sin">Sinhala; Sinhalese</option> + <option value="slk">Slovak</option> + <option value="slv">Slovenian</option> + <option value="snd">Sindhi</option> + <option value="spa">Spanish; Castilian</option> + <option value="spa_old">Spanish; Castilian - Old</option> + <option value="sqi">Albanian</option> + <option value="srp">Serbian</option> + <option value="srp_latn">Serbian - Latin</option> + <option value="sun">Sundanese</option> + <option value="swa">Swahili</option> + <option value="swe">Swedish</option> + <option value="syr">Syriac</option> + <option value="tam">Tamil</option> + <option value="tat">Tatar</option> + <option value="tel">Telugu</option> + <option value="tgk">Tajik</option> + <option value="tha">Thai</option> + <option value="tir">Tigrinya</option> + <option value="ton">Tonga</option> + <option value="tur">Turkish</option> + <option value="uig">Uighur; Uyghur</option> + <option value="ukr">Ukrainian</option> + <option 
value="urd">Urdu</option>
+                    <option value="uzb">Uzbek</option>
+                    <option value="uzb_cyrl">Uzbek - Cyrillic</option>
+                    <option value="vie">Vietnamese</option>
+                    <option value="yid">Yiddish</option>
+                    <option value="yor">Yoruba</option>
+                </param>
+            </when>
+            <when value="own">
+                <param name="tessdata" type="data" format="binary" label="Tessdata" help="Trained language model file(s). Each dataset must be named after its language code, e.g. chr.traineddata; that name is passed to tesseract as the language" multiple="true"/>
+            </when>
+        </conditional>
         <param name="user_words" type="data" format="txt" label="User words file" optional="true" help="The user words file allows you to specify a list of words that Tesseract should treat as known words. One word per line"/>
         <param name="user_patterns" type="data" format="txt" label="User patterns file" optional="true" help="One pattern per line in UTF-8 format. For more information please visit the tesseract docs about patterns linked in the help section"/>
-        <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected" optional="false">
-            <option value="afr">Afrikaans</option>
-            <option value="amh">Amharic</option>
-            <option value="ara">Arabic</option>
-            <option value="asm">Assamese</option>
-            <option value="aze">Azerbaijani</option>
-            <option value="aze_cyrl">Azerbaijani - Cyrilic</option>
-            <option value="bel">Belarusian</option>
-            <option value="ben">Bengali</option>
-            <option value="bod">Tibetan</option>
-            <option value="bos">Bosnian</option>
-            <option value="bre">Breton</option>
-            <option value="bul">Bulgarian</option>
-            <option value="cat">Catalan; Valencian</option>
-            <option value="ceb">Cebuano</option>
-            <option value="ces">Czech</option>
-            <option value="chi_sim">Chinese simplified</option>
-            <option value="chi_tra">Chinese traditional</option>
-            <option value="chr">Cherokee</option>
-            <option value="cos">Corsican</option>
-            <option value="cym">Welsh</option>
-            <option value="dan">Danish</option>
-            <option value="deu">German</option>
-            <option
value="deu_latf">German Fraktur Latin</option> - <option value="div">Dhivehi</option> - <option value="dzo">Dzongkha</option> - <option value="ell">Greek, Modern, 1453-</option> - <option value="eng" selected="true">English</option> - <option value="enm">English, Middle, 1100-1500</option> - <option value="epo">Esperanto</option> - <option value="equ">Math / equation detection module</option> - <option value="est">Estonian</option> - <option value="eus">Basque</option> - <option value="fas">Persian</option> - <option value="fao">Faroese</option> - <option value="fil">Filipino</option> - <option value="fin">Finnish</option> - <option value="fra">French</option> - <option value="frm">French, Middle, ca.1400-1600</option> - <option value="fry">West Frisian</option> - <option value="gla">Scottish Gaelic</option> - <option value="gle">Irish</option> - <option value="glg">Galician</option> - <option value="grc">Greek, Ancient, to 1453</option> - <option value="guj">Gujarati</option> - <option value="hat">Haitian; Haitian Creole</option> - <option value="heb">Hebrew</option> - <option value="hin">Hindi</option> - <option value="hrv">Croatian</option> - <option value="hun">Hungarian</option> - <option value="hye">Armenian</option> - <option value="iku">Inuktitut</option> - <option value="ind">Indonesian</option> - <option value="isl">Icelandic</option> - <option value="ita">Italian</option> - <option value="ita_old">Italian - Old</option> - <option value="jav">Javanese</option> - <option value="jpn">Japanese</option> - <option value="kan">Kannada</option> - <option value="kat">Georgian</option> - <option value="kat_old">Georgian - Old</option> - <option value="kaz">Kazakh</option> - <option value="khm">Central Khmer</option> - <option value="kir">Kirghiz; Kyrgyz</option> - <option value="kmr">Kurdish Kurmanji</option> - <option value="kor">Korean</option> - <option value="kor_vert">Korean vertical</option> - <option value="lao">Lao</option> - <option 
value="lat">Latin</option> - <option value="lav">Latvian</option> - <option value="lit">Lithuanian</option> - <option value="ltz">Luxembourgish</option> - <option value="mal">Malayalam</option> - <option value="mar">Marathi</option> - <option value="mkd">Macedonian</option> - <option value="mlt">Maltese</option> - <option value="mon">Mongolian</option> - <option value="mri">Maori</option> - <option value="msa">Malay</option> - <option value="mya">Burmese</option> - <option value="nep">Nepali</option> - <option value="nld">Dutch; Flemish</option> - <option value="nor">Norwegian</option> - <option value="oci">Occitan post 1500</option> - <option value="ori">Oriya</option> - <option value="osd">Orientation and script detection module</option> - <option value="pan">Panjabi; Punjabi</option> - <option value="pol">Polish</option> - <option value="por">Portuguese</option> - <option value="pus">Pushto; Pashto</option> - <option value="que">Quechua</option> - <option value="ron">Romanian; Moldavian; Moldovan</option> - <option value="rus">Russian</option> - <option value="san">Sanskrit</option> - <option value="sin">Sinhala; Sinhalese</option> - <option value="slk">Slovak</option> - <option value="slv">Slovenian</option> - <option value="snd">Sindhi</option> - <option value="spa">Spanish; Castilian</option> - <option value="spa_old">Spanish; Castilian - Old</option> - <option value="sqi">Albanian</option> - <option value="srp">Serbian</option> - <option value="srp_latn">Serbian - Latin</option> - <option value="sun">Sundanese</option> - <option value="swa">Swahili</option> - <option value="swe">Swedish</option> - <option value="syr">Syriac</option> - <option value="tam">Tamil</option> - <option value="tat">Tatar</option> - <option value="tel">Telugu</option> - <option value="tgk">Tajik</option> - <option value="tha">Thai</option> - <option value="tir">Tigrinya</option> - <option value="ton">Tonga</option> - <option value="tur">Turkish</option> - <option value="uig">Uighur; 
Uyghur</option> - <option value="ukr">Ukrainian</option> - <option value="urd">Urdu</option> - <option value="uzb">Uzbek</option> - <option value="uzb_cyrl">Uzbek - Cyrilic</option> - <option value="vie">Vietnamese</option> - <option value="yid">Yiddish</option> - <option value="yor">Yoruba</option> - </param> <param name="output_formats" type="select" label="Output format(s)" multiple="true" optional="false"> <option value="tessedit_create_txt" selected="true">Text</option> <option value="tessedit_create_pdf">PDF</option> @@ -210,20 +233,26 @@ </outputs> <tests> <test expect_num_outputs="2"> + <conditional name="models"> + <param name="models_select" value="official"/> + <param name="tessdata" value="test_tessdata"/> + <param name="language" value="chr"/> + </conditional> <param name="input_file" value="eurotext.png"/> - <param name="tessdata" value="test_tessdata"/> + <param name="user_patterns" value="eng.user-patterns"/> <param name="user_words" value="eng.user-words"/> - <param name="user_patterns" value="eng.user-patterns"/> - <param name="language" value="chr"/> <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf"/> <param name="psm" value="3"/> <output name="output_text" file="image_output.txt"/> <output name="output_pdf" file="image_output.pdf"/> </test> <test expect_num_outputs="2"> + <conditional name="models"> + <param name="models_select" value="official"/> + <param name="tessdata" value="test_tessdata"/> + <param name="language" value="chr"/> + </conditional> <param name="input_file" value="test_image_cherokee.png"/> - <param name="tessdata" value="test_tessdata"/> - <param name="language" value="chr"/> <param name="output_formats" value="tessedit_create_hocr,tessedit_create_tsv"/> <param name="psm" value="11"/> <output name="output_hocr"> @@ -236,9 +265,12 @@ <output name="output_tsv" file="image_output.tsv"/> </test> <test expect_num_outputs="4"> + <conditional name="models"> + <param name="models_select" value="official"/> + 
<param name="tessdata" value="test_tessdata"/> + <param name="language" value="chr"/> + </conditional> <param name="input_file" value="test_input.pdf"/> - <param name="tessdata" value="test_tessdata"/> - <param name="language" value="chr"/> <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf,tessedit_create_hocr,tessedit_create_tsv"/> <param name="psm" value="11"/> <output name="output_hocr"> @@ -251,6 +283,25 @@ <output name="output_tsv" file="pdf_output.tsv"/> <output name="output_text" file="pdf_output.txt"/> <output name="output_pdf" file="pdf_output.pdf"/> + </test> + <test expect_num_outputs="4"> + <param name="input_file" value="test_input.pdf"/> + <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf,tessedit_create_hocr,tessedit_create_tsv"/> + <param name="psm" value="11"/> + <conditional name="models"> + <param name="models_select" value="own"/> + <param name="tessdata" value="tessdata/chr.traineddata"/> + </conditional> + <output name="output_hocr"> + <assert_contents> + <has_text text="Ꮳ"/> + <has_text text="ᏌᎠᏯᏙᏣᎠ"/> + <has_size value="13185" delta="10"/> + </assert_contents> + </output> + <output name="output_tsv" file="pdf_output.tsv"/> + <output name="output_text" file="pdf_output.txt"/> + <output name="output_pdf" file="pdf_output.pdf"/> </test> </tests> <help><