Mercurial > repos > iuc > tesseract
changeset 0:49dbf2f5b0e2 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/tesseract/ commit f1ec0d8eddacc5d5e0ca3bc112005f4f2d2597f6
author | iuc |
---|---|
date | Tue, 24 Jun 2025 09:16:57 +0000 |
parents | |
children | d39f16771b62 |
files | macros.xml tesseract.xml test-data/eng.user-patterns test-data/eng.user-words test-data/eurotext.png test-data/output.pdf test-data/output.tsv test-data/output.txt test-data/tessdata.loc test-data/tessdata/chr.traineddata test-data/test_image_cherokee.png tool-data/tessdata.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 14 files changed, 327 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,23 @@ +<macros> + <token name="@TOOL_VERSION@">5.0.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">tesseract</requirement> + </requirements> + </xml> + <xml name="creators"> + <creator> + <person givenName="Rand" familyName="Zoabi" url="https://github.com/RZ9082" identifier="https://orcid.org/0009-0000-2501-8053" /> + </creator> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1145/1815330.1815339</citation> + <citation type="doi">10.1145/1577802.1577804</citation> + <citation type="doi">10.1145/1577802.1577809</citation> + <citation type="doi">10.1109/ICDAR.2009.257</citation> + <citation type="doi">10.1109/ICDAR.2007.4376991</citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tesseract.xml Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,247 @@ +<tool id="tesseract" name="Tesseract" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.2"> + <description>Optical Character Recognition</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <expand macro="creators" /> + <command detect_errors="exit_code"><![CDATA[ + echo '$input_file' | tr ',' '\n' > img_paths && + tesseract img_paths output + --tessdata-dir '${tessdata.fields.path}' + #if $language: + -l ${ str($language).replace(",","+") } + #end if + --psm $psm + #if $dpi: + --dpi $dpi + #end if + #if $user_words + --user-words '$user_words' + #end if + #if $user_patterns + --user-patterns '$user_patterns' + #end if + #for $format in $output_formats + -c $format=1 + #end for + ]]></command> + <inputs> + <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp" label="Image file(s)" multiple="true"/> + <param name="tessdata" type="select" label="Tessdata" help="Language data models"> + <options from_data_table="tessdata"> + <column name="value" index="0"/> + <column name="name" index="1"/> + <column name="version" index="2"/> + <column name="path" index="3"/> + <filter type="sort_by" column="1"/> + </options> + <validator type="no_options" message="A built-in tesseract model is not available. Please ask the Galaxy admins to install one on the server."/> + </param> + <param name="user_words" type="data" format="txt" label="User words file" optional="true" help="The user words file allows you to specify a list of words that Tesseract should treat as known words. One word per line"/> + <param name="user_patterns" type="data" format="txt" label="User patterns file" optional="true" help="One pattern per line in UTF-8 format. For more information please visit the tesseract docs about patterns linked in the help section"/> + <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected"> + <option value="afr">Afrikaans</option> + <option value="amh">Amharic</option> + <option value="ara">Arabic</option> + <option value="asm">Assamese</option> + <option value="aze">Azerbaijani</option> + <option value="aze_cyrl">Azerbaijani - Cyrilic</option> + <option value="bel">Belarusian</option> + <option value="ben">Bengali</option> + <option value="bod">Tibetan</option> + <option value="bos">Bosnian</option> + <option value="bre">Breton</option> + <option value="bul">Bulgarian</option> + <option value="cat">Catalan; Valencian</option> + <option value="ceb">Cebuano</option> + <option value="ces">Czech</option> + <option value="chi_sim">Chinese simplified</option> + <option value="chi_tra">Chinese traditional</option> + <option value="chr">Cherokee</option> + <option value="cos">Corsican</option> + <option value="cym">Welsh</option> + <option value="dan">Danish</option> + <option value="deu">German</option> + <option value="deu_latf">German Fraktur Latin</option> + <option value="div">Dhivehi</option> + <option value="dzo">Dzongkha</option> + <option value="ell">Greek, Modern, 1453-</option> + <option value="eng">English</option> + <option value="enm">English, Middle, 1100-1500</option> + <option value="epo">Esperanto</option> + <option value="equ">Math / equation detection module</option> + <option value="est">Estonian</option> + <option value="eus">Basque</option> + <option value="fas">Persian</option> + <option value="fao">Faroese</option> + <option value="fil">Filipino</option> + <option value="fin">Finnish</option> + <option value="fra">French</option> + <option value="frm">French, Middle, ca.1400-1600</option> + <option value="fry">West Frisian</option> + <option value="gla">Scottish Gaelic</option> + <option value="gle">Irish</option> + <option value="glg">Galician</option> + <option value="grc">Greek, Ancient, to 1453</option> + <option value="guj">Gujarati</option> + <option value="hat">Haitian; Haitian Creole</option> + <option value="heb">Hebrew</option> + <option value="hin">Hindi</option> + <option value="hrv">Croatian</option> + <option value="hun">Hungarian</option> + <option value="hye">Armenian</option> + <option value="iku">Inuktitut</option> + <option value="ind">Indonesian</option> + <option value="isl">Icelandic</option> + <option value="ita">Italian</option> + <option value="ita_old">Italian - Old</option> + <option value="jav">Javanese</option> + <option value="jpn">Japanese</option> + <option value="kan">Kannada</option> + <option value="kat">Georgian</option> + <option value="kat_old">Georgian - Old</option> + <option value="kaz">Kazakh</option> + <option value="khm">Central Khmer</option> + <option value="kir">Kirghiz; Kyrgyz</option> + <option value="kmr">Kurdish Kurmanji</option> + <option value="kor">Korean</option> + <option value="kor_vert">Korean vertical</option> + <option value="lao">Lao</option> + <option value="lat">Latin</option> + <option value="lav">Latvian</option> + <option value="lit">Lithuanian</option> + <option value="ltz">Luxembourgish</option> + <option value="mal">Malayalam</option> + <option value="mar">Marathi</option> + <option value="mkd">Macedonian</option> + <option value="mlt">Maltese</option> + <option value="mon">Mongolian</option> + <option value="mri">Maori</option> + <option value="msa">Malay</option> + <option value="mya">Burmese</option> + <option value="nep">Nepali</option> + <option value="nld">Dutch; Flemish</option> + <option value="nor">Norwegian</option> + <option value="oci">Occitan post 1500</option> + <option value="ori">Oriya</option> + <option value="osd">Orientation and script detection module</option> + <option value="pan">Panjabi; Punjabi</option> + <option value="pol">Polish</option> + <option value="por">Portuguese</option> + <option value="pus">Pushto; Pashto</option> + <option value="que">Quechua</option> + <option value="ron">Romanian; Moldavian; Moldovan</option> + <option value="rus">Russian</option> + <option value="san">Sanskrit</option> + <option value="sin">Sinhala; Sinhalese</option> + <option value="slk">Slovak</option> + <option value="slv">Slovenian</option> + <option value="snd">Sindhi</option> + <option value="spa">Spanish; Castilian</option> + <option value="spa_old">Spanish; Castilian - Old</option> + <option value="sqi">Albanian</option> + <option value="srp">Serbian</option> + <option value="srp_latn">Serbian - Latin</option> + <option value="sun">Sundanese</option> + <option value="swa">Swahili</option> + <option value="swe">Swedish</option> + <option value="syr">Syriac</option> + <option value="tam">Tamil</option> + <option value="tat">Tatar</option> + <option value="tel">Telugu</option> + <option value="tgk">Tajik</option> + <option value="tha">Thai</option> + <option value="tir">Tigrinya</option> + <option value="ton">Tonga</option> + <option value="tur">Turkish</option> + <option value="uig">Uighur; Uyghur</option> + <option value="ukr">Ukrainian</option> + <option value="urd">Urdu</option> + <option value="uzb">Uzbek</option> + <option value="uzb_cyrl">Uzbek - Cyrilic</option> + <option value="vie">Vietnamese</option> + <option value="yid">Yiddish</option> + <option value="yor">Yoruba</option> + </param> + <param name="output_formats" type="select" label="Output format(s)" multiple="true" optional="false"> + <option value="tessedit_create_txt" selected="true">Text</option> + <option value="tessedit_create_pdf">PDF</option> + <option value="tessedit_create_hocr">HOCR</option> + <option value="tessedit_create_tsv">TSV</option> + </param> + <param argument="--psm" type="select" label="Page Segmentation Mode (PSM)" help="How the page layout is interpreted." optional="true"> + <option value="0">Orientation and script detection only</option> + <option value="1">Automatic page segmentation with OSD</option> + <option value="2">Automatic page segmentation, but no OSD, or OCR</option> + <option value="3" selected="true">Fully automatic page segmentation, but no OSD</option> + <option value="4">Assume a single column of text of variable sizes</option> + <option value="5">Assume a single uniform block of vertically aligned text</option> + <option value="6">Assume a single uniform block of text</option> + <option value="7">Treat the image as a single text line</option> + <option value="8">Treat the image as a single word</option> + <option value="9">Treat the image as a single word in a circle</option> + <option value="10">Treat the image as a single character</option> + <option value="11">Sparse text. Find as much text as possible in no particular order</option> + <option value="12">Sparse text with OSD</option> + <option value="13">Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific</option> + </param> + <param argument="--dpi" type="integer" label="Image DPI (dots per inch)" min="100" help="When left empty, the resolution in retrieved from the image metadata. If this information in not icluded, Tesseract will make a guess. Tesseract performes best on images with at least 300 dpi" optional="true" /> + </inputs> + <outputs> + <data name="output_text" format="txt" from_work_dir="output.txt" label="${tool.name} on ${on_string}: Text"> + <filter>'tessedit_create_txt' in output_formats</filter> + </data> + <data name="output_pdf" format="pdf" from_work_dir="output.pdf" label="${tool.name} on ${on_string}: PDF"> + <filter>'tessedit_create_pdf' in output_formats</filter> + </data> + <data name="output_hocr" format="html" from_work_dir="output.hocr" label="${tool.name} on ${on_string}: HOCR"> + <filter>'tessedit_create_hocr' in output_formats</filter> + </data> + <data name="output_tsv" format="tsv" from_work_dir="output.tsv" label="${tool.name} on ${on_string}: TSV"> + <filter>'tessedit_create_tsv' in output_formats</filter> + </data> + </outputs> + <tests> + <test expect_num_outputs="2"> + <param name="input_file" value="eurotext.png,test_image_cherokee.png"/> + <param name="tessdata" value="test_tessdata"/> + <param name="user_words" value="eng.user-words"/> + <param name="user_patterns" value="eng.user-patterns"/> + <param name="language" value="chr"/> + <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf"/> + <param name="psm" value="3"/> + <output name="output_text" file="output.txt"/> + <output name="output_pdf" file="output.pdf"/> + </test> + <test expect_num_outputs="2"> + <param name="input_file" value="test_image_cherokee.png"/> + <param name="tessdata" value="test_tessdata"/> + <param name="language" value="chr"/> + <param name="output_formats" value="tessedit_create_hocr,tessedit_create_tsv"/> + <param name="psm" value="11"/> + <output name="output_hocr"> + <assert_contents> + <has_text text="Ꮳ"/> + <has_text text="ᏌᎠᏯᏙᏣᎠ"/> + <has_size value="1785" delta="10"/> + </assert_contents> + </output> + <output name="output_tsv" file="output.tsv"/> + </test> + </tests> + <help><![CDATA[ +Tesseract OCR Tool +------------------ +Tesseract is an OCR engine with support for unicode and the ability to recognize more than 100 languages out of the box. + +* `Tesseract User Manual <https://tesseract-ocr.github.io/tessdoc/>`_ + +* `API example for user patterns <https://tesseract-ocr.github.io/tessdoc/APIExample-user_patterns.html>`_ + +**License** + +* `Apache-2.0 <https://raw.githubusercontent.com/tesseract-ocr/tesseract/refs/heads/main/LICENSE>`_ + ]]></help> + <expand macro="citations" /> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/eng.user-patterns Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,2 @@ +1-\d\d\d-GOOG-411 +www.\n\\\*.com
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/eng.user-words Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,5 @@ +the +quick +brown +fox +jumped
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.tsv Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,12 @@ +level page_num block_num par_num line_num word_num left top width height conf text +1 1 0 0 0 0 0 0 1024 1024 -1 +2 1 1 0 0 0 92 296 875 130 -1 +3 1 1 1 0 0 92 296 875 130 -1 +4 1 1 1 1 0 92 296 875 130 -1 +5 1 1 1 1 1 92 297 174 129 37.403927 Ꮳ +5 1 1 1 1 2 286 296 681 130 16.962364 ᏌᎠᏯᏙᏣᎠ +2 1 2 0 0 0 164 572 694 133 -1 +3 1 2 1 0 0 164 572 694 133 -1 +4 1 2 1 1 0 164 572 694 133 -1 +5 1 2 1 1 1 164 568 364 184 53.049427 ᎪᎶᎩ) +5 1 2 1 1 2 650 572 200 132 35.113586 Ꮾ(Ꭰ
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.txt Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,14 @@ +ᎢᏂᏮ (ᏄᏌᎥᏟᏦ) (ᏏᎵᏣᎳᏲ). (Ꮅ9Ꮶ). ᎫᏢᏚᎥ +ᎤᏙᏮᎢ (ᏂᏮ ᏕᏎ3Ꮷ,Ꮞ5Ꮪ6.ᏤᎦ “ᎥᏧᏃᎩ- Ꮱ90 ᏧᎾᏴ +(Ꮭ ᏧᏄᏟᏦᎥᏙᏴᏣᎾᏚᏮ, 3Ꮪ 12. 506 ᎾᎴ Ꭼ--Ꮑ311 +ᎥᎢᏣᏁᎸ ᏄᏚᏢᎸ ᎠᏁ 1161 (ᏇᎳᏮᏌᏚᎥ (6. ᏟᏣᎥᎸ ᎥᏚ ᏚᏢᎸᏒ. +ᎠᎴᎢ ..ᏚᏟᏂᏒᏮᎥ16” ᏏᎠᎱᎸᏄᎠᏮ ᎬᎿᏟᏂᏚ ᏚᏢᎱᎥᎸᏰᎥ +(ᏏᏮᎢ ᏧᏮᎯᏒ (Ꭿ016(Ꮢ ᎻᏄᏌᏲᏁᏧ. ᏞᏮ ᎢᏣᏁᎸᎢᏧ ᏏᎢᏛᏌᏲ +ᏄᎢᏄᏢᎠᎥᏧᏮᏅ ᏚᎸᏌᎥᏮ ᎠᎧᏄᎱ-ᏄᎶᏮᏚᏚᏌᏚ 16 ᏟᏂᎥᏮᏁ +ᎠᏰᎸᎱᏮᏚᏚᏮᏔᏦ. ᏞᏘ ᏙᎾᎤᎥᏢᏮ (131ᎢᏣᎠ6 ᎱᎸᎠᎥᏧᎸ +ᏚᎸ1(3 ᏚᏅᏣᏢᎢᎸ ᎥᎥ ᏟᏱᏒᏮ ᎠᎥᏪᎱᎾ. ᎬᎥ ᏃᎾᎱᎱᎾ +ᎢᏁᎸᎢᎱᏫᏒ ᎱᏰᎠᎥᏧᎾ ᏚᏰᎥᎥᎸ ᏚᎾᏏᎱᏮ ᏮᎥ ᎠᎾᎥᎱᎾ +ᎠᏮᎱᏮᏃᏣᏚᎾ. Ꭺ ᎢᎸᎠᏣᏚᏘ (ᏁᎪ1ᎢᏣᎥᏁ ᎢᎦᏁᎠᎥᏧᎸᎸ +ᏚᎸ1(3 ᏚᎾᏣᎠᎢᏮ Ꮕ ᏟᎸᏣ ᎠᎱᎾᎬᏪᏌᎥᏣᎾᏚᎾ. +Ꮳ ᏌᎠᏯᏙᏣᎠ +ᎪᎶᎩ)1 Ꮾ6Ꭰ
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tessdata.loc Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,1 @@ +test_tessdata Default 4.1.0 4.1.0 ${__HERE__}/tessdata \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/tessdata.loc.sample Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,9 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a tessdata model folder. The tessdata.loc +#file needs this format (longer white space is the TAB character): + +#<unique_build_id> <display_name> <version> <DB_folder_path> + +# for example: + +# tessdata name 0.1 0.1 /data/tessdata \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Location of tessdata files --> + <table name="tessdata" comment_char="#"> + <columns>value, name, version, path</columns> + <file path="tool-data/tessdata.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Jun 24 09:16:57 2025 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Location of tessdata files --> + <table name="tessdata" comment_char="#"> + <columns>value, name, version, path</columns> + <file path="${__HERE__}/test-data/tessdata.loc" /> + </table> +</tables> \ No newline at end of file