Mercurial > repos > iuc > tesseract
diff tesseract.xml @ 2:56de4ac77c41 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/tesseract commit a5652d3d3d6a07d96d3898659f0254d8265b8215
| author | iuc |
|---|---|
| date | Wed, 29 Oct 2025 17:38:19 +0000 |
| parents | d39f16771b62 |
| children | 521bcbb5aa16 |
line wrap: on
line diff
--- a/tesseract.xml Wed Jul 09 10:36:11 2025 +0000 +++ b/tesseract.xml Wed Oct 29 17:38:19 2025 +0000 @@ -6,7 +6,13 @@ <expand macro="requirements" /> <expand macro="creators" /> <command detect_errors="exit_code"><![CDATA[ - echo '$input_file' | tr ',' '\n' > img_paths && + #if str($input_file.ext) == "pdf" + mkdir extracted_images && + pdfimages -png '$input_file' extracted_images/page && + ls extracted_images/page*.png > img_paths && + #else + echo '$input_file' > img_paths && + #end if tesseract img_paths output --tessdata-dir '${tessdata.fields.path}' #if $language: @@ -27,7 +33,7 @@ #end for ]]></command> <inputs> - <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp" label="Image file(s)" multiple="true"/> + <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp,pdf" label="Image file(s)" multiple="false"/> <param name="tessdata" type="select" label="Tessdata" help="Language data models"> <options from_data_table="tessdata"> <column name="value" index="0"/> @@ -40,7 +46,7 @@ </param> <param name="user_words" type="data" format="txt" label="User words file" optional="true" help="The user words file allows you to specify a list of words that Tesseract should treat as known words. One word per line"/> <param name="user_patterns" type="data" format="txt" label="User patterns file" optional="true" help="One pattern per line in UTF-8 format. For more information please visit the tesseract docs about patterns linked in the help section"/> - <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected"> + <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected" optional="false"> <option value="afr">Afrikaans</option> <option value="amh">Amharic</option> <option value="ara">Arabic</option> @@ -204,15 +210,15 @@ </outputs> <tests> <test expect_num_outputs="2"> - <param name="input_file" value="eurotext.png,test_image_cherokee.png"/> + <param name="input_file" value="eurotext.png"/> <param name="tessdata" value="test_tessdata"/> <param name="user_words" value="eng.user-words"/> <param name="user_patterns" value="eng.user-patterns"/> <param name="language" value="chr"/> <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf"/> <param name="psm" value="3"/> - <output name="output_text" file="output.txt"/> - <output name="output_pdf" file="output.pdf"/> + <output name="output_text" file="image_output.txt"/> + <output name="output_pdf" file="image_output.pdf"/> </test> <test expect_num_outputs="2"> <param name="input_file" value="test_image_cherokee.png"/> @@ -227,7 +233,24 @@ <has_size value="1805" delta="10"/> </assert_contents> </output> - <output name="output_tsv" file="output.tsv"/> + <output name="output_tsv" file="image_output.tsv"/> + </test> + <test expect_num_outputs="4"> + <param name="input_file" value="test_input.pdf"/> + <param name="tessdata" value="test_tessdata"/> + <param name="language" value="chr"/> + <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf,tessedit_create_hocr,tessedit_create_tsv"/> + <param name="psm" value="11"/> + <output name="output_hocr"> + <assert_contents> + <has_text text="Ꮳ"/> + <has_text text="ᏌᎠᏯᏙᏣᎠ"/> + <has_size value="13185" delta="10"/> + </assert_contents> + </output> + <output name="output_tsv" file="pdf_output.tsv"/> + <output name="output_text" file="pdf_output.txt"/> + <output name="output_pdf" file="pdf_output.pdf"/> </test> </tests> <help><![CDATA[ @@ -239,9 +262,13 @@ * `API example for user patterns <https://tesseract-ocr.github.io/tessdoc/APIExample-user_patterns.html>`_ -**License** +**Tesseract license** * `Apache-2.0 <https://raw.githubusercontent.com/tesseract-ocr/tesseract/refs/heads/main/LICENSE>`_ + +**Poppler license** + +* `GPL-2.0-only <https://gitlab.freedesktop.org/poppler/poppler/-/raw/master/COPYING>`_ ]]></help> <expand macro="citations" /> </tool>
