diff tesseract.xml @ 2:56de4ac77c41 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/tesseract commit a5652d3d3d6a07d96d3898659f0254d8265b8215
author iuc
date Wed, 29 Oct 2025 17:38:19 +0000
parents d39f16771b62
children 521bcbb5aa16
line wrap: on
line diff
--- a/tesseract.xml	Wed Jul 09 10:36:11 2025 +0000
+++ b/tesseract.xml	Wed Oct 29 17:38:19 2025 +0000
@@ -6,7 +6,13 @@
     <expand macro="requirements" />
     <expand macro="creators" />
     <command detect_errors="exit_code"><![CDATA[
-        echo '$input_file' | tr ',' '\n' > img_paths &&
+        #if $input_file.is_of_type("pdf")
+            mkdir extracted_images &&  ## scratch directory for the images pulled out of the PDF
+            pdfimages -png '$input_file' extracted_images/page &&  ## writes extracted_images/page-NNN.png, one per embedded image
+            ls extracted_images/page*.png > img_paths &&  ## tesseract is given this list file, one image path per line
+        #else
+            echo '$input_file' > img_paths &&  ## single image: the list file holds just its path
+        #end if
         tesseract img_paths output
         --tessdata-dir '${tessdata.fields.path}'
         #if $language:
@@ -27,7 +33,7 @@
         #end for
     ]]></command>
     <inputs>
-        <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp" label="Image file(s)" multiple="true"/>
+        <param name="input_file" type="data" format="jpg,png,tif,tiff,bmp,pdf" label="Image or PDF file" multiple="false" help="For PDF input, the embedded images are extracted with pdfimages and OCR is run on the extracted images"/>
         <param name="tessdata" type="select" label="Tessdata" help="Language data models">
             <options from_data_table="tessdata">
                 <column name="value" index="0"/>
@@ -40,7 +46,7 @@
         </param>
         <param name="user_words" type="data" format="txt" label="User words file" optional="true" help="The user words file allows you to specify a list of words that Tesseract should treat as known words. One word per line"/>
         <param name="user_patterns" type="data" format="txt" label="User patterns file" optional="true" help="One pattern per line in UTF-8 format. For more information please visit the tesseract docs about patterns linked in the help section"/>
-        <param name="language" type="select" label="OCR Language(s)" multiple="true" help="In the case of a multilingual image(s), more the one language can be selected">
+        <param name="language" type="select" label="OCR Language(s)" multiple="true" help="For multilingual images, more than one language can be selected" optional="false">
             <option value="afr">Afrikaans</option>
             <option value="amh">Amharic</option>
             <option value="ara">Arabic</option>
@@ -204,15 +210,15 @@
     </outputs>
     <tests>
         <test expect_num_outputs="2">
-            <param name="input_file" value="eurotext.png,test_image_cherokee.png"/>
+            <param name="input_file" value="eurotext.png"/>
             <param name="tessdata" value="test_tessdata"/>
             <param name="user_words" value="eng.user-words"/>
             <param name="user_patterns" value="eng.user-patterns"/>
             <param name="language" value="chr"/>
             <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf"/>
             <param name="psm" value="3"/>
-            <output name="output_text" file="output.txt"/>
-            <output name="output_pdf" file="output.pdf"/>
+            <output name="output_text" file="image_output.txt"/>
+            <output name="output_pdf" file="image_output.pdf"/>
         </test>
         <test expect_num_outputs="2">
             <param name="input_file" value="test_image_cherokee.png"/>
@@ -227,7 +233,24 @@
                     <has_size value="1805" delta="10"/>
                 </assert_contents>
             </output>
-            <output name="output_tsv" file="output.tsv"/>
+            <output name="output_tsv" file="image_output.tsv"/>
+        </test>
+        <test expect_num_outputs="4"> <!-- PDF input: covers the pdfimages extraction branch and requests four output formats (txt, pdf, hocr, tsv) -->
+            <param name="input_file" value="test_input.pdf"/>
+            <param name="tessdata" value="test_tessdata"/>
+            <param name="language" value="chr"/>
+            <param name="output_formats" value="tessedit_create_txt,tessedit_create_pdf,tessedit_create_hocr,tessedit_create_tsv"/>
+            <param name="psm" value="11"/>
+            <output name="output_hocr">
+                <assert_contents>
+                    <has_text text="Ꮳ"/>
+                    <has_text text="ᏌᎠᏯᏙᏣᎠ"/>
+                    <has_size value="13185" delta="10"/>
+                </assert_contents>
+            </output>
+            <output name="output_tsv" file="pdf_output.tsv"/>
+            <output name="output_text" file="pdf_output.txt"/>
+            <output name="output_pdf" file="pdf_output.pdf"/>
         </test>
     </tests>
     <help><![CDATA[
@@ -239,9 +262,13 @@
 
 * `API example for user patterns <https://tesseract-ocr.github.io/tessdoc/APIExample-user_patterns.html>`_
 
-**License**
+**Tesseract license**
 
 * `Apache-2.0 <https://raw.githubusercontent.com/tesseract-ocr/tesseract/refs/heads/main/LICENSE>`_
+
+**Poppler license**
+
+* `GPL-2.0-only <https://gitlab.freedesktop.org/poppler/poppler/-/raw/master/COPYING>`_
     ]]></help>
     <expand macro="citations" />
 </tool>