Mercurial > repos > iuc > pdfimages

<tool name="pdfimages" id="pdfimages" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="24.2" license="GPL-2.0-only">
    <description>Extract images from a PDF file</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="creators"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Images are extracted into ./outputs/, the directory scanned by the
        ## discover_datasets rules of the output collections.
        mkdir ./outputs/ &&
        pdfimages
        ## Optional page range: an unset optional integer renders as an empty
        ## string, in which case the corresponding flag is left out.
        #if str($f)
            -f $f
        #end if
        #if str($l)
            -l $l
        #end if
        ## Boolean parameter: renders as "-p" when enabled, as nothing otherwise.
        $p
        ## The select value is itself the command-line flag (-png / -tiff);
        ## no flag is passed when "ppm" is selected.
        #if str($output_format) != 'ppm'
            $output_format
        #end if
        ## Positional arguments: the input PDF, then the image root (output
        ## directory plus the validated user prefix).
        '$input_file'
        './outputs/$output_prefix'
    ]]></command>
    <inputs>
        <!-- PDF document whose embedded images are extracted. -->
        <param name="input_file"
               type="data"
               format="pdf"
               label="Input PDF file"/>

        <!-- The non-default option values are the literal pdfimages flags; the
             command template inserts them verbatim and adds no flag for "ppm". -->
        <param name="output_format"
               type="select"
               label="Select desired image format">
            <option value="ppm" selected="true">ppm</option>
            <option value="-png">png</option>
            <option value="-tiff">tiff</option>
        </param>

        <!-- Optional page range; each flag is only emitted when a value is set. -->
        <param argument="-f"
               type="integer"
               min="1"
               optional="true"
               label="First page"/>
        <param argument="-l"
               type="integer"
               min="1"
               optional="true"
               label="Last page"/>

        <!-- When enabled, "-p" is passed to pdfimages. -->
        <param argument="-p"
               type="boolean"
               truevalue="-p"
               falsevalue=""
               label="Include page numbers in output file names"/>

        <!-- The prefix is interpolated into the output path on the command line,
             so the validator restricts it to a conservative character set. -->
        <param name="output_prefix"
               type="text"
               value="image"
               label="Output name prefix">
            <validator type="regex" message="Use only letters (A–Z, a–z), numbers, underscores (_), or dashes (-).">^[A-Za-z0-9_-]+$</validator>
        </param>
    </inputs>
    <outputs>
        <!-- One list collection per selectable image format. The filters are
             mutually exclusive, so exactly one collection is created per run.
             Files are discovered in ./outputs/ (created by the command); the part
             of the file name before the extension becomes the element identifier.
             In each pattern the extension dot is escaped and the match is anchored
             at the end, so only files that really carry that extension are
             collected. -->

        <!-- NOTE(review): in its default (no format flag) mode pdfimages may write
             .pbm files for monochrome images; those would not be collected by the
             .ppm rule below. Confirm against the pdfimages documentation. -->
        <collection name="ppm_output_collection" type="list" format="ppm" label="${tool.name} on ${on_string}: Extracted images in ppm format">
            <filter>output_format == "ppm"</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.ppm$" recurse="true" format="ppm" directory="outputs" />
        </collection>
        <collection name="png_output_collection" type="list" format="png" label="${tool.name} on ${on_string}: Extracted images in png format">
            <filter>output_format == "-png"</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.png$" recurse="true" format="png" directory="outputs" />
        </collection>
        <!-- Accepts both ".tif" and ".tiff", as the previous unanchored ".tif"
             pattern did. -->
        <collection name="tiff_output_collection" type="list" format="tiff" label="${tool.name} on ${on_string}: Extracted images in tiff format">
            <filter>output_format == "-tiff"</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tiff?$" recurse="true" format="tiff" directory="outputs" />
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: ppm output selected, page numbers disabled.
             Expects three elements named <prefix>-<image index>, checked by size. -->
        <test expect_num_outputs="1">
            <param name="input_file" value="test.pdf"/>
            <param name="output_format" value="ppm"/>
            <param name="p" value="false"/>
            <param name="output_prefix" value="image"/>
            <output_collection name="ppm_output_collection" type="list" count="3">
                <element name="image-000" ftype="ppm">
                    <assert_contents>
                        <has_size value="3145745"/>
                    </assert_contents>
                </element>
                <element name="image-001" ftype="ppm">
                    <assert_contents>
                        <has_size value="3145745"/>
                    </assert_contents>
                </element>
                <element name="image-002" ftype="ppm">
                    <assert_contents>
                        <has_size value="3145745"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>

        <!-- Test 2: tiff output with page numbers enabled.
             Expects three elements named <prefix>-<page>-<image index>, checked by size. -->
        <test expect_num_outputs="1">
            <param name="input_file" value="test.pdf"/>
            <param name="output_format" value="-tiff"/>
            <param name="p" value="true"/>
            <param name="output_prefix" value="image"/>
            <output_collection name="tiff_output_collection" type="list" count="3">
                <element name="image-001-000" ftype="tiff">
                    <assert_contents>
                        <has_size value="3149004"/>
                    </assert_contents>
                </element>
                <element name="image-002-001" ftype="tiff">
                    <assert_contents>
                        <has_size value="3149004"/>
                    </assert_contents>
                </element>
                <element name="image-003-002" ftype="tiff">
                    <assert_contents>
                        <has_size value="1049542"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>

        <!-- Test 3: png output restricted to pages 1 to 2, page numbers enabled and
             a custom prefix. Expects two elements, compared against reference files. -->
        <test expect_num_outputs="1">
            <param name="input_file" value="test.pdf"/>
            <param name="output_format" value="-png"/>
            <param name="f" value="1"/>
            <param name="l" value="2"/>
            <param name="p" value="true"/>
            <param name="output_prefix" value="output_prefix-0"/>
            <output_collection name="png_output_collection" type="list" count="2">
                <element name="output_prefix-0-001-000" ftype="png" file="output_prefix-0-001-000.png"/>
                <element name="output_prefix-0-002-001" ftype="png" file="output_prefix-0-002-001.png"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

``pdfimages`` extracts the images embedded in a PDF file. This tool is useful for retrieving high-quality embedded images from PDFs for further analysis or reuse.

**Inputs**

- **Input PDF file**: The source PDF from which images will be extracted.
- **Output format**: Choose the desired output format (``ppm``, ``png``, or ``tiff``).
- **First page / Last page** (optional): Limit extraction to a specific page range.
- **Include page numbers**: Adds page numbers to the output filenames for better traceability.
- **Output name prefix**: Sets a custom prefix for all extracted image files.

**Outputs**

A collection of images in the selected format, named according to the specified prefix and page/image order.
]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Mon, 11 Aug 2025 12:29:13 +0000
parents	fe8e52e52961
children