view markitdown.xml @ 1:f6fa7e70120f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/markitdown commit 1df47411ce8651c1d4f68cd032b2afe7d5a721de
author bgruening
date Mon, 13 Oct 2025 13:22:13 +0000
parents 5ad32046903b
children 4926706c13db
line wrap: on
line source

<tool id="markitdown" name="Markitdown" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Convert documents to Markdown</description>
    <macros>
        <!-- Substitution tokens. All three are expanded in the <tool> element's
             version/profile attributes; the tool version token also pins the
             markitdown requirement. -->
        <token name="@PROFILE@">23.0</token>
        <token name="@TOOL_VERSION@">0.1.3</token>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <requirements>
        <!-- Runtime dependencies: a Python 3.12 interpreter plus markitdown,
             whose version follows the tool version token defined in the macros. -->
        <requirement type="package" version="3.12">python</requirement>
        <requirement type="package" version="@TOOL_VERSION@">markitdown</requirement>
    </requirements>

    <command detect_errors="exit_code"><![CDATA[
        ## Maps the Galaxy datatype extension of the input to the file
        ## extension hint handed to markitdown through -x.
        #set ext_map = {
            'pdf': 'pdf', 'docx': 'docx', 'pptx': 'pptx', 'xlsx': 'xlsx',
            'html': 'html', 'txt': 'txt', 'ipynb': 'ipynb',
            'markdown': 'md', 'zip': 'zip', 'tabular': 'csv', 'csv': 'csv'
        }

        ## A user-supplied override wins over the datatype-derived extension.
        ## file_ext is empty when the datatype has no entry in ext_map.
        #set file_ext = ext_map.get($input.ext, '')
        #set final_ext = $ext_hint if $ext_hint else $file_ext

        markitdown
            '$input'
            ## Only pass -x when there is a value; a bare -x would swallow
            ## the next argument on the command line.
            #if $final_ext:
                -x '$final_ext'
            #end if
            #if $mime_type:
                -m '$mime_type'
            #end if
            #if $charset:
                -c '$charset'
            #end if
            $keep_data_uris
            -o '$output'
    ]]></command>

    <inputs>
        <!-- The accepted datatypes correspond to the keys of ext_map in the
             command template (csv is listed there as well as tabular). -->
        <param name="input" type="data"
               format="pdf,docx,pptx,xlsx,html,txt,ipynb,markdown,zip,tabular,csv"
               label="Input file"/>
        <!-- Optional free-text hints forwarded to markitdown. -->
        <param name="ext_hint" type="text" optional="true"
               label="Extension override"
               help="File extension hint passed to markitdown with -x (e.g. odt). Leave empty to use the extension derived from the input's datatype."/>
        <param name="mime_type" type="text" optional="true"
               label="MIME type hint"
               help="MIME type hint passed to markitdown with -m (e.g. text/html). Leave empty to omit it."/>
        <param name="charset" type="text" optional="true"
               label="Character set (e.g. UTF-8)"
               help="Character set hint passed to markitdown with -c. Leave empty to omit it."/>
        <!-- The true value is inserted verbatim into the command line; the false value adds nothing. -->
        <param name="keep_data_uris" type="boolean" checked="false"
               truevalue="--keep-data-uris" falsevalue=""
               label="Keep embedded data URIs"
               help="Keep data URIs (such as base64-encoded images) in the Markdown output."/>
    </inputs>

    <outputs>
        <!-- Single Markdown dataset; the command template writes it through
             markitdown's -o option. -->
        <data name="output"
              format="markdown"
              label="Converted Markdown output"/>
    </outputs>

    <tests>
        <!-- Functional tests: each one converts a single input file and checks
             that the resulting Markdown contains the expected text.
             NOTE(review): no test sets mime_type or charset, so those branches
             of the command template are never exercised. -->

        <!-- PDF input, all options left at their defaults. -->
        <test>
            <param name="input" value="EAR.pdf" ftype="pdf"/>
            <output name="output">
                <assert_contents>
                    <has_text text="Tags: ERGA-BGE"/>
                    <has_text text="Lineage: mammalia_odb10"/>
                </assert_contents>
            </output>
        </test>

        <!-- DOCX input; the expected text is a Markdown heading. -->
        <test>
            <param name="input" value="example.docx" ftype="docx"/>
            <output name="output">
                <assert_contents>
                    <has_text text="# Lorem ipsum dolor sit amet, consectetur adipiscing elit."/>
                </assert_contents>
            </output>
        </test>

        <!-- Disabled test: ODT input routed through the extension override.
             NOTE(review): odt is not among the formats accepted by the input
             parameter; confirm whether that is why this test is switched off. -->
        <!--test>
            <param name="input" value="example.odt"/>
            <param name="ext_hint" value="odt"/>
            <output name="output">
                <assert_contents>
                    <has_text text="This is a Word document"/>
                </assert_contents>
            </output>
        </test-->

        <!-- HTML input with the keep_data_uris flag switched on. -->
        <test>
            <param name="input" value="report_4.html" ftype="html"/>
            <param name="keep_data_uris" value="true"/>
            <output name="output">
                <assert_contents>
                    <has_text text="is the contig length such that using longer or equal length contigs produces"/>
                </assert_contents>
            </output>
        </test>

        <!-- Plain-text input with an explicit extension override. -->
        <test>
            <param name="input" value="example.txt" ftype="txt"/>
            <param name="ext_hint" value="txt"/>
            <output name="output">
                <assert_contents>
                    <has_text text="This is a plain text file"/>
                </assert_contents>
            </output>
        </test>

        <!-- Jupyter notebook input; checks that a code cell's source is present. -->
        <test>
            <param name="input" value="example.ipynb" ftype="ipynb"/>
            <output name="output">
                <assert_contents>
                    <has_text text="print(&quot;Hello, world!&quot;)"/>
                </assert_contents>
            </output>
        </test>
    </tests>

    <help format="markdown"><![CDATA[

**Markitdown** converts rich document formats (PDF, DOCX, HTML, etc.) to Markdown.

---

### Supported Formats:

- PDF, DOCX, PPTX, XLSX
- HTML, TXT, Markdown
- Jupyter Notebooks (IPYNB)
- ZIP containing supported formats
- Tabular (CSV)

---

### Options:

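- **Extension override** (`-x`): file extension hint (e.g. `odt`) to use instead of the one derived from the input's datatype
- **MIME type** (`-m`): MIME type hint for the input (e.g. `text/html`)
- **Charset** (`-c`): character encoding hint for the input (e.g. `UTF-8`)
- **Keep data URIs** (`--keep-data-uris`): keep data URIs (such as base64-encoded images) in the output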

Project: https://github.com/microsoft/markitdown
    ]]></help>

    <citations>
        <!-- Single BibTeX entry citing the upstream markitdown GitHub repository. -->
        <citation type="bibtex">
@misc{markitdown2024,
  author       = {Microsoft},
  title        = {markitdown: Convert documents to markdown},
  year         = {2024},
  howpublished = {\url{https://github.com/microsoft/markitdown}}
}
        </citation>
    </citations>
</tool>