Mercurial > repos > bgruening > markitdown
diff markitdown.xml @ 0:5ad32046903b draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/markitdown commit aaa9d49fda30d2aaf99030ff8a099d890a7c1d01
| author | bgruening |
|---|---|
| date | Mon, 21 Apr 2025 12:13:33 +0000 |
| parents | |
| children | f6fa7e70120f |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/markitdown.xml Mon Apr 21 12:13:33 2025 +0000 @@ -0,0 +1,144 @@ +<tool id="markitdown" name="Markitdown" version="@TOOL_VERSION@" profile="22.05"> + <description>Convert documents to Markdown</description> + + <!-- Macros inline --> + <macros> + <token name="@TOOL_VERSION@">0.1.1</token> + </macros> + + <requirements> + <container type="docker">quay.io/bgruening/markitdown:@TOOL_VERSION@</container> + </requirements> + + <command detect_errors="exit_code"><![CDATA[ + #set ext_map = { + 'pdf': 'pdf', 'docx': 'docx', 'pptx': 'pptx', 'xlsx': 'xlsx', + 'html': 'html', 'txt': 'txt', 'ipynb': 'ipynb', + 'markdown': 'md', 'zip': 'zip', 'tabular': 'csv', 'csv': 'csv' + } + + #set file_ext = ext_map.get($input.ext, '') + #set final_ext = $ext_hint if $ext_hint else $file_ext + + markitdown + ${input} + -x $final_ext + #if $mime_type: + -m $mime_opt + #end if + #if $charset: + -c "$charset_opt" + #end if + $keep_data_uris + -o '$output' + ]]></command> + + <inputs> + <param name="input" type="data" format="pdf,docx,pptx,xlsx,html,txt,ipynb,markdown,zip,tabular" + label="Input file"/> + <param name="ext_hint" type="text" optional="true" label="Extension override"/> + <param name="mime_type" type="text" optional="true" label="MIME type hint"/> + <param name="charset" type="text" optional="true" label="Character set (e.g. UTF-8)"/> + <param name="keep_data_uris" type="boolean" truevalue="--keep-data-uris" falsevalue="" label="Keep embedded data URIs"/> + </inputs> + + <outputs> + <data name="output" format="markdown" label="Converted Markdown output"/> + </outputs> + + <tests> + <test> + <param name="input" value="EAR.pdf" ftype="pdf"/> + <output name="output"> + <assert_contents> + <has_text text="Tags: ERGA-BGE"/> + <has_text text="Lineage: mammalia_odb10"/> + </assert_contents> + </output> + </test> + + <test> + <param name="input" value="example.docx" ftype="docx"/> + <output name="output"> + <assert_contents> + <has_text text="# Lorem ipsum dolor sit amet, consectetur adipiscing elit."/> + </assert_contents> + </output> + </test> + + <!--test> + <param name="input" value="example.odt"/> + <param name="ext_hint" value="odt"/> + <output name="output"> + <assert_contents> + <has_text text="This is a Word document"/> + </assert_contents> + </output> + </test--> + + <test> + <param name="input" value="report_4.html" ftype="html"/> + <param name="keep_data_uris" value="true"/> + <output name="output"> + <assert_contents> + <has_text text="is the contig length such that using longer or equal length contigs produces"/> + </assert_contents> + </output> + </test> + + <test> + <param name="input" value="example.txt" ftype="txt"/> + <param name="ext_hint" value="txt"/> + <output name="output"> + <assert_contents> + <has_text text="This is a plain text file"/> + </assert_contents> + </output> + </test> + + <test> + <param name="input" value="example.ipynb" ftype="ipynb"/> + <output name="output"> + <assert_contents> + <has_text text="print("Hello, world!")"/> + </assert_contents> + </output> + </test> + </tests> + + <help><![CDATA[ +**Markitdown** converts rich document formats (PDF, DOCX, HTML, etc.) to Markdown. + +--- + +### Supported Formats: + +- PDF, DOCX, PPTX, XLSX +- HTML, TXT, Markdown +- Jupyter Notebooks (IPYNB) +- ZIP containing supported formats +- Tabular (CSV) + +--- + +### Options: + +- **Extension override** (`-x`): hint for file type if not obvious +- **MIME type** (`-m`): manual MIME hint +- **Charset** (`-c`): text encoding hint +- **Keep data URIs**: retain base64-encoded images + +Project: https://github.com/microsoft/markitdown + ]]></help> + + <citations> + <citation type="bibtex"> +@misc{markitdown2024, + author = {Microsoft}, + title = {markitdown: Convert documents to markdown}, + year = {2024}, + howpublished = {\url{https://github.com/microsoft/markitdown}} +} + </citation> + </citations> +</tool>
