Mercurial > repos > iuc > fileidentification
changeset 0:ff7cec6bc518 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/fileidentification commit bb234555cae70fdd9be475415855a7a03c4139df
| author | iuc |
|---|---|
| date | Thu, 30 Oct 2025 16:52:27 +0000 |
| parents | |
| children | 639d81cf8ac8 |
| files | fileidentification.xml macros.xml test-data/output_results_linux_gh_action.txt test-data/test-data.zip |
| diffstat | 4 files changed, 104 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fileidentification.xml Thu Oct 30 16:52:27 2025 +0000 @@ -0,0 +1,56 @@ +<tool id="fileidentification" name="File Format Identification" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.1"> + <description>Check multimedia files if they are corrupt or duplicated</description> + <macros><import>macros.xml</import></macros> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +unzip '$input' -d input_dir && +/app/.venv/bin/python /app/identify.py ./input_dir && +/app/.venv/bin/python /app/identify.py ./input_dir --inspect > results.txt + ]]></command> + <inputs> + <param type="data" name="input" format="zip" label="ZIP containing the files to be analysed"/> + </inputs> + <outputs> + <data name="out" format="txt" from_work_dir="results.txt" label="${tool.name} on ${on_string}"/> + </outputs> + <tests> + <test expect_num_outputs="1"> + <param name="input" value="test-data.zip"/> + <output name="out" ftype="txt" file="output_results_linux_gh_action.txt"/> + </test> + </tests> + <help><![CDATA[ +Do you have a huge number of multimedia files and you don't know if they are corrupt, +or if they have the correct extension? +This tool: + +- gives you an overview of what file types there are +- checks if the extension of the files matches their content +- checks if there are any duplicates +- checks if the content of the files is intact + +Input: A ZIP containing the files to be analysed (may be a nested folder structure) +Output: A report in plain text form + +A possible use case is digital preservation workflows, +where you want to make sure that you only preserve high-quality files for the future. + +Supported file types: A wide range of image formats (pixel and vector), videos, audios, pdf, MS Office. + +Note: The original fileidentification tool is more feature-rich. +In particular, it offers bulk conversion of files, which is currently not supported on Galaxy. 
+ +Find more information in `the GitHub repo <https://github.com/dasch-swiss/fileidentification>`_. + ]]></help> + <citations> + <citation type="bibtex"> +@misc{githubfileidentification, + author = {Swiss National Data and Service Center for the Humanities}, + year = {2025}, + title = {Fileidentification - A CLI to Identify Multimedia File Formats and Bulk Convert Files}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/dasch-swiss/fileidentification}, +}</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Oct 30 16:52:27 2025 +0000 @@ -0,0 +1,10 @@ +<?xml version="1.0"?> +<macros> + <token name="@TOOL_VERSION@">2.5.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <xml name="requirements"> + <requirements> + <container type="docker">daschswiss/fileidentification-galaxy:@TOOL_VERSION@-galaxy@VERSION_SUFFIX@</container> + </requirements> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_results_linux_gh_action.txt Thu Oct 30 16:52:27 2025 +0000 @@ -0,0 +1,38 @@ + +----------- duplicates ----------- + +md5: 6c0cf64b97d107bdfae8d54ea4cd3375 - files: +test-data/file-sample_13kB.pdf +test-data/file-sample_13kB.docx + +md5: 82b415572a47acd77156a1f69e61c75e - files: +test-data/SampleJPGImage.jpg +test-data/SampleJPGImage.tif + + + +----------- file formats ----------- + +no. of files | combined size | fmt type | policy | convert | format name +2 | 0.024 MB | fmt/17 | | soffice | Acrobat PDF 1.3 - Portable Document Format +1 | 0.154 MB | fmt/569 | | | Matroska +2 | 0.019 MB | fmt/645 | | | Exchangeable Image File Format (Compressed) +4 | 0.405 MB | fmt/199 | | | MPEG-4 Media File +1 | 0.091 MB | fmt/585 | | ffmpeg | MPEG-2 Transport Stream +1 | 0.266 MB | fmt/5 | | ffmpeg | Audio/Video Interleaved Format + +WARNING: you should manually rename test-data/SampleJPGImage.tif +expecting one of the following ext: ['jpeg', 'jpg'] + + +----------- errors ----------- + + 0.049 MB test-data/corrupt.mp4 +filehandler: fmt not detected, falling back on ext +ffmpeg: [ERROR] code=-1094995529 string=Invalid data found when processing input [/ERROR] +filehandler: file is corrupt: removed + +----------- processing errors ----------- + + 0.006 MB test-data/nested folder/bear-320x180-10bit-frame-0.hevc +filehandler: failed to get fmt type for test-data/nested folder/bear-320x180-10bit-frame-0.hevc
