changeset 0:ff7cec6bc518 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/fileidentification commit bb234555cae70fdd9be475415855a7a03c4139df
author iuc
date Thu, 30 Oct 2025 16:52:27 +0000
parents
children 639d81cf8ac8
files fileidentification.xml macros.xml test-data/output_results_linux_gh_action.txt test-data/test-data.zip
diffstat 4 files changed, 104 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fileidentification.xml	Thu Oct 30 16:52:27 2025 +0000
@@ -0,0 +1,56 @@
+<tool id="fileidentification" name="File Format Identification" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.1">
+    <description>Check whether multimedia files are corrupt or duplicated</description>
+    <macros><import>macros.xml</import></macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+unzip '$input' -d input_dir &&
+/app/.venv/bin/python /app/identify.py ./input_dir &&
+/app/.venv/bin/python /app/identify.py ./input_dir --inspect > results.txt
+    ]]></command> <!-- Unpacks the input ZIP into input_dir, runs identify.py on that folder, then runs it again with the inspect flag and redirects stdout to results.txt -->
+    <inputs>
+        <param name="input" type="data" format="zip" label="ZIP containing the files to be analysed"/>
+    </inputs>
+    <outputs>
+        <data name="out" format="txt" label="${tool.name} on ${on_string}" from_work_dir="results.txt"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1"> <!-- runs the bundled sample archive and compares the report with the stored expected file -->
+            <param name="input" value="test-data.zip"/>
+            <output name="out" file="output_results_linux_gh_action.txt" ftype="txt"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Do you have a huge number of multimedia files and you don't know if they are corrupt,
+or if they have the correct extension?
+This tool:
+
+- gives you an overview of what file types there are
+- checks if the extensions of the files match their content
+- checks if there are any duplicates
+- checks if the content of the files is intact
+
+| Input: A ZIP containing the files to be analysed (may be a nested folder structure)
+| Output: A report in plain text form
+
+A possible use case is digital preservation workflows,
+where you want to make sure that you only preserve high-quality files for the future.
+
+Supported file types: A wide range of image formats (pixel and vector), video, audio, PDF, and MS Office files.
+
+Note: The original fileidentification tool is more feature-rich.
+In particular, it offers bulk conversion of files, which is currently not supported on Galaxy.
+
+Find more information in `the GitHub repo <https://github.com/dasch-swiss/fileidentification>`_.
+    ]]></help>
+    <citations> <!-- The institutional author is wrapped in double braces: BibTeX splits author lists on " and ", so a single brace level would yield two authors -->
+        <citation type="bibtex">
+@misc{githubfileidentification,
+  author = {{Swiss National Data and Service Center for the Humanities}},
+  year = {2025},
+  title = {Fileidentification - A CLI to Identify Multimedia File Formats and Bulk Convert Files},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  url = {https://github.com/dasch-swiss/fileidentification},
+}</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Oct 30 16:52:27 2025 +0000
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<macros>
+    <token name="@TOOL_VERSION@">2.5.0</token> <!-- substituted into the tool's version attribute and into the container image tag below -->
+    <token name="@VERSION_SUFFIX@">0</token> <!-- follows "galaxy" in both the tool version and the container image tag -->
+    <xml name="requirements">
+        <requirements>
+            <container type="docker">daschswiss/fileidentification-galaxy:@TOOL_VERSION@-galaxy@VERSION_SUFFIX@</container>
+        </requirements>
+    </xml> <!-- pulled into fileidentification.xml through its expand macro="requirements" element -->
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_results_linux_gh_action.txt	Thu Oct 30 16:52:27 2025 +0000
@@ -0,0 +1,38 @@
+
+----------- duplicates -----------
+
+md5: 6c0cf64b97d107bdfae8d54ea4cd3375 - files: 
+test-data/file-sample_13kB.pdf
+test-data/file-sample_13kB.docx
+
+md5: 82b415572a47acd77156a1f69e61c75e - files: 
+test-data/SampleJPGImage.jpg
+test-data/SampleJPGImage.tif
+
+
+
+----------- file formats -----------
+
+no. of files  | combined size  | fmt type   | policy     | convert    | format name
+2             | 0.024 MB       | fmt/17     |            | soffice    | Acrobat PDF 1.3 - Portable Document Format
+1             | 0.154 MB       | fmt/569    |            |            | Matroska
+2             | 0.019 MB       | fmt/645    |            |            | Exchangeable Image File Format (Compressed)
+4             | 0.405 MB       | fmt/199    |            |            | MPEG-4 Media File
+1             | 0.091 MB       | fmt/585    |            | ffmpeg     | MPEG-2 Transport Stream
+1             | 0.266 MB       | fmt/5      |            | ffmpeg     | Audio/Video Interleaved Format
+
+WARNING: you should manually rename test-data/SampleJPGImage.tif
+expecting one of the following ext: ['jpeg', 'jpg']
+
+
+----------- errors -----------
+
+  0.049 MB    test-data/corrupt.mp4
+filehandler:    fmt not detected, falling back on ext
+ffmpeg:    [ERROR] code=-1094995529 string=Invalid data found when processing input [/ERROR] 
+filehandler:    file is corrupt: removed
+
+----------- processing errors -----------
+
+  0.006 MB    test-data/nested folder/bear-320x180-10bit-frame-0.hevc
+filehandler:    failed to get fmt type for test-data/nested folder/bear-320x180-10bit-frame-0.hevc
Binary file test-data/test-data.zip has changed