changeset 25:5c7b70713fb5 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit a4c17efb8ec4b3a315766f5b5602effa275fade3
author iuc
date Wed, 03 Aug 2022 16:33:45 +0000
parents cfcf33df7fc0
children 5b80f544c67f
files gbk2fa.py snpEff_create_db.xml snpEff_macros.xml
diffstat 3 files changed, 94 insertions(+), 111 deletions(-) [+]
line wrap: on
line diff
--- a/gbk2fa.py	Wed Oct 13 23:30:29 2021 +0000
+++ b/gbk2fa.py	Wed Aug 03 16:33:45 2022 +0000
@@ -1,43 +1,47 @@
 import argparse
 import bz2
-import contextlib
 import gzip
-import sys
 
-import magic
 from Bio import SeqIO
 
-parser = argparse.ArgumentParser()
-parser.add_argument("genbank_file", help="GenBank input file. Can be compressed with gzip or bzip2")
-parser.add_argument("fasta_file", help="FASTA output datset")
-parser.add_argument("--remove_version", dest="remove_version", action="store_true", help="Remove version number from NCBI form formatted accession numbers. For example, this would convert 'B000657.2' to 'B000657'")
-args = parser.parse_args()
 
-gbk_filename = args.genbank_file
-fa_filename = args.fasta_file
+def get_opener(gbk_filename):
+    """Return the ``open`` callable matching the compression of a file.
+
+    The file is probed by trying to decompress its first byte, first as
+    bzip2 and then as gzip. If neither decompressor accepts the data the
+    builtin ``open`` is returned, i.e. the file is treated as plain text.
+
+    Args:
+        gbk_filename: Path of the (possibly compressed) GenBank file.
+
+    Returns:
+        ``bz2.open``, ``gzip.open`` or the builtin ``open``.
+    """
+    for opener in (bz2.open, gzip.open):
+        try:
+            # The context manager closes the probe handle even when the
+            # read fails, so no file descriptor is leaked.
+            with opener(gbk_filename) as probe:
+                probe.read(1)
+        except OSError:
+            # Not this compression format (or unreadable); try the next.
+            continue
+        else:
+            return opener
+    return open
 
 
-@contextlib.contextmanager
-def get_file_handle(gbk_filename):
-    f_type = magic.from_file(args.genbank_file, mime=True)
-    if f_type == 'text/plain':
-        input_handle = open(gbk_filename, "r")
-    elif f_type == 'application/gzip' or f_type == 'application/x-gzip':
-        input_handle = gzip.open(gbk_filename, "rt")
-    elif f_type == 'application/x-bzip2':
-        input_handle = bz2.open(gbk_filename, "rt")
-    else:
-        sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type))
-    yield input_handle
-    input_handle.close()
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "genbank_file",
+    help="GenBank input file. Can be compressed with gzip or bzip2"
+)
+parser.add_argument(
+    "fasta_file", help="FASTA output datset"
+)
+parser.add_argument(
+    "--remove_version", action="store_true",
+    help="Remove version number from NCBI form formatted accession numbers. "
+         "For example, this would convert 'B000657.2' to 'B000657'"
+)
+args = parser.parse_args()
 
 
-with get_file_handle(gbk_filename) as input_handle, open(fa_filename, "w") as output_handle:
-
-    for seq_record in SeqIO.parse(input_handle, "genbank"):
+gbk_open = get_opener(args.genbank_file)
+with gbk_open(args.genbank_file, 'rt') as input_handle, \
+     open(args.fasta_file, 'w') as output_handle:
+    for seq_record in SeqIO.parse(input_handle, 'genbank'):
         if args.remove_version:
             seq_id = seq_record.id.split('.')[0]
         else:
             seq_id = seq_record.id
         print('Writing FASTA record: {}'.format(seq_id))
-        output_handle.write(">{}\n{}\n".format(seq_id, seq_record.seq))
+        print('>' + seq_id, file=output_handle)
+        print(seq_record.seq, file=output_handle)
--- a/snpEff_create_db.xml	Wed Oct 13 23:30:29 2021 +0000
+++ b/snpEff_create_db.xml	Wed Aug 03 16:33:45 2022 +0000
@@ -1,77 +1,66 @@
-<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy4">
+<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy5">
     <description> database from Genbank or GFF record</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
     <requirements>
         <expand macro="requirement" />
-        <requirement type="package" version="3.6">python</requirement>
-        <requirement type="package" version="1.70">biopython</requirement>
-        <requirement type="package" version="0.4.15">python-magic</requirement>
-        <requirement type="package" version="5.32">libmagic</requirement>
+        <requirement type="package" version="1.79">biopython</requirement>
     </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
-        #if str( $input_type.input_type_selector ) == "gb":
-            #if str( $input_type.fasta ) == "yes":
-                python3 '$__tool_directory__/gbk2fa.py' '${input_type.input_gbk}' '${output_fasta}'
-                #if $input_type.remove_version:
-                    '${input_type.remove_version}'
-                #end if
-                &&
-            #end if
+        #if str($input_type.input_type_selector) == "gb" and str($input_type.fasta) == "yes":
+            python3 '$__tool_directory__/gbk2fa.py' '${input_type.input}' '${output_fasta}' ${input_type.remove_version} &&
         #end if
 
         mkdir -p '${snpeff_output.files_path}'/'${genome_version}' &&
 
-        #if str( $input_type.input_type_selector ) == "gb":
-            #if $input_type.input_gbk.is_of_type("genbank"):
-                ln -s '${input_type.input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk &&
-            #elif $input_type.input_gbk.is_of_type("genbank.gz"):
-                ln -s '${input_type.input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk.gz &&
+        #if str($input_type.input_type_selector) == "gb":
+            #if $input_type.input.is_of_type("genbank"):
+                ln -s '${input_type.input}' '${snpeff_output.files_path}/${genome_version}/genes.gbk' &&
+            #elif $input_type.input.is_of_type("genbank.gz"):
+                ln -s '${input_type.input}' '${snpeff_output.files_path}/${genome_version}/genes.gbk.gz' &&
             #end if
-        #elif str( $input_type.input_type_selector ) == "gff":
-	        #if $input_type.reference_source.reference_source_selector == "history":
-			  #if $input_type.reference_source.input_fasta.is_of_type("fasta"):
-                ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa &&
-	            #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"):
-	                ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa.gz &&
-	            #end if
-			#elif $input_type.reference_source.reference_source_selector == "cached":
-			  ln -s '${input_type.reference_source.ref_file.fields.path}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa &&
-			#end if
-            ln -s '${input_type.input_gff}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gff &&
-        #elif str( $input_type.input_type_selector ) == "gtf":
-	        #if $input_type.reference_source.reference_source_selector == "history":
-			  #if $input_type.reference_source.input_fasta.is_of_type("fasta"):
-                ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa &&
-	            #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"):
-	                ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa.gz &&
-	            #end if
-			#elif $input_type.reference_source.reference_source_selector == "cached":
-			  ln -s '${input_type.reference_source.ref_file.fields.path}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa &&
-			#end if
-            ln -s '${input_type.input_gtf}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gtf &&
+        #else:
+            #if $input_type.reference_source.reference_source_selector == "history":
+                #if $input_type.reference_source.input_fasta.is_of_type("fasta"):
+                    ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}/${genome_version}/sequences.fa' &&
+                #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"):
+                    ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}/${genome_version}/sequences.fa.gz' &&
+                #end if
+            #elif $input_type.reference_source.reference_source_selector == "cached":
+                ln -s '${input_type.reference_source.ref_file.fields.path}' '${snpeff_output.files_path}/${genome_version}/sequences.fa' &&
+            #end if
+            ln -s '${input_type.input}' '${snpeff_output.files_path}/${genome_version}/genes.${input_type.input_type_selector}' &&
         #end if
 
         snpEff @JAVA_OPTIONS@ build -v
         -configOption '${genome_version}'.genome='${genome_version}'
         -configOption '${genome_version}'.codonTable='${codon_table}'
-        #if str( $input_type.input_type_selector ) == "gb":
+        #if str($input_type.input_type_selector) == "gb":
             -genbank
-        #elif str( $input_type.input_type_selector ) == "gff":
+        #elif str($input_type.input_type_selector) == "gff":
             -gff3
-        #elif str( $input_type.input_type_selector ) == "gtf":
+        #elif str($input_type.input_type_selector) == "gtf":
             -gtf22
         #end if
         -dataDir '${snpeff_output.files_path}' '${genome_version}' &&
-        echo "${genome_version}.genome : ${genome_version}" >> '${snpeff_output.files_path}'/snpEff.config &&
-        echo "${genome_version}.codonTable : ${codon_table}" >> '${snpeff_output.files_path}'/snpEff.config
-
+        echo '${genome_version}.genome : ${genome_version}' >> '${snpeff_output.files_path}'/snpEff.config &&
+        echo '${genome_version}.codonTable : ${codon_table}' >> '${snpeff_output.files_path}'/snpEff.config
     ]]></command>
     <inputs>
-        <param name="genome_version" type="text" value="" label="Name for the database" help="For E. coli K12 you may want to use 'EcK12' etc.">
+        <param name="genome_version" type="text" value="" label="Name for the database" help="For E. coli K12, for example, you may want to use 'EcK12'. Note: Spaces are not allowed in the name and will get converted to underscores.">
+            <sanitizer>
+                <valid initial="string.ascii_letters,string.digits">
+                    <add value="_" />
+                    <add value="." />
+                    <add value="-" />
+                </valid>
+                <mapping>
+                    <add source=" " target="_" />
+                </mapping>
+            </sanitizer>
             <validator type="empty_field" message="A genome version name is required" />
         </param>
         <conditional name="input_type">
@@ -81,46 +70,20 @@
                 <option value="gtf">GTF</option>
             </param>
             <when value="gb">
-                <param name="input_gbk" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/>
+                <param name="input" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/>
                 <param name="fasta" type="select" display="radio" label="Parse Genbank into Fasta" help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format">
                         <option value="yes" selected="true">Yes</option>
                         <option value="no">No</option>
                 </param>
-                <param argument="--remove_version" type="boolean" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have vesion numbers such as B000564.2. This option removes them leaving only B000564" />
+                <param argument="--remove_version" type="boolean" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have version numbers such as B000564.2. This option removes them leaving only B000564" />
             </when>
             <when value="gff">
-                <param name="input_gff" type="data" format="gff3" label="GFF dataset to build database from" help="This GFF file will be used to generate snpEff database"/>
-                <conditional name="reference_source">
-		            <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
-		                <option value="cached">Locally cached</option>
-		                <option value="history">History</option>
-		            </param>
-		            <when value="cached">
-		                <param name="ref_file" type="select" label="Select reference genome">
-		                    <options from_data_table="fasta_indexes"/>
-		                </param>
-		            </when>
-		            <when value="history"> 
-		                <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/>
-		            </when>
-		        </conditional>
+                <param name="input" type="data" format="gff3" label="GFF dataset to build database from" help="This GFF file will be used to generate snpEff database"/>
+                <expand macro="ref_select" />
             </when>
             <when value="gtf">
-                <param name="input_gtf" type="data" format="gtf" label="GTF dataset to build database from" help="This GTF file will be used to generate snpEff database"/>
-                <conditional name="reference_source">
-		            <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
-		                <option value="cached">Locally cached</option>
-		                <option value="history">History</option>
-		            </param>
-		            <when value="cached">
-		                <param name="ref_file" type="select" label="Select reference genome">
-		                    <options from_data_table="fasta_indexes"/>
-		                </param>
-		            </when>
-		            <when value="history"> 
-		                <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/>
-		            </when>
-		        </conditional>
+                <param name="input" type="data" format="gtf" label="GTF dataset to build database from" help="This GTF file will be used to generate snpEff database"/>
+                <expand macro="ref_select" />
             </when>
         </conditional>
         <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options">
@@ -162,7 +125,7 @@
         <test>
             <param name="genome_version" value="pBR322"/>
             <param name="input_type_selector" value="gb"/>
-            <param name="input_gbk" value="pBR322.gbk" />
+            <param name="input" value="pBR322.gbk" />
             <output name="snpeff_output">
                 <assert_contents>
                     <has_text text="pBR322" />
@@ -173,7 +136,7 @@
         <test>
             <param name="genome_version" value="pBR322"/>
             <param name="input_type_selector" value="gb"/>
-            <param name="input_gbk" value="pBR322.gbk.gz" />
+            <param name="input" value="pBR322.gbk.gz" />
             <output name="snpeff_output">
                 <assert_contents>
                     <has_text text="pBR322" />
@@ -186,7 +149,7 @@
             <param name="input_type_selector" value="gff"/>
             <param name="reference_source_selector" value="history"/>
             <param name="input_fasta" value="pBR322_test2.fna" />
-            <param name="input_gff" value="pBR322.gff3"/>
+            <param name="input" value="pBR322.gff3"/>
             <output name="snpeff_output">
                 <assert_contents>
                     <has_text text="pBR322" />
@@ -198,7 +161,7 @@
             <param name="input_type_selector" value="gff"/>
             <param name="reference_source_selector" value="history"/>
             <param name="input_fasta" value="pBR322_test2.fna.gz" />
-            <param name="input_gff" value="pBR322.gff3"/>
+            <param name="input" value="pBR322.gff3"/>
             <output name="snpeff_output">
                 <assert_contents>
                     <has_text text="pBR322" />
@@ -210,7 +173,7 @@
             <param name="input_type_selector" value="gtf"/>
             <param name="reference_source_selector" value="history"/>
             <param name="input_fasta" value="Saccharomyces_mito.fa.gz" />
-            <param name="input_gtf" value="Saccharomyces_mito.gtf" />
+            <param name="input" value="Saccharomyces_mito.gtf" />
             <output name="snpeff_output">
                 <assert_contents>
                     <has_text text="Saccharomyces_mito" />
--- a/snpEff_macros.xml	Wed Oct 13 23:30:29 2021 +0000
+++ b/snpEff_macros.xml	Wed Aug 03 16:33:45 2022 +0000
@@ -18,6 +18,22 @@
   <token name="@SNPEFF_VERSION@">SnpEff4.3</token>
   <token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
   <token name="@JAVA_OPTIONS@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token>
+  <xml name="ref_select">
+    <conditional name="reference_source">
+        <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
+            <option value="cached">Locally cached</option>
+            <option value="history">History</option>
+        </param>
+        <when value="cached">
+            <param name="ref_file" type="select" label="Select reference genome">
+                <options from_data_table="fasta_indexes"/>
+            </param>
+        </when>
+        <when value="history">
+            <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/>
+        </when>
+    </conditional>
+  </xml>
   <token name="@EXTERNAL_DOCUMENTATION@">