Mercurial > repos > iuc > snpeff
changeset 25:5c7b70713fb5 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit a4c17efb8ec4b3a315766f5b5602effa275fade3
author | iuc |
---|---|
date | Wed, 03 Aug 2022 16:33:45 +0000 |
parents | cfcf33df7fc0 |
children | 5b80f544c67f |
files | gbk2fa.py snpEff_create_db.xml snpEff_macros.xml |
diffstat | 3 files changed, 94 insertions(+), 111 deletions(-) [+] |
line wrap: on
line diff
--- a/gbk2fa.py Wed Oct 13 23:30:29 2021 +0000 +++ b/gbk2fa.py Wed Aug 03 16:33:45 2022 +0000 @@ -1,43 +1,47 @@ import argparse import bz2 -import contextlib import gzip -import sys -import magic from Bio import SeqIO -parser = argparse.ArgumentParser() -parser.add_argument("genbank_file", help="GenBank input file. Can be compressed with gzip or bzip2") -parser.add_argument("fasta_file", help="FASTA output datset") -parser.add_argument("--remove_version", dest="remove_version", action="store_true", help="Remove version number from NCBI form formatted accession numbers. For example, this would convert 'B000657.2' to 'B000657'") -args = parser.parse_args() -gbk_filename = args.genbank_file -fa_filename = args.fasta_file +def get_opener(gbk_filename): + try: + bz2.open(gbk_filename).read(1) + return bz2.open + except OSError: + pass + try: + gzip.open(gbk_filename).read(1) + return gzip.open + except OSError: + return open -@contextlib.contextmanager -def get_file_handle(gbk_filename): - f_type = magic.from_file(args.genbank_file, mime=True) - if f_type == 'text/plain': - input_handle = open(gbk_filename, "r") - elif f_type == 'application/gzip' or f_type == 'application/x-gzip': - input_handle = gzip.open(gbk_filename, "rt") - elif f_type == 'application/x-bzip2': - input_handle = bz2.open(gbk_filename, "rt") - else: - sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type)) - yield input_handle - input_handle.close() +parser = argparse.ArgumentParser() +parser.add_argument( + "genbank_file", + help="GenBank input file. Can be compressed with gzip or bzip2" +) +parser.add_argument( + "fasta_file", help="FASTA output datset" +) +parser.add_argument( + "--remove_version", action="store_true", + help="Remove version number from NCBI form formatted accession numbers. " + "For example, this would convert 'B000657.2' to 'B000657'" +) +args = parser.parse_args() -with get_file_handle(gbk_filename) as input_handle, open(fa_filename, "w") as output_handle: - - for seq_record in SeqIO.parse(input_handle, "genbank"): +gbk_open = get_opener(args.genbank_file) +with gbk_open(args.genbank_file, 'rt') as input_handle, \ + open(args.fasta_file, 'w') as output_handle: + for seq_record in SeqIO.parse(input_handle, 'genbank'): if args.remove_version: seq_id = seq_record.id.split('.')[0] else: seq_id = seq_record.id print('Writing FASTA record: {}'.format(seq_id)) - output_handle.write(">{}\n{}\n".format(seq_id, seq_record.seq)) + print('>' + seq_id, file=output_handle) + print(seq_record.seq, file=output_handle)
--- a/snpEff_create_db.xml Wed Oct 13 23:30:29 2021 +0000 +++ b/snpEff_create_db.xml Wed Aug 03 16:33:45 2022 +0000 @@ -1,77 +1,66 @@ -<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy4"> +<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy5"> <description> database from Genbank or GFF record</description> <macros> <import>snpEff_macros.xml</import> </macros> <requirements> <expand macro="requirement" /> - <requirement type="package" version="3.6">python</requirement> - <requirement type="package" version="1.70">biopython</requirement> - <requirement type="package" version="0.4.15">python-magic</requirement> - <requirement type="package" version="5.32">libmagic</requirement> + <requirement type="package" version="1.79">biopython</requirement> </requirements> <expand macro="stdio" /> <expand macro="version_command" /> <command><![CDATA[ - #if str( $input_type.input_type_selector ) == "gb": - #if str( $input_type.fasta ) == "yes": - python3 '$__tool_directory__/gbk2fa.py' '${input_type.input_gbk}' '${output_fasta}' - #if $input_type.remove_version: - '${input_type.remove_version}' - #end if - && - #end if + #if str($input_type.input_type_selector) == "gb" and str($input_type.fasta) == "yes": + python3 '$__tool_directory__/gbk2fa.py' '${input_type.input}' '${output_fasta}' ${input_type.remove_version} && #end if mkdir -p '${snpeff_output.files_path}'/'${genome_version}' && - #if str( $input_type.input_type_selector ) == "gb": - #if $input_type.input_gbk.is_of_type("genbank"): - ln -s '${input_type.input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk && - #elif $input_type.input_gbk.is_of_type("genbank.gz"): - ln -s '${input_type.input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk.gz && + #if str($input_type.input_type_selector) == "gb": + #if $input_type.input.is_of_type("genbank"): + ln -s '${input_type.input}' '${snpeff_output.files_path}/${genome_version}/genes.gbk' && + #elif $input_type.input.is_of_type("genbank.gz"): + ln -s '${input_type.input}' '${snpeff_output.files_path}/${genome_version}/genes.gbk.gz' && #end if - #elif str( $input_type.input_type_selector ) == "gff": - #if $input_type.reference_source.reference_source_selector == "history": - #if $input_type.reference_source.input_fasta.is_of_type("fasta"): - ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa && - #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"): - ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa.gz && - #end if - #elif $input_type.reference_source.reference_source_selector == "cached": - ln -s '${input_type.reference_source.ref_file.fields.path}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa && - #end if - ln -s '${input_type.input_gff}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gff && - #elif str( $input_type.input_type_selector ) == "gtf": - #if $input_type.reference_source.reference_source_selector == "history": - #if $input_type.reference_source.input_fasta.is_of_type("fasta"): - ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa && - #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"): - ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa.gz && - #end if - #elif $input_type.reference_source.reference_source_selector == "cached": - ln -s '${input_type.reference_source.ref_file.fields.path}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa && - #end if - ln -s '${input_type.input_gtf}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gtf && + #else: + #if $input_type.reference_source.reference_source_selector == "history": + #if $input_type.reference_source.input_fasta.is_of_type("fasta"): + ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}/${genome_version}/sequences.fa' && + #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"): + ln -s '${input_type.reference_source.input_fasta}' '${snpeff_output.files_path}/${genome_version}/sequences.fa.gz' && + #end if + #elif $input_type.reference_source.reference_source_selector == "cached": + ln -s '${input_type.reference_source.ref_file.fields.path}' '${snpeff_output.files_path}/${genome_version}/sequences.fa' && + #end if + ln -s '${input_type.input}' '${snpeff_output.files_path}/${genome_version}/genes.${input_type.input_type_selector}' && #end if snpEff @JAVA_OPTIONS@ build -v -configOption '${genome_version}'.genome='${genome_version}' -configOption '${genome_version}'.codonTable='${codon_table}' - #if str( $input_type.input_type_selector ) == "gb": + #if str($input_type.input_type_selector) == "gb": -genbank - #elif str( $input_type.input_type_selector ) == "gff": + #elif str($input_type.input_type_selector) == "gff": -gff3 - #elif str( $input_type.input_type_selector ) == "gtf": + #elif str($input_type.input_type_selector) == "gtf": -gtf22 #end if -dataDir '${snpeff_output.files_path}' '${genome_version}' && - echo "${genome_version}.genome : ${genome_version}" >> '${snpeff_output.files_path}'/snpEff.config && - echo "${genome_version}.codonTable : ${codon_table}" >> '${snpeff_output.files_path}'/snpEff.config - + echo '${genome_version}.genome : ${genome_version}' >> '${snpeff_output.files_path}'/snpEff.config && + echo '${genome_version}.codonTable : ${codon_table}' >> '${snpeff_output.files_path}'/snpEff.config ]]></command> <inputs> - <param name="genome_version" type="text" value="" label="Name for the database" help="For E. coli K12 you may want to use 'EcK12' etc."> + <param name="genome_version" type="text" value="" label="Name for the database" help="For E. coli K12, for example, you may want to use 'EcK12'. Note: Spaces are not allowed in the name and will get converted to underscores."> + <sanitizer> + <valid initial="string.ascii_letters,string.digits"> + <add value="_" /> + <add value="." /> + <add value="-" /> + </valid> + <mapping> + <add source=" " target="_" /> + </mapping> + </sanitizer> <validator type="empty_field" message="A genome version name is required" /> </param> <conditional name="input_type"> @@ -81,46 +70,20 @@ <option value="gtf">GTF</option> </param> <when value="gb"> - <param name="input_gbk" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/> + <param name="input" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/> <param name="fasta" type="select" display="radio" label="Parse Genbank into Fasta" help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format"> <option value="yes" selected="true">Yes</option> <option value="no">No</option> </param> - <param argument="--remove_version" type="boolean" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have vesion numbers such as B000564.2. This option removes them leaving only B000564" /> + <param argument="--remove_version" type="boolean" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have version numbers such as B000564.2. This option removes them leaving only B000564" /> </when> <when value="gff"> - <param name="input_gff" type="data" format="gff3" label="GFF dataset to build database from" help="This GFF file will be used to generate snpEff database"/> - <conditional name="reference_source"> - <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> - <option value="cached">Locally cached</option> - <option value="history">History</option> - </param> - <when value="cached"> - <param name="ref_file" type="select" label="Select reference genome"> - <options from_data_table="fasta_indexes"/> - </param> - </when> - <when value="history"> - <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/> - </when> - </conditional> + <param name="input" type="data" format="gff3" label="GFF dataset to build database from" help="This GFF file will be used to generate snpEff database"/> + <expand macro="ref_select" /> </when> <when value="gtf"> - <param name="input_gtf" type="data" format="gtf" label="GTF dataset to build database from" help="This GTF file will be used to generate snpEff database"/> - <conditional name="reference_source"> - <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> - <option value="cached">Locally cached</option> - <option value="history">History</option> - </param> - <when value="cached"> - <param name="ref_file" type="select" label="Select reference genome"> - <options from_data_table="fasta_indexes"/> - </param> - </when> - <when value="history"> - <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/> - </when> - </conditional> + <param name="input" type="data" format="gtf" label="GTF dataset to build database from" help="This GTF file will be used to generate snpEff database"/> + <expand macro="ref_select" /> </when> </conditional> <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options"> @@ -162,7 +125,7 @@ <test> <param name="genome_version" value="pBR322"/> <param name="input_type_selector" value="gb"/> - <param name="input_gbk" value="pBR322.gbk" /> + <param name="input" value="pBR322.gbk" /> <output name="snpeff_output"> <assert_contents> <has_text text="pBR322" /> @@ -173,7 +136,7 @@ <test> <param name="genome_version" value="pBR322"/> <param name="input_type_selector" value="gb"/> - <param name="input_gbk" value="pBR322.gbk.gz" /> + <param name="input" value="pBR322.gbk.gz" /> <output name="snpeff_output"> <assert_contents> <has_text text="pBR322" /> @@ -186,7 +149,7 @@ <param name="input_type_selector" value="gff"/> <param name="reference_source_selector" value="history"/> <param name="input_fasta" value="pBR322_test2.fna" /> - <param name="input_gff" value="pBR322.gff3"/> + <param name="input" value="pBR322.gff3"/> <output name="snpeff_output"> <assert_contents> <has_text text="pBR322" /> @@ -198,7 +161,7 @@ <param name="input_type_selector" value="gff"/> <param name="reference_source_selector" value="history"/> <param name="input_fasta" value="pBR322_test2.fna.gz" /> - <param name="input_gff" value="pBR322.gff3"/> + <param name="input" value="pBR322.gff3"/> <output name="snpeff_output"> <assert_contents> <has_text text="pBR322" /> @@ -210,7 +173,7 @@ <param name="input_type_selector" value="gtf"/> <param name="reference_source_selector" value="history"/> <param name="input_fasta" value="Saccharomyces_mito.fa.gz" /> - <param name="input_gtf" value="Saccharomyces_mito.gtf" /> + <param name="input" value="Saccharomyces_mito.gtf" /> <output name="snpeff_output"> <assert_contents> <has_text text="Saccharomyces_mito" />
--- a/snpEff_macros.xml Wed Oct 13 23:30:29 2021 +0000 +++ b/snpEff_macros.xml Wed Aug 03 16:33:45 2022 +0000 @@ -18,6 +18,22 @@ <token name="@SNPEFF_VERSION@">SnpEff4.3</token> <token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token> <token name="@JAVA_OPTIONS@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token> + <xml name="ref_select"> + <conditional name="reference_source"> + <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> + <option value="cached">Locally cached</option> + <option value="history">History</option> + </param> + <when value="cached"> + <param name="ref_file" type="select" label="Select reference genome"> + <options from_data_table="fasta_indexes"/> + </param> + </when> + <when value="history"> + <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/> + </when> + </conditional> + </xml> <token name="@EXTERNAL_DOCUMENTATION@">