Mercurial > repos > cmonjeau > discosnp_plus_plus
changeset 0:1beb3ed9e1e3
Imported from capsule None
author | cmonjeau |
---|---|
date | Fri, 05 Jun 2015 11:40:18 -0400 |
parents | |
children | 7ecd10051eff |
files | bbric_disco.py bbric_disco.xml tool_dependencies.xml |
diffstat | 3 files changed, 286 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bbric_disco.py Fri Jun 05 11:40:18 2015 -0400 @@ -0,0 +1,126 @@
"""Galaxy wrapper script around ``run_discoSnp++.sh``.

Created by Cyril Monjeaud
Cyril.Monjeaud@irisa.fr
Modified by Fabrice Legeai
fabrice.legeai@rennes.inra.fr

Last modifications : 04/21/2015

WARNING :

discoSNP++.py needs:

- run_discoSnp++.sh
- discoSNP++_to_genotypes.py
- the build repository next to the scripts

All these files are available after compiling the sources of discoSNP :

https://colibread.inria.fr/files/2013/10/DiscoSNPpp-2.0.6-Source.zip

or with the package_discoSnp_plus_plus package in the toolshed
"""

import glob
import optparse
import os
import re
import shutil
import stat
import subprocess
import sys
import tempfile
from os.path import basename

# Directory (relative to the job working directory) that receives the raw
# outputs of run_discoSnp++.sh; "galaxy" is the output prefix inside it.
_OUTPUT_DIR = "job_outputs"
_OUTPUT_PREFIX = _OUTPUT_DIR + "/galaxy"

# Options whose value is always forwarded to run_discoSnp++.sh. A missing one
# would otherwise reach subprocess as None and fail with an obscure TypeError.
_REQUIRED_OPTIONS = (
    ("-r", "data_files"),
    ("-b", "branching_bubbles"),
    ("-D", "deletions"),
    ("-P", "min_snps"),
    ("-k", "kmer"),
    ("-c", "coverage"),
    ("-C", "maxcoverage"),
    ("-d", "error_threshold"),
)


def _parse_options():
    """Parse the command line and return the optparse options object.

    Exits through ``parser.error`` (usage message, status 2) when one of the
    options listed in ``_REQUIRED_OPTIONS`` is absent.
    """
    parser = optparse.OptionParser()
    parser.add_option("-r", dest="data_files")
    parser.add_option("-b", dest="branching_bubbles")
    parser.add_option("-D", dest="deletions")
    parser.add_option("-P", dest="min_snps")
    parser.add_option("-l", action="store_true", dest="low_complexity")
    parser.add_option("-k", dest="kmer")
    parser.add_option("-t", action="store_true", dest="left_right_unitigs")
    parser.add_option("-T", action="store_true", dest="left_right_contigs")
    parser.add_option("-c", dest="coverage")
    parser.add_option("-C", dest="maxcoverage")
    parser.add_option("-d", dest="error_threshold")
    parser.add_option("-n", action="store_true", dest="genotypes")
    parser.add_option("-G", dest="reference")
    parser.add_option("-M", dest="mapping_error")

    options, _args = parser.parse_args()

    missing = [flag for flag, dest in _REQUIRED_OPTIONS
               if getattr(options, dest) is None]
    if missing:
        parser.error("missing required option(s): " + ", ".join(missing))
    return options


def _link_inputs(data_files):
    """Symlink every listed read set under a name with a usable extension.

    ``data_files`` is a text file with one ``<index>::<path>`` entry per line;
    blank lines are skipped. Each path is linked in the current directory as
    ``input<index+1>.fasta`` when its first byte is ``>``, and as
    ``input<index+1>.fastq`` otherwise.

    Returns the list of link names, in listing order.
    """
    link_files = []
    with open(data_files, "r") as listing:
        for line in listing:
            entry = line.strip()
            if not entry:
                continue
            # Split only on the first separator so a path that itself
            # contains "::" is kept whole.
            index, path = entry.split("::", 1)
            number = int(index) + 1

            # Only the first byte is needed to tell fasta from fastq; reading
            # in binary avoids decoding errors on compressed inputs.
            # NOTE(review): gzipped inputs are therefore linked as ".fastq" --
            # confirm run_discoSnp++.sh accepts that.
            with open(path, "rb") as reads:
                is_fasta = reads.read(1) == b">"

            extension = "fasta" if is_fasta else "fastq"
            link_file = "input{0}.{1}".format(number, extension)
            os.symlink(path, link_file)
            link_files.append(link_file)
    return link_files


def _build_command(options, link_files):
    """Return the argv list used to launch run_discoSnp++.sh."""
    # The script name is not an absolute path: bash has to resolve it.
    cmd_line = ["/bin/bash", "run_discoSnp++.sh"]

    # All read sets are passed as ONE space-separated argument to -r.
    cmd_line.extend([
        "-r", " ".join(link_files),
        "-b", options.branching_bubbles,
        "-D", options.deletions,
        "-P", options.min_snps,
        "-k", options.kmer,
        "-c", options.coverage,
        "-C", options.maxcoverage,
        "-d", options.error_threshold,
    ])

    # Value-less switches, forwarded only when set.
    for enabled, flag in ((options.low_complexity, "-l"),
                          (options.left_right_unitigs, "-t"),
                          (options.left_right_contigs, "-T"),
                          (options.genotypes, "-n")):
        if enabled:
            cmd_line.append(flag)

    # genotype part: mapping on a reference genome
    if options.reference:
        cmd_line.extend(["-G", options.reference])
        if options.mapping_error is not None:
            cmd_line.extend(["-M", options.mapping_error])

    cmd_line.extend(["-p", _OUTPUT_PREFIX])
    return cmd_line


def _to_bytes(text):
    """Return ``text`` as bytes (unchanged if it already is bytes)."""
    return text if isinstance(text, bytes) else text.encode("utf-8")


def __main__():
    """Run discoSnp++ on the listed read sets and collect its outputs.

    Steps: parse the options, create ``job_outputs``, symlink the inputs,
    run ``run_discoSnp++.sh``, write ``report.txt`` (command line, stdout and
    stderr of the run), then move ``job_outputs/*_coherent.fa`` to
    ``coherent.fasta`` and ``job_outputs/*_coherent.vcf`` to ``coherent.vcf``.

    Exits with a non-zero status when run_discoSnp++.sh fails.
    """
    options = _parse_options()

    # create the working dir inside job_working_dir
    if not os.path.isdir(_OUTPUT_DIR):
        os.mkdir(_OUTPUT_DIR)

    # transform .dat into .fasta or .fastq for kissreads2
    link_files = _link_inputs(options.data_files)
    cmd_line = _build_command(options, link_files)

    # execute job
    process = subprocess.Popen(cmd_line,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdoutput, stderror = process.communicate()

    # report file; binary mode because the captured streams are bytes.
    # stderr goes to the report as well because the tool uses it for
    # informational messages.
    with open("report.txt", "wb") as logfile:
        logfile.write(
            _to_bytes("[COMMAND LINE]" + " ".join(cmd_line) + "\n\n"))
        logfile.write(stdoutput)
        logfile.write(stderror)

    # change .fa extension to .fasta for a correct print inside Galaxy
    for fafile in glob.glob(_OUTPUT_DIR + "/*_coherent.fa"):
        shutil.move(fafile, "coherent.fasta")
    for vcffile in glob.glob(_OUTPUT_DIR + "/*_coherent.vcf"):
        shutil.move(vcffile, "coherent.vcf")

    # Propagate a failure instead of silently reporting success.
    if process.returncode != 0:
        sys.stderr.write(
            "run_discoSnp++.sh exited with status %d; see report.txt\n"
            % process.returncode)
        sys.exit(process.returncode if process.returncode > 0 else 1)


if __name__ == "__main__":
    __main__()
<!-- bbric_disco.xml: new file in this changeset (diff against /dev/null, Fri Jun 05 11:40:18 2015 -0400, @@ -0,0 +1,151 @@) -->
<!-- Galaxy tool definition for discoSnp++; it runs bbric_disco.py with the options built in the command template. -->
<tool id="discosnp_pp" name="discoSnp++" version="2.1.7">
<description>is an efficient tool for detecting SNPs without a reference genome.</description>
    <requirements>
        <requirement type="package" version="2.1.7">discoSnp_plus_plus</requirement>
        <requirement type="package" version="0.6.2">bwa</requirement>
    </requirements>
    <command interpreter="python">
bbric_disco.py
-r ${datfile}
-b $branching_bubbles
-D $deletions
-P $limit_snp
#if $low_complexity
-l
#end if
-k $kmer
#if (str($extension) == 't'):
-t
#end if
#if (str($extension) == 'T'):
-T
#end if
-c $coverage
-C ${maxcoverage}
-d $error_threshold
#if (str($VCF_option.mapping) == 'reference'):
-G ${VCF_option.reference}
-M ${VCF_option.mapping_error}
#end if
    </command>

    <inputs>
        <repeat name="input_list" title="input files" min="1">
            <param name="input" type="data" format="fasta,fastq,fastq.gz" label="input"/>
        </repeat>

        <param name="kmer" type="integer" label="Size of kmers" value="31" />
        <param name="coverage" type="integer" label="Minimal coverage per read set" value="4" />
        <param name="maxcoverage" type="integer" label="Maximal coverage per read set" value="2147483647" help="default value = 2^31-1" />
        <param name="error_threshold" type="integer" label="Max number of errors per read" value="1" help="Max number of errors per read" />

        <param name="branching_bubbles" type="select" label="branching strategy">
            <option value="0">variants for which any of the two paths is branching are discarded</option>
            <option value="1">forbid SNPs for which the two paths are branching</option>
            <option value="2">No limitation on branching</option>
        </param>

        <param name="deletions" type="integer" label="deletion size" value="0" help="If different from 0, discoSnp++ will search for deletions of size from 1 to the value included"/>
        <param name="limit_snp" type="integer" label="maximum SNPs per bubble" value="1" help="discoSnp++ will search up to the value SNPs in a unique bubble"/>
        <param name="low_complexity" type="boolean" default="False" checked="False" label="accept low complexity bubbles" />

        <!-- 'n' adds no flag to the command line; 't' and 'T' are forwarded as -t / -T -->
        <param name="extension" type="select" label="extension strategy">
            <option value="n">extends to 30bp on left and right</option>
            <option value="t">extends left and right until a polymorphism is found (unitigs)</option>
            <option value="T">extends left and right using local assembly (contigs)</option>
        </param>

        <!-- Each <when> value must match one of the <option> values of the "mapping" select -->
        <conditional name="VCF_option" >
            <param name="mapping" type="select" label="VCF option">
                <option value="default">Do not use reference genome</option>
                <option value="reference">Mapping with a reference genome</option>
            </param>
            <when value="default"></when>
            <when value="reference">
                <param name="reference" type="data" format="fasta,fastq" label="Reference genome file" />
                <param name="mapping_error" type="integer" value="4" label="Maximal number of mapping errors" help="during BWA mapping phase" />
            </when>
        </conditional>

    </inputs>

    <outputs>
<!-- <data name="report" from_work_dir="report.txt" format="txt" label="Output of ${tool.name} on $on_string"/> -->
        <data name="vcf" from_work_dir="coherent.vcf" format="vcf" label="VCF of ${tool.name} on $on_string"/>
        <data name="fasta" from_work_dir="coherent.fasta" format="fasta" label="Multifasta of the polymorphisms - ${tool.name} on $on_string"/>
    </outputs>

    <!-- One "<index>::<path>" line per input file; bbric_disco.py reads this file through -r -->
    <configfiles>
        <configfile name="datfile">
            #for $i, $lib in enumerate ($input_list)
            ${i}::${lib.input}
            #end for
        </configfile>
    </configfiles>
    <help>

**Description**

Software discoSnp is designed for discovering Single Nucleotide Polymorphism (SNP) from raw set(s) of reads obtained with Next Generation Sequencers (NGS).
Note that the number of input read sets is not constrained, it can be one, two, or more. Note also that no other data such as a reference genome or annotations are needed.
The software is composed of two modules. The first module, kissnp2, detects SNPs from read sets. A second module, kissreads, enhances the kissnp2 results by computing per read set and for each found SNP i/ its mean read coverage and ii/ the (phred) quality of reads generating the polymorphism.

Note that from release of DiscoSnp++-2.0.6, the tool also detects close SNPs and indels.

-------

.. class:: warningmark

**Input parameters**

-Sequences files in fasta, fastq or fastq.gz, each allele will be counted in each file individually

-Fasta sequence of a genome, in case you want to map the sequence extensions on a reference in order to get a compliant VCF

-------

.. class:: warningmark

**Output parameters**

-VCF file with coordinates on the higher branch sequences or on a reference genome if provided

-Fasta file with sequence extensions around the SNP.


-------

**Web site**

https://colibread.inria.fr/software/discosnp/

-------

**Integrated by**

Cyril Monjeaud and Fabrice Legeai

GenOuest Bio-informatics Core Facility

UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France)

support@genouest.org

If you use this tool in Galaxy, please cite :

`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. &lt;https://www.e-biogenouest.org/resources/128&gt;`_

    </help>
<citations>
<citation type="doi">10.1093/nar/gku1187</citation>
<citation type="bibtex">@INPROCEEDINGS{JOBIM2013,
    author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.},
    title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France},
    booktitle = {JOBIM 2013 Proceedings},
    year = {2013},
    url = {https://www.e-biogenouest.org/resources/128},
    pages = {97-106}
    }
</citation>
</citations>
</tool>
<?xml version="1.0"?>
<!-- tool_dependencies.xml: new file in this changeset (diff against /dev/null, Fri Jun 05 11:40:18 2015 -0400, @@ -0,0 +1,9 @@) -->
<!-- Tool Shed repositories providing the packages named in the <requirements> of bbric_disco.xml. -->
<tool_dependency>
    <!-- discoSnp++ itself, version 2.1.7 -->
    <package name="discoSnp_plus_plus" version="2.1.7">
        <repository
            name="package_discosnp_plus_plus"
            owner="cmonjeau"
            toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>
    <!-- BWA 0.6.2 -->
    <package name="bwa" version="0.6.2">
        <repository
            name="package_bwa_0_6_2"
            owner="cmonjeau"
            toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>
</tool_dependency>