Mercurial > repos > cmonjeau > stacks
changeset 0:d6ba40f6c824
first commit
| author | cmonjeau | 
|---|---|
| date | Mon, 24 Aug 2015 09:29:12 +0000 | 
| parents | |
| children | ccfa8e539bdf | 
| files | STACKS_denovomap.py STACKS_denovomap.xml STACKS_genotypes.py STACKS_genotypes.xml STACKS_population.py STACKS_population.xml STACKS_prepare_population_map.py STACKS_prepare_population_map.xml STACKS_procrad.py STACKS_procrad.xml STACKS_refmap.py STACKS_refmap.xml STACKS_sort_read_pairs.py STACKS_sort_read_pairs.xml bwa_index.loc.sample bwa_wrapper.py bwa_wrapper.xml stacks.py tool_data_table_conf.xml.sample tool_dependencies.xml | 
| diffstat | 20 files changed, 4716 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_denovomap.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,265 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import tempfile +import shutil +import subprocess +import glob +import argparse +from os.path import basename +import zipfile +import tarfile +import gzip +from galaxy.datatypes.checkers import * +from stacks import * + + +def __main__(): + + # arguments recuperation + + parser = argparse.ArgumentParser() + parser.add_argument('-p') + parser.add_argument('-b') + parser.add_argument('-r') + parser.add_argument('-s') + parser.add_argument('-O') + parser.add_argument('-m') + parser.add_argument('-P') + parser.add_argument('-M') + parser.add_argument('-N') + parser.add_argument('-n') + parser.add_argument('-t') + parser.add_argument('-H') + parser.add_argument('--bound_low') + parser.add_argument('--bound_high') + parser.add_argument('--alpha') + parser.add_argument('--logfile') + parser.add_argument('--compress_output') + parser.add_argument('--catalogsnps') + parser.add_argument('--catalogalleles') + parser.add_argument('--catalogtags') + + # additionnal outputs + parser.add_argument('--total_output') + parser.add_argument('--tags_output') + parser.add_argument('--snps_output') + parser.add_argument('--alleles_output') + parser.add_argument('--matches_output') + + options = parser.parse_args() + + # create working directories + + os.mkdir('inputs') + os.mkdir('job_outputs') + os.mkdir('galaxy_outputs') + + cmd_line = [] + cmd_line.append('denovo_map.pl') + + # if genetic map + + if options.p: + + # parse config files + + tab_parent_files = galaxy_config_to_tabfiles_for_STACKS(options.p) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_parent_files, 'inputs') + + # check files extension (important to have .fq or .fasta files) + + check_fastq_extension_and_add(tab_parent_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_parent_files, 'inputs') + + # parse the input dir and store all file names into a tab + + fastq_files = [] + for fastq_file in glob.glob('inputs/*'): + # if is a file (skip repository created after a decompression) + if os.path.isfile(fastq_file): + fastq_files.append(fastq_file) + + fastq_files.sort() + + # test if fastq are paired-end + if options.b == 'true': + for n in range(0, len(fastq_files), 2): + cmd_line.extend(['-p', fastq_files[n]]) + else: + for myfastqfile in fastq_files: + cmd_line.extend(['-p', myfastqfile]) + + # if genetic map with progeny files + + if options.r: + + # parse config files + tab_progeny_files = galaxy_config_to_tabfiles_for_STACKS(options.r) + + # check if zipped files are into the tab and change tab content + extract_compress_files_from_tabfiles(tab_progeny_files, 'inputs') + + # check files extension (important to have .fq or .fasta files) + check_fastq_extension_and_add(tab_progeny_files, 'inputs') + + # create symlink into the temp dir + create_symlinks_from_tabfiles(tab_progeny_files, 'inputs') + + for key in tab_progeny_files: + + # if is a file (skip repository created after a decompression) + + if os.path.isfile('inputs/' + key): + cmd_line.extend(['-r', 'inputs/' + key]) + + # if population is checked + if options.s: + + tab_individual_files = galaxy_config_to_tabfiles_for_STACKS(options.s) + + # check if zipped files are into the tab and change tab content + extract_compress_files_from_tabfiles(tab_individual_files, 'inputs') + + # check files extension (important to have .fq or .fasta files) + check_fastq_extension_and_add(tab_individual_files, 'inputs') + + # create symlink into the temp dir + create_symlinks_from_tabfiles(tab_individual_files, 'inputs') + + # create the command input line + for key in tab_individual_files: + + # if is a file (skip repository created after a decompression) + if os.path.isfile('inputs/' + key): + cmd_line.extend(['-s', 'inputs/' + key]) + + # create the command line + cmd_line.extend([ + '-S', + '-b', + '1', + '-T', + '4', + '-o', + 'job_outputs/' + ]) + + if options.O: + cmd_line.extend(['-O', options.O]) + + if options.m and options.m != '-1': + cmd_line.extend(['-m', options.m]) + + if options.P and options.P != '-1': + cmd_line.extend(['-P', options.P]) + + if options.M and options.M != '-1': + cmd_line.extend(['-M', options.M]) + + if options.N and options.N != '-1': + cmd_line.extend(['-N', options.N]) + + if options.n and options.n != '-1': + cmd_line.extend(['-n', options.n]) + + if options.t and options.t == 'true': + cmd_line.append('-t') + + if options.H and options.H == 'true': + cmd_line.append('-H') + + ## SNP model + if options.bound_low: + cmd_line.extend(['--bound_low', options.bound_low]) + cmd_line.extend(['--bound_high', options.bound_high]) + + if options.alpha: + cmd_line.extend(['--alpha', options.alpha]) + + # launch the command line + print "[CMD_LINE] : "+' '.join(cmd_line) + + p = subprocess.call(cmd_line) + + # postprocesses + try: + shutil.move('job_outputs/denovo_map.log', options.logfile) + except: + sys.stderr.write('Error in denovo_map execution; Please read the additional output (stdout)\n') + sys.exit(1) + + # go inside the outputs dir + os.chdir('job_outputs') + + # move files + for i in glob.glob('*'): + if re.search('catalog.snps.tsv$', i): + shutil.copy(i, options.catalogsnps) + if re.search('catalog.alleles.tsv$', i): + shutil.copy(i, options.catalogalleles) + if re.search('catalog.tags.tsv$', i): + shutil.copy(i, options.catalogtags) + + list_files = glob.glob('*') + + # if compress output is total + if options.compress_output == 'total': + + mytotalzipfile = zipfile.ZipFile('total.zip.temp', 'w', + allowZip64=True) + + for i in list_files: + mytotalzipfile.write(os.path.basename(i)) + + # return the unique archive + shutil.move('total.zip.temp', options.total_output) + elif options.compress_output == 'categories': + + # if compress output is by categories + mytagszip = zipfile.ZipFile('tags.zip.temp', 'w', allowZip64=True) + mysnpszip = zipfile.ZipFile('snps.zip.temp', 'w', allowZip64=True) + myalleleszip = zipfile.ZipFile('alleles.zip.temp', 'w', allowZip64=True) + mymatcheszip = zipfile.ZipFile('matches.zip.temp', 'w', allowZip64=True) + + for i in list_files: + # for each type of files + if re.search("tags\.tsv$", i) and not re.search('batch', i): + mytagszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("snps\.tsv$", i) and not re.search('batch', i): + mysnpszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("alleles\.tsv$", i) and not re.search('batch', i): + myalleleszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("matches\.tsv$", i) and not re.search('batch', i): + mymatcheszip.write(os.path.basename(i)) + os.remove(i) + else: + shutil.move(os.path.basename(i), '../galaxy_outputs') + + # return archives.... + shutil.move('tags.zip.temp', options.tags_output) + shutil.move('snps.zip.temp', options.snps_output) + shutil.move('alleles.zip.temp', options.alleles_output) + shutil.move('matches.zip.temp', options.matches_output) + else: + # else no compression + for i in list_files: + shutil.move(os.path.basename(i), '../galaxy_outputs') + + +if __name__ == '__main__': + __main__() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_denovomap.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,387 @@ +<tool id="STACKSdenovomap" name="STACKS : De novo map" force_history_refresh="True"> + <description>Run the STACKS denovo_map.pl wrapper</description> + +<configfiles> +<configfile name="parent_sequences"> +#if str( $options_usage.options_usage_selector ) == "genetic" +#for $input in $options_usage.parent_sequence: +${input.display_name}::${input} +#end for +#end if +</configfile> +<configfile name="progeny_sequences"> +#if str( $options_usage.options_usage_selector ) == "genetic" and str( $options_usage.options_progeny.options_progeny_selector ) == "yes" +#for $input in $options_usage.options_progeny.progeny_sequence: +${input.display_name}::${input} +#end for +#end if +</configfile> +<configfile name="individual_samples"> +#if str( $options_usage.options_usage_selector ) == "population" +#for $input in $options_usage.individual_sample: +${input.display_name}::${input} +#end for +#end if +</configfile> +</configfiles> + +<requirements> + <requirement type="package" version="1.18">stacks</requirement> +</requirements> + +<command interpreter="python"> +STACKS_denovomap.py +#if str( $options_usage.options_usage_selector ) == "genetic" +-p $parent_sequences +-b $options_usage.paired +#if str( $options_usage.options_progeny.options_progeny_selector ) == "yes" +-r $progeny_sequences +#end if +#else +-s $individual_samples +#if str( $options_usage.options_popmap.popmap_selector) == "yes" +-O $options_usage.options_popmap.popmap +#end if +#end if +-m $advanced_options.minident +-P $advanced_options.minidentprogeny +-M $advanced_options.mismatchbetlociproc +-N $advanced_options.mismatchsecond +-n $advanced_options.mismatchbetlocibuild +-t $advanced_options.remove_hightly +-H $advanced_options.disable_calling +## snp_model +#if str( $snp_options.select_model.model_type) == "bounded" +--bound_low $snp_options.select_model.boundlow +--bound_high $snp_options.select_model.boundhigh +--alpha $snp_options.select_model.alpha +#else +--alpha $snp_options.select_model.alpha +#end if +## outputs +--catalogsnps $catalogsnps +--catalogalleles $catalogalleles +--catalogtags $catalogtags +--logfile $output +--compress_output $output_compress +##additionnal outputs +--total_output $total_output +--tags_output $tags_output +--snps_output $snps_output +--alleles_output $alleles_output +--matches_output $matches_output + +</command> + +<inputs> + <conditional name="options_usage"> + <param name="options_usage_selector" type="select" label="Select your usage"> + <option value="genetic" selected="true">Genetic map</option> + <option value="population">Population</option> + </param> + <when value="genetic"> + <param name="parent_sequence" format="fastq,fasta,zip,tar.gz" type="data" multiple="true" label="Files containing parent sequences" help="FASTQ/FASTA/ZIP/TAR.GZ files containing parent sequences from a mapping cross" /> + <param name="paired" type="boolean" checked="false" default="false" label="Paired-end fastq files?" help="be careful, all files must have a paired-end friend"/> + <conditional name="options_progeny"> + <param name="options_progeny_selector" type="select" label="Use progeny files"> + <option value="yes" selected="true">Yes</option> + <option value="no">No</option> + </param> + <when value="yes"> + <param name="progeny_sequence" format="fastq,fasta,zip,tar.gz" type="data" multiple="true" label="Files containing progeny sequences" help="FASTQ/FASTA/ZIP/TAR.GZ files containing progeny sequences from a mapping cross" /> + </when> + <when value="no"> + </when> + </conditional> + </when> + <when value="population"> + <param name="individual_sample" format="fastq,fasta,zip,tar.gz" type="data" multiple="true" label="Files containing an individual sample from a population" help="FASTQ/FASTA/ZIP/TAR.GZ files contiaining an individual sample from a population" /> + <conditional name="options_popmap"> + <param name="popmap_selector" type="select" label="Analyzing one or more populations?" > + <option value="no" selected="true">No</option> + <option value="yes">Yes</option> + </param> + <when value="no"></when> + <when value="yes"> + <param name="popmap" type="data" format="tabular,txt" label="Specify a population map" help="If analyzing one or more populations, specify a population map" /> + </when> + </conditional> + + </when> + </conditional> + <!-- stack assembly options --> + <section name="advanced_options" title="advanced_options" expanded="False"> + <param name="minident" type="integer" value="-1" label="Minimum number of identical raw reads required to create a stack" help="leave -1 if you don't use the parameter" /> + <param name="minidentprogeny" type="integer" value="-1" label="Minimum number of identical raw reads required to create a stack (progeny)" help="leave -1 if you don't use the parameter" /> + <param name="mismatchbetlociproc" type="integer" value="2" label="Number of mismatches allowed between loci when processing a single individual"/> + <param name="mismatchsecond" type="integer" value="-1" label="Number of mismatches allowed when aligning secondary reads" help="leave -1 if you don't use the parameter" /> + <param name="mismatchbetlocibuild" type="integer" value="0" label="specify the number of mismatches allowed between loci when building the catalog"/> + <param name="remove_hightly" type="boolean" checked="false" default="false" label="remove, or break up, highly repetitive RAD-Tags in the ustacks program" /> + <param name="disable_calling" type="boolean" checked="false" default="false" label="disable calling haplotypes from secondary reads" /> + </section> + <!-- SNP Model options --> + <section name="snp_options" title="SNP_Model_Options" expanded="False"> + <conditional name="select_model"> + <param name="model_type" type="select" label="Choose the model"> + <option value="snp" selected="true">SNP</option> + <option value="bounded">Bounded</option> + </param> + <when value="snp"> + <param name="alpha" type="float" value="0.05" min="0.001" max="0.1" label="chi square significance level required to call a heterozygote or homozygote" help="either 0.1, 0.05 (default), 0.01, or 0.001" /> + </when> + <when value="bounded"> + <param name="boundlow" type="float" value="0.0" min="0.0" max="1.0" label="lower bound for epsilon, the error rate" help="between 0 and 1.0"/> + <param name="boundhigh" type="float" value="1.0" min="0.0" max="1.0" label="upper bound for epsilon, the error rate" help="between 0 and 1.0" /> + <param name="alpha" type="float" value="0.05" min="0.001" max="0.1" label="chi square significance level required to call a heterozygote or homozygote" help="either 0.1, 0.05 (default), 0.01, or 0.001" /> + </when> + </conditional> + </section> + <!-- Output options --> + <param name="output_compress" type="select" label="Output type" help="please see below for details"> + <option value="default" selected="true">No compression</option> + <option value="categories">Compressed by categories</option> + <option value="total">Compressed all outputs</option> + </param> +</inputs> +<outputs> + + <data format="txt" name="output" label="result.log with ${tool.name} on ${on_string}" /> + <data format="txt" name="additional" label="additional file with ${tool.name}" hidden="true"> + <discover_datasets pattern="__designation_and_ext__" directory="galaxy_outputs" visible="true" /> + </data> + <data format="tabular" name="catalogsnps" label="catalog.snps with ${tool.name} on ${on_string}" /> + <data format="tabular" name="catalogalleles" label="catalog.alleles with ${tool.name} on ${on_string}" /> + <data format="tabular" name="catalogtags" label="catalog.tags with ${tool.name} on ${on_string}" /> + + + <!-- additionnal output archives --> + <data format="zip" name="total_output" label="total_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "total"</filter> + </data> + <data format="zip" name="tags_output" label="tags_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + <data format="zip" name="snps_output" label="snps_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + <data format="zip" name="alleles_output" label="alleles_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + <data format="zip" name="matches_output" label="matches_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + +</outputs> +<stdio> +<exit_code range="1" level="fatal" description="Error in Stacks Denovo execution" /> +</stdio> +<help> + +.. class:: infomark + +**What it does** + +This program will run each of the Stacks components: first, running ustacks on each of the samples specified, building loci and calling SNPs in each. Second, cstacks will be run to create a catalog of all loci that were marked as 'parents' or 'samples' on the command line, and finally, sstacks will be executed to match each sample against the catalog. A bit more detail on this process can be found in the FAQ. The denovo_map.pl program will also load the results of each stage of the analysis: individual loci, the catalog, and matches against the catalog into the database (although this can be disabled). After matching, the program will build a database index to speed up access (index_radtags.pl) and enable web-based filtering. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +FASTQ, FASTA, zip, tar.gz + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. + 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. + Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Output type:** + +- Output type details:: + + No compression All files will be added in the current history. + Compressed by categories Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history. + Compressed all outputs All files will be compressed in an unique zip archive. Batch files will be added in the current history with the archive. + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_genotypes.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,143 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import tempfile +import shutil +import subprocess +import glob +import argparse +from os.path import basename +import zipfile +import tarfile +import gzip +from galaxy.datatypes.checkers import * +from stacks import * + + +def __main__(): + + # arguments recuperation + + parser = argparse.ArgumentParser() + parser.add_argument('-P') + parser.add_argument('-b') + parser.add_argument('-c') + parser.add_argument('-t') + parser.add_argument('-o') + parser.add_argument('-e') + parser.add_argument('--active_advanced') + parser.add_argument('-r') + parser.add_argument('-m') + parser.add_argument('-B') + parser.add_argument('-W') + parser.add_argument('--active_autocorrect') + parser.add_argument('--min_hom_seqs') + parser.add_argument('--min_het_seqs') + parser.add_argument('--max_het_seqs') + + # multifile management + + parser.add_argument('--logfile') + parser.add_argument('--compress_output') + + # additionnal outputs + + parser.add_argument('--total_output') + + options = parser.parse_args() + + # create the working dir + + os.mkdir('job_outputs') + os.mkdir('galaxy_outputs') + + os.chdir('job_outputs') + + # edit the command line + + cmd_line = [] + cmd_line.append("genotypes") + + # STACKS_archive + # check if zipped files are into the tab + + extract_compress_files(options.P, os.getcwd()) + + # create the genotypes command input line + + cmd_line.extend(["-b", options.b, "-P", os.getcwd()]) + + # create the genotypes command line + + if options.e: + cmd_line.extend(["-e", options.e]) + if options.c == 'true': + cmd_line.append("-c") + if options.t: + cmd_line.extend(["-t", options.t]) + if options.o: + cmd_line.extend(["-o", options.o]) + + # if advanced is activate + if options.active_advanced == "true": + cmd_line.extend(["-r", options.r]) + cmd_line.extend(["-m", options.m]) + if options.B: + cmd_line.extend(["-B", options.B]) + if options.W: + cmd_line.extend(["-W", options.W]) + + # if autocorrect is activate + if options.active_autocorrect == "true": + cmd_line.extend(["--min_hom_seqs", options.min_hom_seqs]) + cmd_line.extend(["--min_het_seqs", options.min_het_seqs]) + cmd_line.extend(["--max_het_seqs", options.max_het_seqs]) + + # command with dependencies installed + print "[CMD]:"+' '.join(cmd_line) + subprocess.call(cmd_line) + + # postprocesses + try: + shutil.copy('batch_1.haplotypes_1.tsv', options.logfile) + except: + sys.stderr.write('Error in genotypes execution; Please read the additional output (stdout)\n') + sys.exit(1) + + # copy all files inside tmp_dir into workdir + + list_files = glob.glob('*') + + # if compress output is total + + if options.compress_output == 'total': + mytotalzipfile = zipfile.ZipFile('total.zip.temp', 'w') + + for i in list_files: + if re.search('^batch', os.path.basename(i)) \ + and not re.search("\.tsv$", os.path.basename(i)) \ + or re.search(".*_[0-9]*\.tsv$", os.path.basename(i)) \ + or re.search('.*genotypes.*', os.path.basename(i)): + mytotalzipfile.write(i, os.path.basename(i)) + + # return the unique archive + + shutil.move('total.zip.temp', options.total_output) + + # if compress output is default + if options.compress_output == 'default': + for i in list_files: + if re.search('^batch', os.path.basename(i)) \ + and not re.search("\.tsv$", os.path.basename(i)) \ + or re.search(".*_[0-9]*\.tsv$", os.path.basename(i)) \ + or re.search('.*genotypes.*', os.path.basename(i)): + shutil.move(i, '../galaxy_outputs') + + +if __name__ == '__main__': + __main__() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_genotypes.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,379 @@ +<tool id="STACKSgenotypes" name="STACKS : genotypes" force_history_refresh="True"> + <description>Run the STACKS genotypes program</description> + + +<requirements> + <requirement type="package" version="1.18">stacks</requirement> +</requirements> + +<command interpreter="python"> + +STACKS_genotypes.py +-P $STACKS_archive +-b $batchid +-c $corrections +#if str( $options_output.options_output_selector ) == "1": +-t $options_output.map +-o $options_output.filetype +#end if +#if str( $options_enzyme.options_enzyme_selector ) == "1": +-e $options_enzyme.enzyme +#end if +--active_advanced $active_advanced +-r $advanced_options.minprogeny +-m $advanced_options.mindepth +#if str( $advanced_options.blacklistselect.advanced_blackoptions_selector) == "advanced" +-B $advanced_options.blacklistselect.blacklist +#end if +#if str( $advanced_options.whitelistselect.advanced_whiteoptions_selector) == "advanced" +-W $advanced_options.whitelistselect.whitelist +#end if +--active_autocorrect $active_autocorrect +--min_hom_seqs $options_autocorrect.hom +--min_het_seqs $options_autocorrect.het +--max_het_seqs $options_autocorrect.hetmax +--logfile $output +--compress_output $output_compress +##additionnal outputs +--total_output $total_output + + +</command> + +<inputs> + <param name="STACKS_archive" format="zip,tar.gz" type="data" label="Archive from STACKS pipeline regrouping all outputs" /> + <param name="batchid" type="integer" value="1" label="Batch ID" help="Batch ID to examine when exporting from the catalog" /> + <conditional name="options_output"> + <param name="options_output_selector" type="select" label="Did you want to use the file type output option?"> + <option value="1">Yes</option> + <option value="2" selected="true">No</option> + </param> + <when value="1"> + <param name="map" type="select" format="text" label="map type" help="map type to write. 'CP', 'DH', 'F2', 'BC1', and 'GEN' are the currently supported map types" > + <option value="CP">CP</option> + <option value="DH">DH</option> + <option value="F2">F2</option> + <option value="BC1">BC1</option> + <option value="GEN">GEN</option> + </param> + <param name="filetype" type="select" format="text" label="output file type" help="output file type to write, 'joinmap', 'onemap', 'rqtl', and 'genomic' are currently supported" > + <option value="joinmap">joinmap</option> + <option value="onemap">onemap</option> + <option value="rqtl">rqtl</option> + <option value="genomic">genomic</option> + </param> + </when> + <when value="2"> + </when> + </conditional> + <conditional name="options_enzyme"> + <param name="options_enzyme_selector" type="select" label="Did you want to use the genomic output option?"> + <option value="1">Yes</option> + <option value="2" selected="true">No</option> + </param> + <when value="1"> + <param name="enzyme" type="select" format="text" label="provide the restriction enzyme used" help="required if generating genomic output" > + <option value="apeKI">apeKI</option> + <option value="bamHI">bamHI</option> + <option value="claI">claI</option> + <option value="dpnII">dpnII</option> + <option value="eaeI">eaeI</option> + <option value="ecoRI">ecoRI</option> + <option value="ecoT22I">ecoT22I</option> + <option value="hindIII">hindIII</option> + <option value="mluCI">mluCI</option> + <option value="mseI">mseI</option> + <option value="mspI">mspI</option> + <option value="ndeI">ndeI</option> + <option value="nlaIII">nlaIII</option> + <option value="notI">notI</option> + <option value="nsiI">nsiI</option> + <option value="pstI">pstI</option> + <option value="sau3AI">sau3AI</option> + <option value="sbfI">sbfI</option> + <option value="sexAI">sexAI</option> + <option value="sgrAI">sgrAI</option> + <option value="sphI">sphI</option> + <option value="taqI">taqI</option> + <option value="xbaI">xbaI</option> + </param> + </when> + <when value="2"> + </when> + </conditional> + <param name="corrections" type="boolean" checked="false" default="false" label="Activate automated corrections" /> + <param name="active_autocorrect" type="boolean" checked="false" label="Activate automated corrections advanced options" help="autocorrect options are defined below" /> + <section name="options_autocorrect" title="autocorrect options" expanded="False"> + <param name="hom" type="integer" value="5" label="min number of reads for homozygous genotype" help="minimum number of reads required at a stack to call a homozygous genotype (default 5)" /> + <param name="het" type="float" value="0.05" label="homozygote minor minimum allele frequency" help="below this minor allele frequency a stack is called a homozygote, above it (next choice) it is called unknown (default 0.05)" /> + <param name="hetmax" type="float" value="0.1" label="heterozygote minor minimum allele frequency" help=" minimum frequency of minor allele to call a heterozygote (default 0.1)" /> + </section> + <!-- Output options --> + <param name="active_advanced" type="boolean" checked="false" label="Activate advanced options" help="advanced options are defined below" /> + <section name="advanced_options" title="advanced options" expanded="False"> + <conditional name="whitelistselect"> + <param name="advanced_whiteoptions_selector" type="select" label="whitelist advanced options"> + <option value="default" selected="true">Default</option> + <option value="advanced">Advanced</option> + </param> + <when value="default"></when> + <when value="advanced"> + <param name="whitelist" format="txt, tabular" type="data" label="specify a file containing Whitelisted markers to include in the export" /> + </when> + </conditional> + <conditional name="blacklistselect"> + <param name="advanced_blackoptions_selector" type="select" label="blacklist advanced options"> + <option value="default" selected="true">Default</option> + <option value="advanced">Advanced</option> + </param> + <when value="default"></when> + <when value="advanced"> + <param name="blacklist" format="txt, tabular" type="data" label="specify a file containing Blacklisted markers to be excluded from the export" /> + </when> + </conditional> + <param name="minprogeny" type="integer" value="50" label="min number of progeny" help="minimum number of progeny required to print a marker" /> + <param name="mindepth" type="integer" value="1" label="min stack depth" help="specify a minimum stack depth required for individuals at a locus" /> + </section> + <param name="output_compress" type="select" label="Output type" help="please see below for details"> + <option value="default" selected="true">No compression</option> + <option value="total">Compressed all outputs</option> + </param> +</inputs> +<outputs> + + <data format="tabular" name="output" label="results with ${tool.name} on ${on_string}" /> + + <data format="txt" name="additional" label="additional file with ${tool.name}" hidden="true"> + <discover_datasets pattern="__designation__" ext="tabular" directory="galaxy_outputs" visible="true" /> + </data> + + + <!-- additionnal output archives --> + <data format="zip" name="total_output" label="total_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "total"</filter> + </data> + +</outputs> + +<stdio> + <exit_code range="1" level="fatal" description="Error in Stacks Denovo execution" /> +</stdio> +<help> + +.. class:: infomark + +**What it does** + +This program exports a Stacks data set either as a set of observed haplotypes at each locus in the population, or with the haplotypes encoded into genotypes. The -r option allows only loci that exist in a certain number of population individuals to be exported. In a mapping context, raising or lowering this limit is an effective way to control the quality level of markers exported as genuine markers will be found in a large number of progeny. If exporting a set of observed haplotypes in a population, the "min stack depth" option can be used to restict exported loci to those that have a minimum depth of reads. + +By default, when executing the pipeline (either denovo_map or ref_map) the genotypes program will be executed last and will identify mappable markers in the population and export both a set of observed haplotypes and a set of generic genotypes with "min number of progeny" option = 1. + + +Making Corrections + +If enabled with the "make automated corrections to the data" option, the genotypes program will make automated corrections to the data. Since loci are matched up in the population, the script can correct false-negative heterozygote alleles since it knows the existence of alleles at a particular locus in the other individuals. For example, the program will identify loci with SNPs that didn’t have high enough coverage to be identified by the SNP caller. It will also check that homozygous tags have a minimum depth of coverage, since a low-coverage polymorphic locus may appear homozygous simply because the other allele wasn’t sequenced. + + +Correction Thresholds + +The thresholds for automatic corrections can be modified by using the "automated corrections option" and changing the default values for the "min number of reads for homozygous genotype", "homozygote minor minimum allele frequency" and "heterozygote minor minimum allele frequency" parameters to genotypes. "min number of reads for homozygous genotype" is the minimum number of reads required to consider a stack homozygous (default of 5). The "homozygote minor minimum allele frequency" and "heterozygote minor minimum allele frequency" variables represent fractions. If the ratio of the depth of the the smaller allele to the bigger allele is greater than "heterozygote minor minimum allele frequency" (default of 1/10) a stack is called a het. If the ratio is less than homozygote minor minimum allele frequency (default of 1/20) a stack is called homozygous. If the ratio is in between the two values it is unknown and a genotype will not be assigned. + +Automated corrections made by the program are shown in the output file in capital letters. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +FASTQ, FASTA, zip, tar.gz + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. + 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. + Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Output type:** + +- Output type details:: + + No compression All files will be added in the current history. + Compressed by categories Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history. + Compressed all outputs All files will be compressed in an unique zip archive. Batch files will be added in the current history with the archive. + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_population.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,243 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import tempfile +import shutil +import subprocess +import glob +import argparse +from os.path import basename +import zipfile +import tarfile +import gzip +from galaxy.datatypes.checkers import * +from stacks import * + + +def __main__(): + + # arguments recuperation + + parser = argparse.ArgumentParser() + parser.add_argument('-P') + parser.add_argument('-M') + parser.add_argument('-b') + parser.add_argument('--vcf', action='store_true') + parser.add_argument('--genepop', action='store_true') + parser.add_argument('--structure', action='store_true') + parser.add_argument('-e') + parser.add_argument('--genomic', action='store_true') + parser.add_argument('--fasta', action='store_true') + parser.add_argument('--phase', action='store_true') + parser.add_argument('--beagle', action='store_true') + parser.add_argument('--plink', action='store_true') + parser.add_argument('--phylip', action='store_true') + parser.add_argument('--phylip_var', action='store_true') + parser.add_argument('--write_single_snp', action='store_true') + parser.add_argument('-k', action='store_true') + + # advanced options + parser.add_argument('--advanced_options_activate') + parser.add_argument('-B') + parser.add_argument('-W') + parser.add_argument('-r') + parser.add_argument('-p') + parser.add_argument('-m') + parser.add_argument('-a') + parser.add_argument('-f') + parser.add_argument('--p_value_cutoff') + parser.add_argument('--window_size') + parser.add_argument('--bootstrap') + parser.add_argument('--bootstrap_reps') + + # multifile management + parser.add_argument('--logfile') + + # outputs + parser.add_argument('--ss') + parser.add_argument('--s') + + # optional outputs + parser.add_argument('--ov') + parser.add_argument('--op') + parser.add_argument('--ol') + parser.add_argument('--of') + parser.add_argument('--os') + parser.add_argument('--oe') + parser.add_argument('--om') + parser.add_argument('--og') + + parser.add_argument('--unphased_output') + parser.add_argument('--markers_output') + parser.add_argument('--phase_output') + parser.add_argument('--fst_output') + + options = parser.parse_args() + + # create the working dir + os.mkdir('job_outputs') + os.mkdir('galaxy_outputs') + + os.chdir('job_outputs') + + # STACKS_archive + # check if zipped files are into the tab + extract_compress_files(options.P, os.getcwd()) + + # create the populations command input line + cmd_line=['populations'] + cmd_line.extend(['-b', options.b, '-P', os.getcwd(), '-M', options.M]) + + if options.e: + cmd_line.extend(['-e', options.e, options.genomic]) + + # output options + if options.vcf: + cmd_line.append('--vcf') + if options.genepop: + cmd_line.append('--genepop') + if options.structure: + cmd_line.append('--structure') + if options.fasta: + cmd_line.append('--fasta') + if options.phase: + cmd_line.append('--phase') + if options.beagle: + cmd_line.append('--beagle') + if options.plink: + cmd_line.append('--plink') + if options.phylip: + cmd_line.append('--phylip') + if options.phylip_var and options.phylip: + cmd_line.append('--phylip_var') + if options.write_single_snp and (options.genepop or options.structure): + cmd_line.append('--write_single_snp') + + if options.k: + cmd_line.extend(['-k', '--window_size', options.window_size]) + + if options.advanced_options_activate == 'true': + if options.B: + cmd_line.extend(['-B', options.B]) + if options.W: + cmd_line.extend(['-W', options.W]) + + cmd_line.extend(['-r', options.r]) + cmd_line.extend(['-p', options.p]) + cmd_line.extend(['-m', options.m]) + cmd_line.extend(['-a', options.a]) + + if options.f: + cmd_line.extend(['-f', options.f, '--p_value_cutoff', options.p_value_cutoff]) + if options.bootstrap: + cmd_line.extend(['--bootstrap', options.bootstrap, '--bootstrap_reps', options.bootstrap_reps]) + + print "[CMD]:"+' '.join(cmd_line) + subprocess.call(cmd_line) + + # postprocesses + try: + shutil.copy('batch_1.populations.log', options.logfile) + except: + sys.stderr.write('Error in population execution; Please read the additional output (stdout)\n') + sys.exit(1) + + try: + shutil.move(glob.glob('*.sumstats_summary.tsv')[0], options.ss) + except: + print "No sumstats summary file" + + try: + shutil.move(glob.glob('*.sumstats.tsv')[0], options.s) + except: + print "No sumstats file" + + # move additionnal output files + if options.vcf: + try: + shutil.move(glob.glob('*.vcf')[0], options.ov) + except: + print "No VCF files" + + if options.phylip: + try: + shutil.move(glob.glob('*.phylip')[0], options.op) + shutil.move(glob.glob('*.phylip.log')[0], options.ol) + except: + print "No phylip file" + + if options.fasta: + try: + shutil.move(glob.glob('*.fa')[0], options.of) + except: + print "No fasta files" + + if options.structure: + try: + shutil.move(glob.glob('*.structure.tsv')[0], options.os) + except: + print "No structure file" + + if options.plink : + try: + shutil.move(glob.glob('*.ped')[0], options.oe) + shutil.move(glob.glob('*.map')[0], options.om) + except: + print "No ped and map file" + + if options.genepop : + try: + shutil.move(glob.glob('*.genepop')[0], options.og) + except: + print "No genepop file" + + # copy all files inside tmp_dir into workdir or into an archive.... + list_files = glob.glob('*') + + markerszip = zipfile.ZipFile('markers.zip.temp', 'w', + allowZip64=True) + phasezip = zipfile.ZipFile('phase.zip.temp', 'w', allowZip64=True) + unphasedzip = zipfile.ZipFile('unphased.zip.temp', 'w', + allowZip64=True) + fstzip = zipfile.ZipFile('fst.zip.temp', 'w', allowZip64=True) + + for i in list_files: + # for each type of files + if re.search("\.markers$", i): + markerszip.write(i) + elif re.search("phase\.inp$", i): + phasezip.write(i) + elif re.search("unphased\.bgl$", i): + unphasedzip.write(i) + elif re.search('fst', i): + fstzip.write(i) + else: + # else return original files + if re.search('^batch', os.path.basename(i)) \ + and not re.search("\.tsv$", os.path.basename(i)) \ + or re.search(".*_[0-9]*\.tsv$", os.path.basename(i)): + shutil.move(i, '../galaxy_outputs') + + # close zip files + markerszip.close() + phasezip.close() + unphasedzip.close() + fstzip.close() + + # return archives + shutil.move('fst.zip.temp', options.fst_output) + if options.beagle: + shutil.move('markers.zip.temp', options.markers_output) + shutil.move('unphased.zip.temp', options.unphased_output) + if options.phase: + shutil.move('phase.zip.temp', options.phase_output) + + +if __name__ == '__main__': + __main__() + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_population.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,497 @@ +<tool id="STACKSpopulation" name="STACKS : populations" force_history_refresh="True"> + <description>Run the STACKS populations program</description> + + +<requirements> + <requirement type="package" version="1.18">stacks</requirement> +</requirements> + +<command interpreter="python"> + +STACKS_population.py +-P $STACKS_archive +-b $batchid +-M $popmap + +#if $options_kernel.kernel + -k + --window_size $options_kernel.window +#end if + +#if str( $options_enzyme.options_enzyme_selector ) == "1": + -e $options_enzyme.enzyme + --genomic $options_enzyme.genomic +#end if + +## advanced options +--advanced_options_activate $advanced_options_activate +#if $advanced_options_activate + -r $advanced_options.minperc + -p $advanced_options.minpop + -m $advanced_options.mindepth + -a $advanced_options.minminor + #if str( $advanced_options.correction_select.correction ) != "no_corr": + -f $advanced_options.correction_select.correction + --p_value_cutoff $advanced_options.correction_select.pcutoff + #end if + #if str( $advanced_options.blacklistselect.advanced_blackoptions_selector) == "advanced" + -B $advanced_options.blacklistselect.blacklist + #end if + #if str( $advanced_options.whitelistselect.advanced_whiteoptions_selector) == "advanced" + -W $advanced_options.whitelistselect.whitelist + #end if + #if str( $advanced_options.bootstrapresampling.advanced_bootoptions_selector) == "advanced" + --bootstrap $advanced_options.bootstrapresampling.bootstrap + --bootstrap_reps $advanced_options.bootstrapresampling.bootstrapreps + #end if +#end if + +## output files +--ss $sumstatssum +--s $sumstats +--fst_output $outfst + +## output section +#if $options_output.vcf +--vcf +--ov $outvcf +#end if +#if $options_output.phylip +--phylip +--op $outphylip +#end if +#if $options_output.phylip +--phylip_var +--ol $outphyliplog +#end if +#if $options_output.fasta +--fasta +--of $outfasta +#end if +#if $options_output.structure +--structure +--os $outstructure +#end if +#if $options_output.plink +--plink +--oe $outplinkped +--om=$outplinkmap +#end if +#if $options_output.phase +--phase +--phase_output $outphase +#end if +#if $options_output.beagle +--beagle +--unphased_output $outbeagle +#end if +--markers_output $outmarkers +#if $options_output.genepop +--genepop +--og=$outgenepop +#end if +#if $options_output.write_single_snp +--write_single_snp +#end if +--logfile $output + +</command> + +<inputs> + <param name="STACKS_archive" format="zip,tar.gz" type="data" label="Archive from STACKS pipeline regrouping all outputs" /> + <param name="batchid" type="integer" value="1" label="Batch ID" help="Batch ID to examine when exporting from the catalog" /> + <param name="popmap" type="data" format="tabular,txt" label="Specify a population map" help="specify a population map" /> + <section name="options_output" title="Output options" expanded="False"> + <param name="vcf" type="boolean" checked="false" default="false" label="output results in Variant Call Format (VCF)" /> + <param name="genepop" type="boolean" checked="false" default="false" label="output results in GenePop Format" /> + <param name="structure" type="boolean" checked="false" default="false" label="output results in Structure Format" /> + <param name="fasta" type="boolean" checked="false" default="false" label="output full sequence for each allele, from each sample locus in FASTA format" /> + <param name="phase" type="boolean" checked="false" default="false" label="output genotypes in PHASE/fastPHASE format" /> + <param name="beagle" type="boolean" checked="false" default="false" label="output genotypes in Beagle format" /> + <param name="plink" type="boolean" checked="false" default="false" label="output genotypes in PLINK format" /> + <param name="phylip" type="boolean" checked="false" default="false" label="output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction" /> + <param name="phylip_var" type="boolean" checked="false" default="false" label="include variable sites in the phylip output" /> + <param name="write_single_snp" type="boolean" checked="false" default="false" label="write only the first SNP per locus in Genepop and Structure outputs" /> + </section> + <section name="options_kernel" title="Kernel options" expanded="False"> + <param name="kernel" type="boolean" checked="false" default="false" label="enable kernel-smoothed FIS, π, and FST calculations" /> + <param name="window" type="integer" value="150" label="window size" help="distance over which to average values (sigma, default 150Kb)" /> + </section> + + <conditional name="options_enzyme"> + <param name="options_enzyme_selector" type="select" label="Did you want to use the genomic output option?"> + <option value="1">Yes</option> + <option value="2" selected="true">No</option> + </param> + <when value="1"> + <param name="enzyme" type="select" format="text" label="provide the restriction enzyme used" help="required if generating genomic output" > + <option value="apeKI">apeKI</option> + <option value="bamHI">bamHI</option> + <option value="claI">claI</option> + <option value="dpnII">dpnII</option> + <option value="eaeI">eaeI</option> + <option value="ecoRI">ecoRI</option> + <option value="ecoT22I">ecoT22I</option> + <option value="hindIII">hindIII</option> + <option value="mluCI">mluCI</option> + <option value="mseI">mseI</option> + <option value="mspI">mspI</option> + <option value="ndeI">ndeI</option> + <option value="nlaIII">nlaIII</option> + <option value="notI">notI</option> + <option value="nsiI">nsiI</option> + <option value="pstI">pstI</option> + <option value="sau3AI">sau3AI</option> + <option value="sbfI">sbfI</option> + <option value="sexAI">sexAI</option> + <option value="sgrAI">sgrAI</option> + <option value="sphI">sphI</option> + <option value="taqI">taqI</option> + <option value="xbaI">xbaI</option> + </param> + <param name="genomic" type="boolean" checked="false" default="false" label="output each nucleotide position (fixed or polymorphic) in all population members to a file" /> + </when> + <when value="2"> + </when> + </conditional> + <param name="advanced_options_activate" type="boolean" label="Activate advanced options" help="advanced options are defined below" /> + <section name="advanced_options" title="Advanced options"> + <conditional name="whitelistselect"> + <param name="advanced_whiteoptions_selector" type="select" label="whitelist advanced options"> + <option value="default" selected="true">Default</option> + <option value="advanced">Advanced</option> + </param> + <when value="default"></when> + <when value="advanced"> + <param name="whitelist" format="txt, tabular" type="data" label="specify a file containing Whitelisted markers to include in the export" /> + </when> + </conditional> + <conditional name="blacklistselect"> + <param name="advanced_blackoptions_selector" type="select" label="blacklist advanced options"> + <option value="default" selected="true">Default</option> + <option value="advanced">Advanced</option> + </param> + <when value="default"></when> + <when value="advanced"> + <param name="blacklist" format="txt, tabular" type="data" label="specify a file containing Blacklisted markers to be excluded from the export" /> + </when> + </conditional> + <param name="minperc" type="float" value="0.5" min="0" max="1" label="min percentage of individuals by population" help="minimum percentage of individuals in a population required to process a locus for that population" /> + <param name="minpop" type="integer" value="2" label="min number of populations" help="minimum number of populations a locus must be present in to process a locus" /> + <param name="mindepth" type="integer" value="1" label="min stack depth" help="specify a minimum stack depth required for individuals at a locus" /> + <param name="minminor" type="float" value="0.25" label="min minor allele frequency" help="specify a minimum minor allele frequency required before calculating Fst at a locus (between 0 and 0.5)" /> + <conditional name="correction_select"> + <param name="correction" type="select" format="text" label="Correction type" help="specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'" > + <option value="no_corr">No correction</option> + <option value="p_value">p_value</option> + <option value="bonferroni_win">bonferroni_win</option> + <option value="bonferroni_gen">bonferroni_gen</option> + </param> + <when value="no_corr"></when> + <when value="p_value"> + <param name="pcutoff" type="float" value="0.05" label="p-value" help="required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction" /> + </when> + <when value="bonferroni_win"> + <param name="pcutoff" type="float" value="0.05" label="p-value" help="required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction" /> + </when> + <when value="bonferroni_gen"> + <param name="pcutoff" type="float" value="0.05" label="p-value" help="required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction" /> + </when> + </conditional> + <conditional name="bootstrapresampling"> + <param name="advanced_bootoptions_selector" type="select" label="bootstrap resampling advanced options"> + <option value="default" selected="true">Default</option> + <option value="advanced">Advanced</option> + </param> + <when value="default"></when> + <when value="advanced"> + <param name="bootstrap" type="select" format="text" label="Bootstrap resampling" help="enable bootstrap resampling for population statistics (reference genome required)" > + <option value="exact">exact</option> + <option value="approx">approx</option> + </param> + <param name="bootstrapreps" type="integer" value="100" label="number of resampling" help="number of bootstrap resamplings to calculate" /> + </when> + </conditional> + </section> +</inputs> +<outputs> + <data format="txt" name="output" label="result.log with ${tool.name} on ${on_string}" /> + <data format="txt" name="additional" label="additional file with ${tool.name}" hidden="true"> + <discover_datasets pattern="__designation_and_ext__" directory="galaxy_outputs" visible="true" /> + </data> + + <data format="tabular" name="sumstatssum" label="sumstats_summary.tsv with ${tool.name} on ${on_string}" /> + <data format="tabular" name="sumstats" label="sumstats.tsv with ${tool.name} on ${on_string}" /> + <data format="zip" name="outfst" label="fst.zip with ${tool.name} on ${on_string}" /> + + <data format="vcf" name="outvcf" label="vcf file with ${tool.name} on ${on_string}"> + <filter>options_output['vcf']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="phylip" name="outphylip" label="phylip file with ${tool.name} on ${on_string}"> + <filter>options_output['phylip']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="txt" name="outphyliplog" label="phylip.log file with ${tool.name} on ${on_string}"> + <filter>options_output['phylip']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="txt" name="outunphasedlog" label="unphased.log file with ${tool.name} on ${on_string}"> + <filter>options_output['beagle']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="fasta" name="outfasta" label="fasta file with ${tool.name} on ${on_string}"> + <filter>options_output['fasta']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="tabular" name="outstructure" label="structure file with ${tool.name} on ${on_string}"> + <filter>options_output['structure']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="txt" name="outplinkped" label="plink.bed file with ${tool.name} on ${on_string}"> + <filter>options_output['plink']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="txt" name="outplinkmap" label="plink.map file with ${tool.name} on ${on_string}"> + <filter>options_output['plink']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="txt" name="outgenepop" label="genepop file with ${tool.name} on ${on_string}"> + <filter>options_output['genepop']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="zip" name="outphase" label="phased.zip PHASE/fastPHASE genotype files with ${tool.name} on ${on_string}"> + <filter>options_output['phase']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="zip" name="outbeagle" label="unphase.zip Beagle genotype files with ${tool.name} on ${on_string}"> + <filter>options_output['beagle']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + <data format="zip" name="outmarkers" label="markers.zip Genotype files with ${tool.name} on ${on_string}"> + <filter>options_output['beagle']</filter> + <filter>options_output['options_output_selector'] == '1' </filter> + </data> + + +</outputs> + +<stdio> + <exit_code range="1" level="fatal" description="Error in Stacks population execution" /> +</stdio> + +<help> + +.. class:: infomark + +**What it does** + +This program will be executed in place of the genotypes program when a population is being processed through the pipeline. A map specifiying which individuals belong to which population is submitted to the program and the program will then calculate population genetics statistics, expected/observed heterzygosity, π, and FIS at each nucleotide position. The populations program will compare all populations pairwise to compute FST. If a set of data is reference aligned, then a kernel-smoothed FST will also be calculated. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +FASTQ, FASTA, zip, tar.gz + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. + 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. + Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Output type:** + +- Output type details:: + + No compression All files will be added in the current history. + Compressed by categories Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history. + Compressed all outputs All files will be compressed in an unique zip archive. Batch files will be added in the current history with the archive. + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_prepare_population_map.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +import sys, re +import os +import tempfile +import shutil, subprocess, glob +import optparse +from os.path import basename +import zipfile, tarfile, gzip +from galaxy.datatypes.checkers import * +from stacks import * + +""" + +Created by Cyril Monjeaud +Cyril.Monjeaud@irisa.fr + +Last modifications : 01/10/2014 + +WARNING : + +STACKS_denovomap.py needs: + +- STACKS scripts in your $PATH + +These scripts are available after compiling the sources of STACKS : + +http://creskolab.uoregon.edu/stacks/ + +or with the galaxy_stacks package in the Genouest toolshed (http://toolshed.genouest.org) + +""" +def __main__(): + + # arguments recuperation + parser = optparse.OptionParser() + parser.add_option("-f") + parser.add_option("-s") + parser.add_option("-t") + parser.add_option("-o") + parser.add_option("-d") + (options, args) = parser.parse_args() + + # create the working dir + tmp_dir = tempfile.mkdtemp(dir=options.d) + + print tmp_dir + #os.chdir(tmp_dir) + + # parse config files + tab_fq_files=galaxy_config_to_tabfiles_for_STACKS(options.f) + + # check if zipped files are into the tab and change tab content + extract_compress_files_from_tabfiles(tab_fq_files, tmp_dir) + + # generate population map for denovo map + if not options.s: + generate_popmap_for_denovo(tab_fq_files, options.t, options.o) + else: + # parse config files + tab_sam_files=galaxy_config_to_tabfiles_for_STACKS(options.s) + extract_compress_files_from_tabfiles(tab_sam_files, tmp_dir) + generate_popmap_for_refmap(tab_fq_files, tab_sam_files, options.t, options.o) + + + #clean up temp files + shutil.rmtree( tmp_dir ) + + + + + + +if __name__ == "__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_prepare_population_map.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,168 @@ +<tool id="STACKSpreparepopmap" name="STACKS : Prepare population map file " > + <description>for STACKS denovomap and refmap</description> + +<configfiles> +<configfile name="fastq_files"> +#for $input in $fastq_file: +${input.display_name}::${input} +#end for +</configfile> +<configfile name="sam_files"> +#if str( $options_target.options_target_selector ) == "refmap": +#for $input in $options_target.sam_file: +${input.display_name}::${input} +#end for +#end if +</configfile> +</configfiles> + +<command interpreter="python"> +STACKS_prepare_population_map.py +-f $fastq_files +#if str( $options_target.options_target_selector ) == "refmap": +-s $sam_files +#end if +-t $info_file +-o $output +-d $__new_file_path__ +</command> + +<inputs> +<conditional name="options_target"> + <param name="options_target_selector" type="select" label="Select your target"> + <option value="denovo" selected="true">STACKS De Novo map</option> + <option value="refmap">STACKS Reference map</option> + </param> + <when value="denovo"> + </when> + <when value="refmap"> + <param name="sam_file" format="sam,zip,tar.gz" type="data" multiple="true" label="SAM files generated by your alignment" help="SAM/ZIP/TAR.GZ files." /> + </when> + +</conditional> + <param name="fastq_file" format="fastq,fasta,zip,tar.gz" type="data" multiple="true" label="Fastq files generated by STACKS : Process radtags tool" help="FASTQ/FASTA/ZIP/TAR.GZ files." /> + <param name="info_file" format="tabular,txt" type="data" label="File with population information" help="File looks like : barcode TAB population " /> + + +</inputs> +<outputs> + + <data format="tabular" name="output" label="population_map.txt with ${tool.name} on ${on_string}" /> + +</outputs> +<help> + +.. class:: infomark + +**What it does** + +This program will prepare a population map dataset from a 2 columns file containing relation between barcode and population. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +- FASTQ, FASTA, zip, tar.gz + + +- File with population informations: + +This file must have exactly 2 columns, separated by a tab, the first with barcode, second with population name or ID :: + + CGATA pop1 + CGGCG pop1 + GAAGC pop1 + GAGAT pop1 + CGATA pop2 + CGGCG pop2 + GAAGC pop2 + GAGAT pop2 + + +Output file: + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + +WARNING : the file name in the population map output may be different from the history file name. Don't worry about this, it's safe. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_procrad.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,177 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import os +import re +import tempfile +import subprocess +import glob +import shutil +import argparse +from os.path import basename +import zipfile +import tarfile +import gzip +from stacks import * + + +def __main__(): + + # arguments recuperation + + parser = argparse.ArgumentParser() + parser.add_argument('--input_type') + parser.add_argument('--input_enzyme') + parser.add_argument('--input_single') + parser.add_argument('--input_paired1') + parser.add_argument('--input_paired2') + parser.add_argument('--inputype') + parser.add_argument('--sample_name') + parser.add_argument('--barcode') + parser.add_argument('--output_choice') + parser.add_argument('--output_archive') + parser.add_argument('--enzyme1') + parser.add_argument('--enzyme2') + parser.add_argument('--outype') + parser.add_argument('--qualitenc') + parser.add_argument('-D', action='store_true') + parser.add_argument('-t') + parser.add_argument('-q', action='store_true') + parser.add_argument('--activate_advanced_options') + parser.add_argument('-r', action='store_true') + parser.add_argument('-w', default='0.15') + parser.add_argument('-s', default='10') + parser.add_argument('-c', action='store_true') + parser.add_argument('--inline_null', action='store_true') + parser.add_argument('--index_null', action='store_true') + parser.add_argument('--inline_inline', action='store_true') + parser.add_argument('--index_index', action='store_true') + parser.add_argument('--inline_index', action='store_true') + parser.add_argument('--index_inline', action='store_true') + parser.add_argument('--logfile') + options = parser.parse_args() + + # create the working dir + os.mkdir('inputs') + os.mkdir('job_outputs') + os.mkdir('galaxy_outputs') + + cmd_line = [] + cmd_line.append('process_radtags') + cmd_line.extend(['-p', 'inputs']) + cmd_line.extend(['-i', options.inputype]) + cmd_line.extend(['-b', options.barcode]) + + # parse config files and create symlink into the temp dir + + if options.input_type == 'single': + + # load the config file + input_single = options.input_single + + # parse the input_file to extract filenames and filepaths + tab_files = galaxy_config_to_tabfiles(input_single) + + # create symlink into the temp dir + create_symlinks_from_tabfiles(tab_files, 'inputs') + else: + + # load config files + input_paired1 = options.input_paired1 + input_paired2 = options.input_paired2 + + # parse the input_file to extract filenames and filepaths + + tab_files_paired1 = galaxy_config_to_tabfiles(input_paired1) + tab_files_paired2 = galaxy_config_to_tabfiles(input_paired2) + + # create symlinks into the temp dir + + create_symlinks_from_tabfiles(tab_files_paired1, 'inputs') + create_symlinks_from_tabfiles(tab_files_paired2, 'inputs') + + cmd_line.append('-P') + + # test nb enzyme + if options.input_enzyme == '1': + cmd_line.extend(['-e', options.enzyme1]) + + if options.input_enzyme == '2': + cmd_line.extend(['---renz_1', options.enzyme1, '--renz_2', options.enzyme2]) + + # quality + cmd_line.extend(['-E', options.qualitenc]) + + # outputs + cmd_line.extend(['-o', 'job_outputs/']) + cmd_line.extend(['-y', options.outype]) + + # test capture discards + if options.D: + cmd_line.append('-D') + + # optional options + if options.activate_advanced_options == "true": + + if options.q: + cmd_line.append('-q') + if options.r: + cmd_line.append('-r') + + cmd_line.extend(['-w', options.w, '-s', options.s]) + + if options.c: + cmd_line.append('-c') + if options.t != '-1': + cmd_line.extend(['-t', options.t]) + if options.inline_null: + cmd_line.append('--inline_null') + if options.index_null: + cmd_line.append('--index_null') + if options.inline_inline: + cmd_line.append('--inline_inline') + if options.index_index: + cmd_line.append('--index_index') + if options.inline_index: + cmd_line.append('--inline_index') + if options.index_inline: + cmd_line.append('--index_inline') + + print '[CMD_LINE] : ' + ' '.join(cmd_line) + + p = subprocess.call(cmd_line) + + # postprocesses + + try: + shutil.move('job_outputs/process_radtags.log', options.logfile) + except: + sys.stderr.write('Error in process_radtags execution; Please read the additional output (stdout)\n') + sys.exit(1) + + if options.discard_file: + discards_file_name = glob.glob('job_outputs/*.discards')[0] + shutil.move(discards_file_name, options.discard_file) + + # manage outputs names + + change_outputs_procrad_name(os.getcwd() + '/job_outputs', options.sample_name) + + # generate additional output archive file + + if options.output_choice != '1': + generate_additional_archive_file(os.getcwd() + '/job_outputs', options.output_archive) + + # if user has not choose the only zip archive + + if options.output_choice != '3': + list_files = glob.glob('job_outputs/*') + for i in list_files: + shutil.move(i, 'galaxy_outputs') + + +if __name__ == '__main__': + __main__() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_procrad.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,382 @@ +<?xml version="1.0"?> +<tool id="STACKSprocrad" name="STACKS : Process radtags" force_history_refresh="True" version="1.1.0"> +<description>Run the STACKS cleaning script</description> +<configfiles> +<configfile name="input_single"> +#if str( $options_type.options_type_selector ) == "single": +#for $input in $options_type.inputs_single: +${input.display_name}::${input} +#end for +#end if +</configfile> +<configfile name="input_paired1"> +#if str( $options_type.options_type_selector ) == "paired": +#for $input in $options_type.inputs_paired1: +${input.display_name}::${input} +#end for +#end if +</configfile> +<configfile name="input_paired2"> +#if str( $options_type.options_type_selector ) == "paired": +#for $input in $options_type.inputs_paired2: +${input.display_name}::${input} +#end for +#end if +</configfile> +</configfiles> +<requirements> + <requirement type="package" version="1.18">stacks</requirement> + </requirements> +<command interpreter="python"> + +STACKS_procrad.py + --input_type $options_type.options_type_selector + #if str( $options_type.options_type_selector ) == "single": + --input_single $input_single + #else + --input_paired1 $input_paired1 + --input_paired2 $input_paired2 + #end if + --inputype $inputype + --barcode $barcode + --sample_name $sample_name + --output_choice $options_output_infos_selector + #if str( $options_output_infos_selector ) != "1": + --output_archive $output_archive + #end if + --input_enzyme $options_enzyme.options_enzyme_selector + --enzyme1 $options_enzyme.enzyme + #if str( $options_enzyme.options_enzyme_selector ) == "2": + --enzyme2 $options_enzyme.enzyme2 + #end if + --outype $outype + --qualitenc $options_quality.qualitenc + #if $capture: + -D + #end if + --activate_advanced_options $activate_advanced_options + -t $options_advanced.truncate + #if $options_advanced.discard: + -q + #end if + #if $options_advanced.rescue: + -r + #end if + -w $options_advanced.sliding + -s $options_advanced.score + #if $options_advanced.remove: + -c + #end if + #if $options_advanced.inline: + --inline_null + #end if + #if $options_advanced.index: + --index_null + #end if + #if $options_advanced.inlinein: + --inline_inline + #end if + #if $options_advanced.indexind: + --index_index + #end if + #if $options_advanced.inlineind: + --inline_index + #end if + #if $options_advanced.indexin: + --index_inline + #end if + --logfile $output + +</command> + +<inputs> + + <conditional name="options_type"> + <param name="options_type_selector" type="select" label="Single-end or paired-end reads files"> + <option value="single" selected="True">Single-end files</option> + <option value="paired">Paired-end files</option> + </param> + <when value="single"> + <param name="inputs_single" format="fastq,fastq.gz" type="data" multiple="true" label="singles-end reads infile(s)" help="input files" /> + </when> + <when value="paired"> + <param name="inputs_paired1" format="fastq,fastq.gz" type="data" multiple="true" label="paired-end reads infile(s) 1" help="Files must have this syntax : name_R1_001.fastq" /> + <param name="inputs_paired2" format="fastq,fastq.gz" type="data" multiple="true" label="paired-end reads infile(s) 2" help="Files must have this syntax : name_R2_001.fastq" /> + </when> + </conditional> + <param name="inputype" type="select" format="text" label="Inputs format"> + <option value="fastq" selected="True">fastq</option> + <option value="gzfastq">fastq.gz</option> + <option value="bustard">Illumina BUSTARD</option> + </param> + <param name="barcode" type="data" format="tabular,txt" label="Barcode file" help="Barcode file" /> + + <param name="sample_name" type="text" value="sample" label="Sample name" help="Precise the sample name if using several NGS runs" /> + + <conditional name="options_enzyme"> + <param name="options_enzyme_selector" type="select" label="Number of enzymes"> + <option value="1" >One</option> + <option value="2">Two</option> + </param> + <when value="1"> + <param name="enzyme" type="select" format="text" label="Enzyme" help="provide the restriction enzyme used" > + <option value="apeKI">apeKI</option> + <option value="bamHI">bamHI</option> + <option value="claI">claI</option> + <option value="dpnII">dpnII</option> + <option value="eaeI">eaeI</option> + <option value="ecoRI">ecoRI</option> + <option value="ecoT22I">ecoT22I</option> + <option value="hindIII">hindIII</option> + <option value="mluCI">mluCI</option> + <option value="mseI">mseI</option> + <option value="mspI">mspI</option> + <option value="ndeI">ndeI</option> + <option value="nlaIII">nlaIII</option> + <option value="notI">notI</option> + <option value="nsiI">nsiI</option> + <option value="pstI">pstI</option> + <option value="sau3AI">sau3AI</option> + <option value="sbfI">sbfI</option> + <option value="sexAI">sexAI</option> + <option value="sgrAI">sgrAI</option> + <option value="sphI">sphI</option> + <option value="taqI">taqI</option> + <option value="xbaI">xbaI</option> + </param> + </when> + <when value="2"> + <param name="enzyme" type="select" format="text" label="Enzyme" help="provide the restriction enzyme used" > + <option value="apeKI">apeKI</option> + <option value="bamHI">bamHI</option> + <option value="claI">claI</option> + <option value="dpnII">dpnII</option> + <option value="eaeI">eaeI</option> + <option value="ecoRI">ecoRI</option> + <option value="ecoT22I">ecoT22I</option> + <option value="hindIII">hindIII</option> + <option value="mluCI">mluCI</option> + <option value="mseI">mseI</option> + <option value="mspI">mspI</option> + <option value="ndeI">ndeI</option> + <option value="nlaIII">nlaIII</option> + <option value="notI">notI</option> + <option value="nsiI">nsiI</option> + <option value="pstI">pstI</option> + <option value="sau3AI">sau3AI</option> + <option value="sbfI">sbfI</option> + <option value="sexAI">sexAI</option> + <option value="sgrAI">sgrAI</option> + <option value="sphI">sphI</option> + <option value="taqI">taqI</option> + <option value="xbaI">xbaI</option> + </param> + <param name="enzyme2" type="select" format="text" label="Second enzyme" help="provide the second restriction enzyme used" > + <option value="apeKI">apeKI</option> + <option value="bamHI">bamHI</option> + <option value="claI">claI</option> + <option value="dpnII">dpnII</option> + <option value="eaeI">eaeI</option> + <option value="ecoRI">ecoRI</option> + <option value="ecoT22I">ecoT22I</option> + <option value="hindIII">hindIII</option> + <option value="mluCI">mluCI</option> + <option value="mseI">mseI</option> + <option value="mspI">mspI</option> + <option value="ndeI">ndeI</option> + <option value="nlaIII">nlaIII</option> + <option value="notI">notI</option> + <option value="nsiI">nsiI</option> + <option value="pstI">pstI</option> + <option value="sau3AI">sau3AI</option> + <option value="sbfI">sbfI</option> + <option value="sexAI">sexAI</option> + <option value="sgrAI">sgrAI</option> + <option value="sphI">sphI</option> + <option value="taqI">taqI</option> + <option value="xbaI">xbaI</option> + </param> + </when> + </conditional> + <param name="capture" type="boolean" label="Capture discarded reads to a file" /> + <section name="options_quality" title="quality options" expanded="False"> + <param name="qualitenc" type="select" format="text" label="Quality encoded type" help="specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger, default) or 'phred64' (Illumina 1.3 - 1.5)" > + <option value="phred33">phred33</option> + <option value="phred64">phred64</option> + </param> + </section> + <param name="activate_advanced_options" type="boolean" label="Activate advanced options" help="advanced options are defined below" /> + <section name="options_advanced" title="advanced options" expanded="False"> + <param name="sliding" type="float" value="0.15" label="set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15)" /> + <param name="score" type="integer" value="10" label="Set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10)" /> + <param name="remove" type="boolean" checked="false" default="false" label="Clean data, remove any read with an uncalled base" /> + <param name="discard" type="boolean" checked="false" default="false" label="Discard reads with low quality scores"/> + <param name="rescue" type="boolean" checked="false" default="false" label="Rescue barcodes and RAD-Tags?"/> + <param name="truncate" type="integer" value="-1" label="Truncate final read length to this value" help="default = -1" /> + <param name="inline" type="boolean" checked="true" default="true" label="Barcode options -> inline_null" help="barcode is inline with sequence, occurs only on single-end read" /> + <param name="index" type="boolean" checked="false" default="false" label="Barcode options -> index_null" help="barcode is provided in FASTQ header, occurs only on single-end read"/> + <param name="inlinein" type="boolean" checked="false" default="false" label="Barcode options -> inline_inline" help="barcode is inline with sequence, occurs on single and paired-end read" /> + <param name="indexind" type="boolean" checked="false" default="false" label="Barcode options -> index_index" help="barcode is provided in FASTQ header, occurs on single and paired-end read" /> + <param name="inlineind" type="boolean" checked="false" default="false" label="Barcode options -> inline_index" help="barcode is inline with sequence on single-end read, occurs in FASTQ header for paired-end read" /> + <param name="indexin" type="boolean" checked="false" default="false" label="Barcode options -> index_inline" help="barcode occurs in FASTQ header for single-end read, is inline with sequence on paired-end read" /> + </section> + <param name="outype" type="select" format="text" label="Output format" help="output type, either 'fastq' or 'fasta' (default fastq)" > + <option value="fastq">fastq</option> + <option value="fasta">fasta</option> + </param> + + <param name="options_output_infos_selector" type="select" label="Output type"> + <option value="1">Normal (a fastq file by barcode)</option> + <option value="2" selected="True">Additional zip archive with all files (Normal + one archive with all fastq files)</option> + <option value="3">Only a zip archive with all files (one archive with all fastq files)</option> + </param> + +</inputs> +<outputs> + + <data format="txt" name="output" label="results.log with ${tool.name} on ${on_string}: demultiplexed and cleaned reads" /> + <data format="txt" name="additional" label="fast(a/q) file with ${tool.name}" hidden="true"> + <discover_datasets pattern="__designation_and_ext__" directory="galaxy_outputs" visible="true" /> + </data> + <data format="zip" name="output_archive" label="all_files.zip with ${tool.name} on ${on_string}: demultiplexed and cleaned reads "> + <filter>options_output_infos_selector != "1"</filter> + </data> + <data format="fastq" name="discard_file" label="discard.fastq with ${tool.name} on ${on_string}: demultiplexed and cleaned reads "> + <filter>capture</filter> + </data> + + +</outputs> + +<stdio> + <exit_code range="1" level="fatal" description="Error in Stacks Process radtag execution" /> +</stdio> + + +<help> + +.. class:: infomark + +**What it does** + +This program examines raw reads from an Illumina sequencing run and first, checks that the barcode and the RAD cutsite are intact, and demultiplexes the data. If there are +errors in the barcode or the RAD site within a certain allowance process_radtags can correct them. Second, it slides a window down the length of the read and checks the +average quality score within the window. If the score drops below 90% probability of being correct (a raw phred score of 10), the read is discarded. This allows for some +seqeuncing errors while elimating reads where the sequence is degrading as it is being sequenced. By default the sliding window is 15% of the length of the read, but can be +specified on the command line (the threshold and window size can be adjusted). +The process_radtags program can: +handle data that is barcoded, either inline or using an index, or unbarcoded. +use combinatorial barcodes. +check and correct for a restriction enzyme cutsite for single or double-digested +data. +filter adapter sequence while allowing for sequencing error in the adapter pattern. +process individual files or whole directories of files. +directly read gzipped data +filter reads based on Illumina's Chastity filter + +-------- + +**Help** + +Input files: + +- FASTQ, FASTA, zip, tar.gz + +- Barcode File Format + +The barcode file is a very simple format : one barcode per line. + + CGATA + CGGCG + GAAGC + GAGAT + CGATA + CGGCG + GAAGC + GAGAT + +Combinatorial barcodes are specified, one per column, separated by a tab:: + + CGATA ACGTA + CGGCG CGTA + GAAGC CGTA + GAGAT CGTA + CGATA AGCA + CGGCG AGCA + GAAGC AGCA + GAGAT AGCA + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_refmap.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,258 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import tempfile +import shutil +import subprocess +import glob +import optparse +from os.path import basename +import zipfile +import tarfile +import gzip +from galaxy.datatypes.checkers import * +from stacks import * + + +def __main__(): + + # arguments recuperation + + parser = optparse.OptionParser() + parser.add_option('-p') + parser.add_option('-r') + parser.add_option('-s') + parser.add_option('-O') + parser.add_option('-n') + parser.add_option('-m') + parser.add_option('--bound_low') + parser.add_option('--bound_high') + parser.add_option('--alpha') + parser.add_option('--logfile') + parser.add_option('--compress_output') + parser.add_option('--catalogsnps') + parser.add_option('--catalogalleles') + parser.add_option('--catalogtags') + + # additionnal outputs + + parser.add_option('--total_output') + parser.add_option('--tags_output') + parser.add_option('--snps_output') + parser.add_option('--alleles_output') + parser.add_option('--matches_output') + (options, args) = parser.parse_args() + + # create working directories + + os.mkdir('inputs') + os.mkdir('job_outputs') + os.mkdir('galaxy_outputs') + + cmd_line = [] + cmd_line.append('ref_map.pl') + + # if genetic map + + if options.p: + + # parse config files + + tab_parent_files = galaxy_config_to_tabfiles_for_STACKS(options.p) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_parent_files, 'inputs') + + # check files extension (important to have .sam files) + + check_sam_extension_and_add(tab_parent_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_parent_files, 'inputs') + + # create the command input line + + for key in tab_parent_files: + + # if is a file (skip repository created after a decompression) + + if os.path.isfile('inputs/'+key): + cmd_line.extend(['-p', os.path.normpath('inputs/'+key)]) + + # if genetic map with progeny files + + if options.r: + + # parse config files + + tab_progeny_files = galaxy_config_to_tabfiles_for_STACKS(options.r) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_progeny_files, 'inputs') + + # check files extension (important to have .sam files) + + check_sam_extension_and_add(tab_progeny_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_progeny_files, 'inputs') + + for key in tab_progeny_files: + + # if is a file (skip repository created after a decompression) + + if os.path.isfile('inputs/' + key): + cmd_line.extend(['-r', 'inputs/' + key]) + + # parse config files and create symlink if individual files are selected + + if options.s: + + # parse config files + + tab_individual_files = galaxy_config_to_tabfiles_for_STACKS(options.s) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_individual_files, 'inputs') + + # check files extension (important to have .sam files) + + check_sam_extension_and_add(tab_individual_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_individual_files, 'inputs') + + # create the command input line + + for key in tab_individual_files: + cmd_line.extend(['-s', 'inputs/' + key]) + + # create the options command line + + cmd_line.extend([ + '-S', + '-b', '1', + '-T', '4', + '-o', 'job_outputs', + '-n', options.n, + '-m', options.m, + ]) + + if options.O: + cmd_line.extend(['-O', options.O]) + + if options.bound_low: + cmd_line.extend(['--bound_low', options.bound_low]) + + if options.bound_high: + cmd_line.extend(['--bound_high', options.bound_high]) + + if options.alpha: + cmd_line.extend(['--alpha', options.alpha]) + + # execute job + + print '[COMMAND LINE]' + ' '.join(cmd_line) + + p = subprocess.Popen(cmd_line, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + (stdoutput, stderror) = p.communicate() + + print stdoutput + print stderror + + # postprocesses + + try: + shutil.move('job_outputs/ref_map.log', options.logfile) + except: + sys.stderr.write('Error in ref_map execution; Please read the additional output (stdout)\n') + + # go inside the outputs dir + + os.chdir('job_outputs') + + # move files + + for i in glob.glob('*'): + if re.search('catalog.snps.tsv$', i): + shutil.copy(i, options.catalogsnps) + if re.search('catalog.alleles.tsv$', i): + shutil.copy(i, options.catalogalleles) + if re.search('catalog.tags.tsv$', i): + shutil.copy(i, options.catalogtags) + + # copy all files inside tmp_dir into workdir + + list_files = glob.glob('*') + + # if compress output is total + + if options.compress_output == 'total': + + mytotalzipfile = zipfile.ZipFile('total.zip.temp', 'w', + allowZip64=True) + + for i in list_files: + + mytotalzipfile.write(os.path.basename(i)) + + # return the unique archive + + shutil.move('total.zip.temp', options.total_output) + elif options.compress_output == 'categories': + + # if compress output is by categories + + mytagszip = zipfile.ZipFile('tags.zip.temp', 'w', allowZip64=True) + mysnpszip = zipfile.ZipFile('snps.zip.temp', 'w', allowZip64=True) + myalleleszip = zipfile.ZipFile('alleles.zip.temp', 'w', allowZip64=True) + mymatcheszip = zipfile.ZipFile('matches.zip.temp', 'w', allowZip64=True) + + for i in list_files: + + # for each type of files + + if re.search("tags\.tsv$", i) and not re.search('batch', i): + mytagszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("snps\.tsv$", i) and not re.search('batch', i): + mysnpszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("alleles\.tsv$", i) and not re.search('batch', i): + myalleleszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("matches\.tsv$", i) and not re.search('batch', i): + mymatcheszip.write(os.path.basename(i)) + os.remove(i) + else: + shutil.move(os.path.basename(i), '../galaxy_outputs') + + # return archives.... + + shutil.move('tags.zip.temp', options.tags_output) + shutil.move('snps.zip.temp', options.snps_output) + shutil.move('alleles.zip.temp', options.alleles_output) + shutil.move('matches.zip.temp', options.matches_output) + else: + + # else no compression + + for i in list_files: + shutil.move(os.path.basename(i), '../galaxy_outputs') + + +if __name__ == '__main__': + __main__() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_refmap.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,347 @@ +<tool id="STACKSrefmap" name="STACKS : Reference map" force_history_refresh="True"> + <description>Run the STACKS ref_map.pl wrapper</description> + +<configfiles> +<configfile name="parent_sequences"> +#if str( $options_usage.options_usage_selector ) == "genetic" +#for $input in $options_usage.parent_sequence: +${input.display_name}::${input} +#end for +#end if +</configfile> +<configfile name="progeny_sequences"> +#if str( $options_usage.options_usage_selector ) == "genetic" and str( $options_usage.options_progeny.options_progeny_selector ) == "yes" +#for $input in $options_usage.options_progeny.progeny_sequence: +${input.display_name}::${input} +#end for +#end if +</configfile> +<configfile name="individual_samples"> +#if str( $options_usage.options_usage_selector ) == "population" +#for $input in $options_usage.individual_sample: +${input.display_name}::${input} +#end for +#end if +</configfile> +</configfiles> +<requirements> + <requirement type="package" version="1.18">stacks</requirement> +</requirements> + +<command interpreter="python"> +STACKS_refmap.py +#if str( $options_usage.options_usage_selector ) == "genetic" + -p $parent_sequences + #if str( $options_usage.options_progeny.options_progeny_selector ) == "yes" + -r $progeny_sequences + #end if +#else + -s $individual_samples + #if str( $options_usage.options_popmap.popmap_selector) == "yes" + -O $options_usage.options_popmap.popmap + #end if +#end if +-n $mismatchbetlocibuild +-m $mincov +--bound_low $snp_options.boundlow +--bound_high $snp_options.boundhigh +--alpha $snp_options.alpha +--catalogsnps $catalogsnps +--catalogalleles $catalogalleles +--catalogtags $catalogtags +--logfile $output +--compress_output $output_compress +##additionnal outputs +--total_output $total_output +--tags_output $tags_output +--snps_output $snps_output +--alleles_output $alleles_output +--matches_output $matches_output + +</command> + +<inputs> + <conditional name="options_usage"> + <param name="options_usage_selector" type="select" label="Select your usage"> + <option value="genetic" selected="true">Genetic map</option> + <option value="population">Population</option> + </param> + <when value="genetic"> + <param name="parent_sequence" format="sam,zip,tar.gz" type="data" multiple="true" label="Files containing parent sequences" help="SAM/ZIP/TAR.GZ files" /> + <conditional name="options_progeny"> + <param name="options_progeny_selector" type="select" label="Use progeny files"> + <option value="yes" selected="true">Yes</option> + <option value="no">No</option> + </param> + <when value="yes"> + <param name="progeny_sequence" format="sam,zip,tar.gz" type="data" multiple="true" label="Files containing progeny sequences" help="SAM/ZIP/TAR.GZ files containing progeny sequences from a mapping cross" /> + </when> + <when value="no"> + </when> + </conditional> + + </when> + <when value="population"> + <param name="individual_sample" format="sam,zip,tar.gz" type="data" multiple="true" label="Files containing an individual sample from a population" help="SAM/ZIP/TAR.GZ files." /> + <conditional name="options_popmap"> + <param name="popmap_selector" type="select" label="Analyzing one or more populations?" > + <option value="no" selected="true">No</option> + <option value="yes">Yes</option> + </param> + <when value="no"></when> + <when value="yes"> + <param name="popmap" type="data" format="tabular,txt" label="Specify a population map" help="If analyzing one or more populations, specify a population map" /> + </when> + </conditional> + </when> + </conditional> + + <param name="mismatchbetlocibuild" type="integer" value="0" label="specify the number of mismatches allowed between loci when building the catalog" /> + <param name="mincov" type="integer" value="1" label="Minimum depth of coverage" help="specify the minimum depth of coverage to report a stack in pstacks" /> + <!-- SNP Model options --> + <section name="snp_options" title="SNP_Model_Options" expanded="False"> + <param name="boundlow" type="float" value="0.0" min="0.0" max="1.0" label="lower bound for epsilon, the error rate" help="between 0 and 1.0"/> + <param name="boundhigh" type="float" value="1.0" min="0.0" max="1.0" label="upper bound for epsilon, the error rate" help="between 0 and 1.0" /> + <param name="alpha" type="float" value="0.05" min="0.001" max="0.1" label="chi square significance level required to call a heterozygote or homozygote" help="either 0.1, 0.05 (default), 0.01, or 0.001" /> + </section> + <!-- Output options --> + <param name="output_compress" type="select" label="Output type" help="please see below for details"> + <option value="default" selected="true">No compression</option> + <option value="categories">Compressed by categories</option> + <option value="total">Compressed all outputs</option> + </param> +</inputs> +<outputs> + + <data format="txt" name="output" label="result.log with ${tool.name} on ${on_string}" /> + <data format="txt" name="additional" label="additional file with ${tool.name}" hidden="true"> + <discover_datasets pattern="__designation_and_ext__" directory="galaxy_outputs" visible="true" /> + </data> + + <data format="tabular" name="catalogsnps" label="catalog.snps with ${tool.name} on ${on_string}" /> + <data format="tabular" name="catalogalleles" label="catalog.alleles with ${tool.name} on ${on_string}" /> + <data format="tabular" name="catalogtags" label="catalog.tags with ${tool.name} on ${on_string}" /> + + <!-- additionnal output archives --> + <data format="zip" name="total_output" label="total_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "total"</filter> + </data> + <data format="zip" name="tags_output" label="tags_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + <data format="zip" name="snps_output" label="snps_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + <data format="zip" name="alleles_output" label="alleles_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> + <data format="zip" name="matches_output" label="matches_output.zip with ${tool.name} on ${on_string}" > + <filter>output_compress == "categories"</filter> + </data> +</outputs> +<help> + +.. class:: infomark + +**What it does** + +This program expects data that have been aligned to a reference genome, and can accept data directly from Bowtie, or from any aligner that can produce SAM format. To avoid datasets names problems, we recommand the use of the *Map with BWA for STACKS tool*. This program will execute each of the Stacks components: first, running pstacks on each of the samples specified, building loci (based on the reference alignment) and calling SNPs in each. Second, cstacks will be run to create a catalog of all loci specified as 'parents' or 'samples' on the command line, again using alignment to match loci in the catalog. Finally, sstacks will be executed to match each sample against the catalog. The ref_map.pl program will also load the results of each stage of the analysis: individual loci, the catalog, and matches against the catalog into the database (although this can be disabled). After matching the program will build a database index to speed up access (index_radtags.pl) and enable web-based filtering. + + +-------- + + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +- SAM, zip, tar.gz + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. + 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. + Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_sort_read_pairs.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,109 @@ +#!/usr/bin/env python + +import sys, re +import os +import tempfile +import shutil, subprocess, glob +import optparse +from os.path import basename +import zipfile, tarfile, gzip +from galaxy.datatypes.checkers import * +from stacks import * + +""" + +Created by Yvan Le Bras +yvan.le_bras@irisa.fr + +Last modifications : 02/17/2014 + +WARNING : + +STACKS_sort_read_pairs.py needs: + +- STACKS scripts in your $PATH + +These scripts are available after compiling the sources of STACKS : + +http://creskolab.uoregon.edu/stacks/ + +or with the galaxy_stacks package in the Genouest toolshed + + +""" + +def __main__(): + + + # create the working dir + os.mkdir("sort_read_outputs") + os.mkdir("assembly_outputs") + os.mkdir("samples_inputs") + os.mkdir("stacks_inputs") + + # arguments recuperation + parser = optparse.OptionParser() + parser.add_option("-a") + parser.add_option("-e") + parser.add_option("-b") + parser.add_option("-c") + parser.add_option("-d") + parser.add_option("-o") + (options, args) = parser.parse_args() + + # edit the command line + cmd_line1 = ["sort_read_pairs.pl"] + + #parse config files and create symlink if individual files are selected + + # STACKS_archive + # check if zipped files are into the tab + extract_compress_files(options.a, os.getcwd()+"/stacks_inputs") + + # samples_archive + # check if zipped files are into the tab and change tab content + extract_compress_files(options.e, os.getcwd()+"/samples_inputs") + + # create the sort_read_pairs command input line + cmd_line1.extend(["-p", "stacks_inputs", "-s", "samples_inputs", "-o", "sort_read_outputs"]) + + if options.b: + cmd_line1.extend(["-w", options.b]) + if options.c: + cmd_line1.extend(["-r", options.c]) + + # exec command line 1 + p1 = subprocess.Popen(cmd_line1) + p1.communicate() + + # parse all files list and remove empty files from the output dir + for fasta_file in glob.glob("sort_read_outputs/*"): + if os.stat(fasta_file).st_size==0: + print "File "+fasta_file+" is empty" + os.remove(fasta_file) + + + # create the exec_velvet command input line + cmd_line2 = ["exec_velvet.pl"] + cmd_line2.extend(["-s", "sort_read_outputs", "-o", "assembly_outputs"]) + cmd_line2.append("-c") + + if options.d: + cmd_line2.extend(["-M", options.d]) + + # version + #cmd = 'sort_read_pairs.pl'+cmd_files+" "+cmd_options+" 2>&1" + #cmd2 = 'exec_velvet.pl'+cmd_files2+" -c -e /softs/local/velvet/velvet_1.2.03/ "+cmd_options2+" 2>&1" + + # launch the command line 2 + p2 = subprocess.Popen(cmd_line2) + p2.communicate() + + # get collated.fa file + try: + shutil.copy("assembly_outputs/collated.fa", options.o) + except: + print "No result file" + sys.exit(1) + +if __name__ == "__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_sort_read_pairs.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,164 @@ +<tool id="STACKSassembleperead" name="STACKS : assemble read pairs by locus" version="1.1.1"> + <description>Run the STACKS sort_read_pairs.pl and exec_velvet.pl wrappers</description> + +<requirements> + <requirement type="package" version="1.18">stacks</requirement> + <requirement type="package" version="1.2.10">velvet</requirement> + </requirements> +<command interpreter="python"> + +STACKS_sort_read_pairs.py +-a $STACKS_archive +-e $samples_archive +#if str( $options_whitelist.whitelist_selector) == "yes" +-b $whitelist +#end if +#if str( $options_filter.reads_selector) == "yes" +-c $options_filter.threshold +#end if +#if str( $options_filter2.length_selector) == "yes" +-d $options_filter2.threshold2 +#end if +-o $output + + +</command> + +<inputs> +<param name="STACKS_archive" format="zip,tar.gz" type="data" label="Archive from STACKS pipeline regrouping all outputs" /> + +<param name="samples_archive" format="zip,fastq.gz,tar.gz,tar.bz2" type="data" label="Archive with raw reads used to execute previous STACKS pipeline" /> + + <conditional name="options_whitelist"> +<param name="whitelist_selector" type="select" label="Have you got a whitelist?" > +<option value="no" selected="true">No</option> +<option value="yes">Yes</option> +</param> +<when value="no"></when> +<when value="yes"> +<param name="whitelist" format="txt, tabular" type="data" label="Whitelist file containing loci that we want to assemble: those that have SNPs" /> +</when> +</conditional> + + +<conditional name="options_filter"> +<param name="reads_selector" type="select" label="Specify a treshold for the minimum number of reads by locus?" > +<option value="no" selected="true">No</option> +<option value="yes">Yes</option> +</param> +<when value="no"></when> +<when value="yes"> +<param name="threshold" type="integer" value="10" label="Minimum number of reads by locus"/> +</when> +</conditional> +<conditional name="options_filter2"> +<param name="length_selector" type="select" label="Specify a minimum length for asssembled contigs?" > +<option value="no" selected="true">No</option> +<option value="yes">Yes</option> +</param> +<when value="no"></when> +<when value="yes"> +<param name="threshold2" type="integer" value="200" label="Minimum length for asssembled contigs"/> +</when> +</conditional> + +</inputs> +<outputs> + <data format="fasta" name="output" label="collated.fa : ${tool.name} on ${on_string}" /> +</outputs> +<stdio> + <exit_code range="1" level="fatal" description="Error" /> +</stdio> +<help> + +.. class:: infomark + +**What it does** + +This program will run each of the Stacks sort_read_pairs.pl and exec_velvet.pl utilities to assemble pair-end reads from STACKS pipeline results + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input file: + +Output archives of STACKS : Reference map or STACKS : De novo map, in zip or tar.gz format + + +Output file: + +A collated.fa file containing assembled contigs for each locus + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + +</help> +<citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_index.loc.sample Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,38 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of BWA indexed sequences data files. You will need +#to create these data files and then create a bwa_index.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bwa_index.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_path> +# +#So, for example, if you had phiX indexed stored in +#/depot/data2/galaxy/phiX/base/, +#then the bwa_index.loc entry would look like this: +# +#phiX174 phiX phiX Pretty /depot/data2/galaxy/phiX/base/phiX.fa +# +#and your /depot/data2/galaxy/phiX/base/ directory +#would contain phiX.fa.* files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 phiX.fa.amb +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 phiX.fa.ann +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 phiX.fa.bwt +#...etc... +# +#Your bwa_index.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#phiX174 phiX phiX174 /depot/data2/galaxy/phiX/base/phiX.fa +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/hg18/base/hg18canon.fa +#hg18full hg18 hg18 Full /depot/data2/galaxy/hg18/base/hg18full.fa +#/orig/path/hg19.fa hg19 hg19 /depot/data2/galaxy/hg19/base/hg19.fa +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_wrapper.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,358 @@ +#!/usr/bin/env python + +""" +Runs BWA on single-end or paired-end data. +Produces a SAM file containing the mappings. +Works with BWA version 0.5.9. + +usage: bwa_wrapper.py [options] + +See below for options +""" + +import optparse, os, shutil, subprocess, sys, tempfile +import glob +import gzip, zipfile, tarfile + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def check_is_double_encoded( fastq ): + # check that first read is bases, not one base followed by numbers + bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ] + nums = [ '0', '1', '2', '3' ] + for line in file( fastq, 'rb'): + if not line.strip() or line.startswith( '@' ): + continue + if len( [ b for b in line.strip() if b in nums ] ) > 0: + return False + elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ): + return True + else: + raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space' + raise Exception, 'There is no non-comment and non-blank line in your FASTQ file' + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' ) + parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' ) + parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' ) + parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' ) + parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' ) + parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' ) + parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' ) + parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' ) + parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' ) + parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' ) + parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' ) + parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' ) + parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' ) + parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' ) + parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' ) + parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' ) + parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' ) + parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' ) + parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', default=None, help='Proceed with suboptimal alignments even if the top hit is a repeat' ) + parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' ) + parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' ) + parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' ) + parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' ) + parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' ) + parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' ) + parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' ) + parser.add_option( '', '--rgds', dest='rgds', help='Description' ) + parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' ) + parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' ) + parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' ) + parser.add_option( '', '--rglb', dest='rglb', help='Library name' ) + parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' ) + parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' ) + parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' ) + parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' ) + parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' ) + parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' ) + parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" ) + parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' ) + parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' ) + (options, args) = parser.parse_args() + + tmp_input_dir = tempfile.mkdtemp() + tmp_output_dir= tempfile.mkdtemp() + + + myarchive = zipfile.ZipFile(options.fastq, 'r', allowZip64=True) + myarchive.extractall(tmp_input_dir) + + for fastq in glob.glob(tmp_input_dir+'/*'): + + sam_output_file=tmp_output_dir+'/'+os.path.splitext(os.path.basename(fastq))[0]+'.sam' + create_sam=open(sam_output_file, "w") + create_sam.close() + + # output version # of tool + try: + tmp = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp, 'wb' ) + proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout ) + tmp_stdout.close() + returncode = proc.wait() + stdout = None + for line in open( tmp_stdout.name, 'rb' ): + if line.lower().find( 'version' ) >= 0: + stdout = line.strip() + break + if stdout: + sys.stdout.write( 'BWA %s\n' % stdout ) + else: + raise Exception + except: + sys.stdout.write( 'Could not determine BWA version\n' ) + + # check for color space fastq that's not double-encoded and exit if appropriate + if options.color_space: + if not check_is_double_encoded( options.fastq ): + stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' ) + #if options.genAlignType == 'paired': + #if not check_is_double_encoded( options.rfastq ): + #stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' ) + + #fastq = options.fastq + #if options.rfastq: + #rfastq = options.rfastq + + # set color space variable + if options.color_space: + color_space = '-c' + else: + color_space = '' + + # make temp directory for placement of indices + tmp_index_dir = tempfile.mkdtemp() + tmp_dir = tempfile.mkdtemp() + # index if necessary + if options.fileSource == 'history' and not options.do_not_build_index: + ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) + ref_file_name = ref_file.name + ref_file.close() + os.symlink( options.ref, ref_file_name ) + # determine which indexing algorithm to use, based on size + try: + size = os.stat( options.ref ).st_size + if size <= 2**30: + indexingAlg = 'is' + else: + indexingAlg = 'bwtsw' + except: + indexingAlg = 'is' + indexing_cmds = '%s -a %s' % ( color_space, indexingAlg ) + cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name ) + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + # clean up temp dirs + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + stop_err( 'Error indexing reference sequence. ' + str( e ) ) + else: + ref_file_name = options.ref + if options.illumina13qual: + illumina_quals = "-I" + else: + illumina_quals = "" + + # set up aligning and generate aligning command options + if options.params == 'pre_set': + aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals ) + gen_alignment_cmds = '' + else: + if options.maxEditDist != '0': + editDist = options.maxEditDist + else: + editDist = options.fracMissingAligns + if options.seed != '-1': + seed = '-l %s' % options.seed + else: + seed = '' + if options.suboptAlign: + suboptAlign = '-R "%s"' % ( options.suboptAlign ) + else: + suboptAlign = '' + if options.noIterSearch == 'true': + noIterSearch = '-N' + else: + noIterSearch = '' + aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \ + ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel, + options.disallowIndel, seed, options.maxEditDistSeed, options.threads, + options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty, + suboptAlign, noIterSearch, color_space, illumina_quals ) + #if options.genAlignType == 'paired': + #gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing ) + #if options.outputTopNDisc: + #gen_alignment_cmds += ' -N %s' % options.outputTopNDisc + + gen_alignment_cmds = '' + if options.rgid: + if not options.rglb or not options.rgpl or not options.rgsm: + stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' ) + readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm ) + if options.rgcn: + readGroup += '\tCN:%s' % options.rgcn + if options.rgds: + readGroup += '\tDS:%s' % options.rgds + if options.rgdt: + readGroup += '\tDT:%s' % options.rgdt + if options.rgfo: + readGroup += '\tFO:%s' % options.rgfo + if options.rgks: + readGroup += '\tKS:%s' % options.rgks + if options.rgpg: + readGroup += '\tPG:%s' % options.rgpg + if options.rgpi: + readGroup += '\tPI:%s' % options.rgpi + if options.rgpu: + readGroup += '\tPU:%s' % options.rgpu + gen_alignment_cmds += ' -r "%s"' % readGroup + if options.outputTopN: + gen_alignment_cmds += ' -n %s' % options.outputTopN + # set up output files + tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmp_align_out_name = tmp_align_out.name + tmp_align_out.close() + tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmp_align_out2_name = tmp_align_out2.name + tmp_align_out2.close() + # prepare actual aligning and generate aligning commands + cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name ) + cmd2b = '' + cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, sam_output_file ) + # perform alignments + buffsize = 1048576 + try: + # need to nest try-except in try-finally to handle 2.4 + try: + # align + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error aligning sequence. ' + str( e ) + # and again if paired data + try: + if cmd2b: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error aligning second sequence. ' + str( e ) + # generate align + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error generating alignments. ' + str( e ) + # remove header if necessary + if options.suppressHeader == 'true': + tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) + tmp_out_name = tmp_out.name + tmp_out.close() + try: + shutil.move( sam_output_file, tmp_out_name ) + except Exception, e: + raise Exception, 'Error moving output file before removing headers. ' + str( e ) + fout = file( sam_output_file, 'w' ) + for line in file( tmp_out.name, 'r' ): + if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ): + fout.write( line ) + fout.close() + # check that there are results in the output file + if os.path.getsize( sam_output_file ) > 0: + sys.stdout.write( 'BWA run on single-end data') + else: + raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' + except Exception, e: + stop_err( 'The alignment failed.\n' + str( e ) ) + finally: + # clean up temp dir + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + + # put all in an archive + mytotalzipfile=zipfile.ZipFile(options.output, 'w', allowZip64=True) + os.chdir(tmp_output_dir) + for samfile in glob.glob(tmp_output_dir+'/*'): + mytotalzipfile.write(os.path.basename(samfile)) + + +if __name__=="__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_wrapper.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,339 @@ +<tool id="bwa_wrapper_stacks" name="Map with BWA for STACKS" version="1.2.3"> + <description>from zip file with fastqsanger files</description> + <requirements> + <requirement type="package" version="0.6.2">bwa</requirement> + </requirements> + <description></description> + <parallelism method="basic"></parallelism> + <command interpreter="python"> + bwa_wrapper.py + --threads="4" + + #if $input1.ext == "fastqillumina": + --illumina1.3 + #end if + + ## reference source + --fileSource="${genomeSource.refGenomeSource}" + #if $genomeSource.refGenomeSource == "history": + ##build index on the fly + --ref="${genomeSource.ownFile}" + --dbkey="${dbkey}" + #else: + ##use precomputed indexes + --ref="${genomeSource.indices.fields.path}" + --do_not_build_index + #end if + + ## input file(s) + --input1="${paired.input1}" + + ## output file + --output="${output}" + + ## run parameters + --params="${params.source_select}" + #if $params.source_select != "pre_set": + --maxEditDist="${params.maxEditDist}" + --fracMissingAligns="${params.fracMissingAligns}" + --maxGapOpens="${params.maxGapOpens}" + --maxGapExtens="${params.maxGapExtens}" + --disallowLongDel="${params.disallowLongDel}" + --disallowIndel="${params.disallowIndel}" + --seed="${params.seed}" + --maxEditDistSeed="${params.maxEditDistSeed}" + --mismatchPenalty="${params.mismatchPenalty}" + --gapOpenPenalty="${params.gapOpenPenalty}" + --gapExtensPenalty="${params.gapExtensPenalty}" + --suboptAlign="${params.suboptAlign}" + --noIterSearch="${params.noIterSearch}" + --outputTopN="${params.outputTopN}" + --outputTopNDisc="${params.outputTopNDisc}" + --maxInsertSize="${params.maxInsertSize}" + --maxOccurPairing="${params.maxOccurPairing}" + #if $params.readGroup.specReadGroup == "yes" + --rgid="${params.readGroup.rgid}" + --rgcn="${params.readGroup.rgcn}" + --rgds="${params.readGroup.rgds}" + --rgdt="${params.readGroup.rgdt}" + --rgfo="${params.readGroup.rgfo}" + --rgks="${params.readGroup.rgks}" + --rglb="${params.readGroup.rglb}" + --rgpg="${params.readGroup.rgpg}" + --rgpi="${params.readGroup.rgpi}" + --rgpl="${params.readGroup.rgpl}" + --rgpu="${params.readGroup.rgpu}" + --rgsm="${params.readGroup.rgsm}" + #end if + #end if + + ## suppress output SAM header + --suppressHeader="${suppressHeader}" + </command> + <inputs> + <conditional name="genomeSource"> + <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="indices" type="select" label="Select a reference genome"> + <options from_data_table="bwa_indexes"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No indexes are available" /> + </options> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" /> + </when> + </conditional> + <conditional name="paired"> + <param name="sPaired" type="select" label="Is this library mate-paired?"> + <option value="single">Single-end</option> + </param> + <when value="single"> + <param name="input1" type="data" format="zip" label="Zip file" help="Zip file with several FASTQ with either Sanger-scaled quality values (fastqsanger) or Illumina-scaled quality values (fastqillumina)" /> + </when> + </conditional> + <conditional name="params"> + <param name="source_select" type="select" label="BWA settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List"> + <option value="pre_set">Commonly Used</option> + <option value="full">Full Parameter List</option> + </param> + <when value="pre_set" /> + <when value="full"> + <param name="maxEditDist" type="integer" value="0" label="Maximum edit distance (aln -n)" help="Enter this value OR a fraction of missing alignments, not both" /> + <param name="fracMissingAligns" type="float" value="0.04" label="Fraction of missing alignments given 2% uniform base error rate (aln -n)" help="Enter this value OR maximum edit distance, not both" /> + <param name="maxGapOpens" type="integer" value="1" label="Maximum number of gap opens (aln -o)" /> + <param name="maxGapExtens" type="integer" value="-1" label="Maximum number of gap extensions (aln -e)" help="-1 for k-difference mode (disallowing long gaps)" /> + <param name="disallowLongDel" type="integer" value="16" label="Disallow long deletion within [value] bp towards the 3'-end (aln -d)" /> + <param name="disallowIndel" type="integer" value="5" label="Disallow insertion/deletion within [value] bp towards the end (aln -i)" /> + <param name="seed" type="integer" value="-1" label="Number of first subsequences to take as seed (aln -l)" help="Enter -1 for infinity" /> + <param name="maxEditDistSeed" type="integer" value="2" label="Maximum edit distance in the seed (aln -k)" /> + <param name="mismatchPenalty" type="integer" value="3" label="Mismatch penalty (aln -M)" help="BWA will not search for suboptimal hits with a score lower than [value]" /> + <param name="gapOpenPenalty" type="integer" value="11" label="Gap open penalty (aln -O)" /> + <param name="gapExtensPenalty" type="integer" value="4" label="Gap extension penalty (aln -E)" /> + <param name="suboptAlign" type="integer" optional="True" label="Proceed with suboptimal alignments if there are no more than INT equally best hits. (aln -R)" help="For paired-end reads only. By default, BWA only searches for suboptimal alignments if the top hit is unique. Using this option has no effect on accuracy for single-end reads. It is mainly designed for improving the alignment accuracy of paired-end reads. However, the pairing procedure will be slowed down, especially for very short reads (~32bp)" /> + <param name="noIterSearch" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Disable iterative search (aln -N)" help="All hits with no more than maxDiff differences will be found. This mode is much slower than the default" /> + <param name="outputTopN" type="integer" value="3" label="Maximum number of alignments to output in the XA tag for reads paired properly (samse/sampe -n)" help="If a read has more than INT hits, the XA tag will not be written" /> + <param name="outputTopNDisc" type="integer" value="10" label="Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) (sampe -N)" help="For paired-end reads only. If a read has more than INT hits, the XA tag will not be written" /> + <param name="maxInsertSize" type="integer" value="500" label="Maximum insert size for a read pair to be considered as being mapped properly (sampe -a)" help="For paired-end reads only. Only used when there are not enough good alignments to infer the distribution of insert sizes" /> + <param name="maxOccurPairing" type="integer" value="100000" label="Maximum occurrences of a read for pairing (sampe -o)" help="For paired-end reads only. A read with more occurrences will be treated as a single-end read. Reducing this parameter helps faster pairing" /> + <conditional name="readGroup"> + <param name="specReadGroup" type="select" label="Specify the read group for this file? (samse/sampe -r)"> + <option value="yes">Yes</option> + <option value="no" selected="True">No</option> + </param> + <when value="yes"> + <param name="rgid" type="text" size="25" label="Read group identifier (ID). Each @RG line must have a unique ID. The value of ID is used in the RG +tags of alignment records. Must be unique among all read groups in header section." help="Required if RG specified. Read group +IDs may be modified when merging SAM files in order to handle collisions." /> + <param name="rgcn" type="text" size="25" label="Sequencing center that produced the read (CN)" help="Optional" /> + <param name="rgds" type="text" size="25" label="Description (DS)" help="Optional" /> + <param name="rgdt" type="text" size="25" label="Date that run was produced (DT)" help="Optional. ISO8601 format date or date/time, like YYYY-MM-DD" /> + <param name="rgfo" type="text" size="25" label="Flow order (FO). The array of nucleotide bases that correspond to the nucleotides used for each +flow of each read." help="Optional. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by +various other characters. Format : /\*|[ACMGRSVTWYHKDBN]+/" /> + <param name="rgks" type="text" size="25" label="The array of nucleotide bases that correspond to the key sequence of each read (KS)" help="Optional" /> + <param name="rglb" type="text" size="25" label="Library name (LB)" help="Required if RG specified" /> + <param name="rgpg" type="text" size="25" label="Programs used for processing the read group (PG)" help="Optional" /> + <param name="rgpi" type="text" size="25" label="Predicted median insert size (PI)" help="Optional" /> + <param name="rgpl" type="text" size="25" label="Platform/technology used to produce the reads (PL)" help="Required if RG specified. Valid values : CAPILLARY, LS454, ILLUMINA, +SOLID, HELICOS, IONTORRENT and PACBIO" /> + <param name="rgpu" type="text" size="25" label="Platform unit (PU)" help="Optional. Unique identifier (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)" /> + <param name="rgsm" type="text" size="25" label="Sample (SM)" help="Required if RG specified. Use pool name where a pool is being sequenced" /> + </when> + <when value="no" /> + </conditional> + </when> + </conditional> + <param name="suppressHeader" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Suppress the header in the output SAM file" help="BWA produces SAM with several lines of header information" /> + </inputs> + <outputs> + <data format="zip" name="output" label="${tool.name} on ${on_string}: mapped reads"/> + </outputs> + <help> + +**What it does** + +BWA is a fast light-weighted tool that aligns relatively short sequences (queries) to a sequence database (large), such as the human reference genome. It is developed by Heng Li at the Sanger Insitute. Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. + +------ + +**Know what you are doing** + +.. class:: warningmark + +There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. + + .. __: http://bio-bwa.sourceforge.net/ + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +------ + +**Input formats** + +BWA accepts files in either Sanger FASTQ format (galaxy type *fastqsanger*) or Illumina FASTQ format (galaxy type *fastqillumina*). Use the FASTQ Groomer to prepare your files. + +------ + +**A Note on Built-in Reference Genomes** + +The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY. + +------ + +**Outputs** + +The output is in SAM format, and has the following columns:: + + Column Description + -------- -------------------------------------------------------- + 1 QNAME Query (pair) NAME + 2 FLAG bitwise FLAG + 3 RNAME Reference sequence NAME + 4 POS 1-based leftmost POSition/coordinate of clipped sequence + 5 MAPQ MAPping Quality (Phred-scaled) + 6 CIGAR extended CIGAR string + 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME) + 8 MPOS 1-based Mate POSition + 9 ISIZE Inferred insert SIZE + 10 SEQ query SEQuence on the same strand as the reference + 11 QUAL query QUALity (ASCII-33 gives the Phred base quality) + 12 OPT variable OPTional fields in the format TAG:VTYPE:VALU + +The flags are as follows:: + + Flag Description + ------ ------------------------------------- + 0x0001 the read is paired in sequencing + 0x0002 the read is mapped in a proper pair + 0x0004 the query sequence itself is unmapped + 0x0008 the mate is unmapped + 0x0010 strand of the query (1 for reverse) + 0x0020 strand of the mate + 0x0040 the read is the first read in a pair + 0x0080 the read is the second read in a pair + 0x0100 the alignment is not primary + +It looks like this (scroll sideways to see the entire example):: + + QNAME FLAG RNAME POS MAPQ CIAGR MRNM MPOS ISIZE SEQ QUAL OPT + HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh + HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh + +------- + +**BWA settings** + +All of the options have a default value. You can change any of them. All of the options in BWA have been implemented here. + +------ + +**BWA parameter list** + +This is an exhaustive list of BWA options: + +For **aln**:: + + -n NUM Maximum edit distance if the value is INT, or the fraction of missing + alignments given 2% uniform base error rate if FLOAT. In the latter + case, the maximum edit distance is automatically chosen for different + read lengths. [0.04] + -o INT Maximum number of gap opens [1] + -e INT Maximum number of gap extensions, -1 for k-difference mode + (disallowing long gaps) [-1] + -d INT Disallow a long deletion within INT bp towards the 3'-end [16] + -i INT Disallow an indel within INT bp towards the ends [5] + -l INT Take the first INT subsequence as seed. If INT is larger than the + query sequence, seeding will be disabled. For long reads, this option + is typically ranged from 25 to 35 for '-k 2'. [inf] + -k INT Maximum edit distance in the seed [2] + -t INT Number of threads (multi-threading mode) [1] + -M INT Mismatch penalty. BWA will not search for suboptimal hits with a score + lower than (bestScore-misMsc). [3] + -O INT Gap open penalty [11] + -E INT Gap extension penalty [4] + -c Reverse query but not complement it, which is required for alignment + in the color space. + -R Proceed with suboptimal alignments even if the top hit is a repeat. By + default, BWA only searches for suboptimal alignments if the top hit is + unique. Using this option has no effect on accuracy for single-end + reads. It is mainly designed for improving the alignment accuracy of + paired-end reads. However, the pairing procedure will be slowed down, + especially for very short reads (~32bp). + -N Disable iterative search. All hits with no more than maxDiff + differences will be found. This mode is much slower than the default. + +For **samse**:: + + -n INT Maximum number of alignments to output in the XA tag for reads paired + properly. If a read has more than INT hits, the XA tag will not be + written. [3] + -r STR Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null] + +For **sampe**:: + + -a INT Maximum insert size for a read pair to be considered as being mapped + properly. Since version 0.4.5, this option is only used when there + are not enough good alignment to infer the distribution of insert + sizes. [500] + -n INT Maximum number of alignments to output in the XA tag for reads paired + properly. If a read has more than INT hits, the XA tag will not be + written. [3] + -N INT Maximum number of alignments to output in the XA tag for disconcordant + read pairs (excluding singletons). If a read has more than INT hits, + the XA tag will not be written. [10] + -o INT Maximum occurrences of a read for pairing. A read with more + occurrences will be treated as a single-end read. Reducing this + parameter helps faster pairing. [100000] + -r STR Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null] + +For specifying the read group in **samse** or **sampe**, use the following:: + + @RG Read group. Unordered multiple @RG lines are allowed. + ID Read group identifier. Each @RG line must have a unique ID. The value of + ID is used in the RG tags of alignment records. Must be unique among all + read groups in header section. Read group IDs may be modified when + merging SAM files in order to handle collisions. + CN Name of sequencing center producing the read. + DS Description. + DT Date the run was produced (ISO8601 date or date/time). + FO Flow order. The array of nucleotide bases that correspond to the + nucleotides used for each flow of each read. Multi-base flows are encoded + in IUPAC format, and non-nucleotide flows by various other characters. + Format : /\*|[ACMGRSVTWYHKDBN]+/ + KS The array of nucleotide bases that correspond to the key sequence of each read. + LB Library. + PG Programs used for processing the read group. + PI Predicted median insert size. + PL Platform/technology used to produce the reads. Valid values : CAPILLARY, + LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO. + PU Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for + SOLiD). Unique identifier. + SM Sample. Use pool name where a pool is being sequenced. + + </help> + <citations> + <citation type="doi">10.1111/mec.12354</citation> + <citation type="doi">10.1111/mec.12330</citation> + <citation type="doi">10.1534/g3.111.000240</citation> + <citation type="doi">10.1534/genetics.111.127324</citation> + <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation> + <citation type="doi">10.1073/pnas.1006538107</citation> + + <citation type="bibtex">@INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + }</citation> +</citations> +</tool> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/stacks.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,367 @@ +""" + +STACKS METHODS FOR GALAXY + +Created by Cyril Monjeaud & Yvan Le Bras +Cyril.Monjeaud@irisa.fr +yvan.le_bras@irisa.fr + +Last modifications : 01/22/2014 + + +""" + +import os, sys, re, shutil +import glob +import collections +import gzip, zipfile, tarfile +import subprocess +from galaxy.datatypes.checkers import * + + +""" + +STACKS COMMON METHODS + +galaxy_config_to_tabfiles(input_config) +galaxy_config_to_tabfiles_for_STACKS(input_config) +extract_compress_files_from_tabfiles(tab_files, tmp_input_dir) +create_symlinks_from_tabfiles(tab_files, tmp_input_dir) + +""" +def galaxy_config_to_tabfiles(input_config): + + tab_files=collections.OrderedDict() + for line in open(input_config, "r").readlines(): + if line.strip() != '': + extract=line.strip().split("::") + tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] + + # tabfiles[name]-> path + return tab_files + + +def galaxy_config_to_tabfiles_for_STACKS(input_config): + + tab_files=collections.OrderedDict() + for line in open(input_config, "r").readlines(): + if line.strip() != '': + extract=line.strip().split("::") + parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0]) + # rename galaxy name in a short name + if parse_name: + extract[0]=parse_name.groups(1)[0] + + tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] + + # tabfiles[name]-> path + return tab_files + + +def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir): + + # for each file + for key in tab_files.keys(): + #test if is zip file + if (check_zip( tab_files[key] )): + + # extract all files names and added it in the tab + myarchive = zipfile.ZipFile(tab_files[key], 'r') + for i in myarchive.namelist(): + tab_files[i]=tmp_input_dir+"/"+i + + # extract all files + myarchive.extractall(tmp_input_dir) + + #remove compress file from the tab + del tab_files[key] + + #test if is tar.gz file + else: + if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ): + # extract all files names and added it in the tab + mygzfile = tarfile.open(tab_files[key], 'r') + + for i in mygzfile.getnames(): + tab_files[i]=tmp_input_dir+"/"+i + + # extract all files + mygzfile.extractall(tmp_input_dir) + + #remove compress file from the tab + del tab_files[key] + + + +def create_symlinks_from_tabfiles(tab_files, tmp_input_dir): + + for key in tab_files.keys(): + #print "file single: "+key+" -> "+tab_files[key] + #create a sym_link in our temp dir + if not os.path.exists(tmp_input_dir+'/'+key): + os.symlink(tab_files[key], tmp_input_dir+'/'+key) + + +""" + +PROCESS RADTAGS METHODS + +generate_additional_file(tmp_output_dir, output_archive) + +""" + +def change_outputs_procrad_name(tmp_output_dir, sample_name): + + list_files = glob.glob(tmp_output_dir+'/*') + + for file in list_files: + # change sample name + new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name)) + + # transform .fa -> .fasta or .fq->.fastq + if os.path.splitext(new_file_name)[1] == ".fa": + new_file_name = os.path.splitext(new_file_name)[0]+'.fasta' + else: + new_file_name = os.path.splitext(new_file_name)[0]+'.fastq' + + shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name) + + +def generate_additional_archive_file(tmp_output_dir, output_archive): + + list_files = glob.glob(tmp_output_dir+'/*') + + myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True) + + # for each fastq file + for fastq_file in list_files: + # add file to the archive output + myzip.write(fastq_file, os.path.basename(fastq_file)) + + shutil.move("archive.zip.temp", output_archive) + + +""" + +DENOVOMAP METHODS + +check_fastq_extension_and_add(tab_files, tmp_input_dir) + +""" + +def check_fastq_extension_and_add(tab_files, tmp_input_dir): + + # for each file + for key in tab_files.keys(): + + if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key): + # open the file + myfastxfile=open(tab_files[key], 'r') + + # get the header + line = myfastxfile.readline() + line = line.strip() + + # fasta rapid test + if line.startswith( '>' ): + tab_files[key+".fasta"]=tab_files[key] + del tab_files[key] + # fastq rapid test + elif line.startswith( '@' ): + tab_files[key+".fq"]=tab_files[key] + del tab_files[key] + else: + print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file" + + myfastxfile.close() + + +""" + +REFMAP METHODS + +""" + +def check_sam_extension_and_add(tab_files, tmp_input_dir): + + # for each file + for key in tab_files.keys(): + + if not re.search("\.sam$", key): + # add the extension + tab_files[key+".sam"]=tab_files[key] + del tab_files[key] + + + + + + +""" + +PREPARE POPULATION MAP METHODS + +generate_popmap_for_denovo(tab_files, infos_file, pop_map) +generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map) + + +""" +def generate_popmap_for_denovo(tab_files, infos_file, pop_map): + + # initiate the dict : barcode -> tab[seq] + fq_name_for_barcode={} + + for key in tab_files: + single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0] + fq_name_for_barcode[single_barcode]=key + + # open the infos file and output file + my_open_info_file=open(infos_file, 'r') + my_output_file=open(pop_map, 'w') + + # conversion tab for population to integer + pop_to_int=[] + + # write infos into the final output + for line in my_open_info_file: + + parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip()) + + if not parse_line: + print "[WARNING] Wrong input infos file structure : "+line + else: + barcode=parse_line.groups(1)[0] + population_name=parse_line.groups(1)[1] + + # if its the first meet with the population + if population_name not in pop_to_int: + pop_to_int.append(population_name) + + # manage ext if present, because the population map file should not have the ext + if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]): + fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0] + else: + fqfile=fq_name_for_barcode[barcode] + + + # write in the file + my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n") + + # close files + my_output_file.close() + my_open_info_file.close() + + + + +def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map): + + # initiate the dict : barcode -> tab[seq] + seq_id_for_barcode={} + + # initiate the dict : barcode -> sam_name + sam_name_for_barcode={} + + ### Parse fastqfiles ### + # insert my barcode into a tab with sequences ID associated + for fastq_file in tab_fq_files.keys(): + single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0] + + # open the fasq file + open_fastqfile=open(tab_fq_files[fastq_file], 'r') + + # for each line, get the seq ID + tab_seq_id=[] + for line in open_fastqfile: + my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line) + if my_match_seqID: + tab_seq_id.append(my_match_seqID.groups(0)[0]) + + # push in a dict the tab of seqID for the current barcode + seq_id_for_barcode[single_barcode]=tab_seq_id + + + ### Parse samfiles and get the first seq id ### + for sam_file in tab_sam_files.keys(): + + # open the sam file + open_samfile=open(tab_sam_files[sam_file], 'r') + + # get the first seq id + first_seq_id='' + for line in open_samfile: + if not re.search("^@", line): + first_seq_id=line.split("\t")[0] + break + + + # map with seq_id_for_barcode structure + for barcode in seq_id_for_barcode: + for seq in seq_id_for_barcode[barcode]: + if seq == first_seq_id: + #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode + sam_name_for_barcode[barcode]=sam_file + break + + # open the infos file and output file + my_open_info_file=open(infos_file, 'r') + my_output_file=open(pop_map, 'w') + + # conversion tab for population to integer + pop_to_int=[] + + # write infos into the final output + for line in my_open_info_file: + parse_line=re.search("(^[ATCG]+)\t(.*)", line) + + if not parse_line: + print "[WARNING] Wrong input infos file structure : "+line + else: + + # if its the first meet with the population + if parse_line.groups(1)[1] not in pop_to_int: + pop_to_int.append(parse_line.groups(1)[1]) + + # manage ext if present, because the population map file should not have the ext + if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]): + samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0] + else: + samfile=sam_name_for_barcode[parse_line.groups(1)[0]] + + # write in the file + my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n") + + # close files + my_output_file.close() + my_open_info_file.close() + + +""" + +STACKS POPULATION + + +""" + + +def extract_compress_files(myfile, tmp_input_dir): + + #test if is zip file + if (check_zip( myfile )): + + # extract all files names and added it in the tab + myarchive = zipfile.ZipFile(myfile, 'r') + + # extract all files + myarchive.extractall(tmp_input_dir) + + + #test if is tar.gz file + else: + # extract all files names and added it in the tab + mygzfile = tarfile.open(myfile, 'r') + + # extract all files + mygzfile.extractall(tmp_input_dir) + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,8 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of indexes in the BWA mapper format --> + <table name="bwa_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bwa_index.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,13 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="stacks" version="1.18"> + <repository name="package_stacks_1_18" changeset_revision="6bb02bb29819" owner="cmonjeau" toolshed="https://toolshed.g2.bx.psu.edu/" /> + </package> + <package name="bwa" version="0.6.2"> + <repository name="package_bwa_0_6_2" changeset_revision="0778635a84ba" owner="devteam" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu/" /> + </package> + <package name="velvet" version="1.2.10"> + <repository name="package_velvet_1_2_10" changeset_revision="1c500c3e7fdf" owner="rjullien" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu/" /> + </package> +</tool_dependency> +
