Mercurial > repos > joachim-jacob > qualimap_suite
changeset 2:934cd08c77af draft
Uploaded
author | joachim-jacob |
---|---|
date | Tue, 12 Feb 2013 04:48:36 -0500 |
parents | 59c0a4f9c9ff |
children | 9537dd9dd18b |
files | README bamqc.xml bamqc_wrapper.pl |
diffstat | 3 files changed, 410 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Feb 12 04:48:36 2013 -0500 @@ -0,0 +1,30 @@ +qualimap - 1.0.0 +===================================================================== +<joachim.jacob@vib.be> + +SUMMARY +------- +Qualimap examines sequencing alignment data in SAM/BAM files and provides an overall view of the data that helps to the detect biases in the sequencing and/or mapping of the data and eases decision-making for further analysis. + +http://qualimap.bioinfo.cipf.es/ + +Garcia-Alcalde et.al, (2012) Qualimap: evaluating next generation sequencing aligmnent data, +Bioinformatics. doi: 10.1093/bioinformatics/bts503 + +REQUIREMENTS AND INSTALLATION +----------------------------- +Install this tool in your Galaxy via the Toolshed + + ) Third party code installation + Required: qualimap JAR archives. + + ) Integration into Galaxy + Please use the toolshed + +DETAILS AND ISSUES +------------------ + ) Only bamqc provided in this first version + ) bamqc: allow passing number of threads, and take into account number of 'chunks' + ) bamqc: + +=====================================================================
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bamqc.xml Tue Feb 12 04:48:36 2013 -0500 @@ -0,0 +1,238 @@ + +<tool id="qualimap_bamqc" name="Analyse SAM/BAM with bamqc" version="0.0.1"> + <!-- Additional info: wrapper compatible with versions ..... --> + <description> + to asses mapping quality metrics. + </description> + + <version_command> + qualimap --version + </version_command> + + <requirements> + <requirement type="package">qualimap</requirement> + </requirements> + + <command interpreter="perl"> + ## it is recommended that you write a wrapper for your tool + ## and pass all parameters to that tool, which parses them. + bamqc_wrapper.pl $configfile + </command> + + <inputs> + <param format="sam,bam" name="bam" type="data" label="Alignments in the BAM or SAM format" help="The set of aligned reads." /> + <param type="boolean" name="c" checked="TRUE" truevalue="-c" falsevalue="" label="paint chromosome limits inside charts" /> + <conditional name="customgtf"> + <param name="upload" type="select" label="BETA! Analyze the alignment data for the regions of interest you provide"> + <option value="yes">Yes</option> + <option value="no" selected="true">No</option> + </param> + <when value="yes"> + <param name="gff" type="data" format="bed,gtf,gff3" label="Choose your feature annotation file" help="Provide your BED, GTF or GFF file"/> + <param name="os" type="boolean" checked="FALSE" truevalue="-os" falsevalue="" label="compute also regions outside stats" help="If checked, the information about the reads that are mapped outside of the regions of interest will be also computed and shown in a separate section" /> + <param type="select" name="p" label="The sequencing protocol strand specificity" help="Can be non-strand-specific, forward-stranded orreverse-stranded. This information is required to calculate the number of correct strand reads."> + <option value="NON-STRAND-SPECIFIC">Non-strand-specific</option> + <option value="STRAND-SPECIFIC-FORWARD">Strand-specific forward</option> + <option value="STRAND-SPECIFIC-REVERSE">Strand-specific reverse</option> + </param> + </when> + <when value="no"/> + </conditional> + <param name="hm" type="text" size="3" value="3" label="minimum size for a homopolymer to be considered in indel analysis" help="Only homopolymers of this size or larger will be considered when estimating homopolymer indels count"/> + <param name="nr" type="text" size="6" value="1000" label="number of reads in the chunk" help="In order to reduce the load of I/O, reads are analyzed in chunks. Each chunk contains the selected number of reads which will be loaded into memory and analyzed by a single thread. Smaller numbers may result in lower performance, but also the memory consumption will be reduced. The default value is 1000 reads"/> + </inputs> + + <outputs> + <data format="html" name="bamqc_result" label="${tool.name} on ${on_string}"> + <!-- <data format="html" name="bamqc_result" label="${tool.name} on ${on_string}" from_work_dir="bamqc_output/qualimapReport.html"> --> + </data> + </outputs> + + <configfiles> + <!-- this config file collects all parameter settings --> + <configfile name="configfile"> + ## first we pass some galaxy environment variables + galtemp==${__new_file_path__} + + bamqc_result==$bamqc_result + outputdir==$bamqc_result.files_path + bam==$bam + c==$c + hm==$hm + nr==$nr + #if $customgtf.upload=="yes" + gff==$customgtf.gff + os==$customgtf.os + p==$customgtf.p + #end if + </configfile> + </configfiles> + + <tests> + <!-- Test base-space single-end reads with pre-built index and preset parameters --> + <test> + <!-- TopHat commands: + tophat -o tmp_dir -p 1 tophat_in1 test-data/tophat_in2.fastqsanger + Rename the files in tmp_dir appropriately + --> + <param name="input1" ftype="fastqsanger" value="tophat_in2.fastqsanger" /> + <param name="genomeSource" value="indexed" /> + <param name="index" value="tophat_test" /> + <param name="sPaired" value="single" /> + <param name="sSettingsType" value="preSet" /> + <output name="junctions" file="tophat_out1j.bed" /> + <output name="accepted_hits" file="tophat_out1h.bam" compare="sim_size" /> + </test> + <!-- Test using base-space test data: paired-end reads, index from history. --> + <test> + <!-- TopHat commands: + bowtie-build -f test-data/tophat_in1.fasta tophat_in1 + tophat -o tmp_dir -p 1 -r 20 tophat_in1 test-data/tophat_in2.fastqsanger test-data/tophat_in3.fastqsanger + Rename the files in tmp_dir appropriately + --> + <param name="input1" ftype="fastqsanger" value="tophat_in2.fastqsanger" /> + <param name="genomeSource" value="history" /> + <param name="ownFile" ftype="fasta" value="tophat_in1.fasta" /> + <param name="sPaired" value="paired" /> + <param name="input2" ftype="fastqsanger" value="tophat_in3.fastqsanger" /> + <param name="mate_inner_distance" value="20" /> + <param name="pSettingsType" value="preSet" /> + <output name="junctions" file="tophat_out2j.bed" /> + <output name="accepted_hits" file="tophat_out2h.bam" compare="sim_size" /> + </test> + <!-- Test base-space single-end reads with user-supplied reference fasta and full parameters --> + <test> + <!-- Tophat commands: + bowtie-build -f test-data/tophat_in1.fasta tophat_in1 + tophat -o tmp_dir -p 1 -a 8 -m 0 -i 70 -I 500000 -F 0.15 -g 40 +coverage-search +min-coverage-intron 50 +max-coverage-intro 20000 +segment-mismatches 2 +segment-length 25 +closure-search +min-closure-exon 50 +min-closure-intron 50 +max-closure-intro 5000 +microexon-search tophat_in1 test-data/tophat_in2.fastqsanger + Replace the + with double-dash + Rename the files in tmp_dir appropriately + --> + <param name="input1" ftype="fastqsanger" value="tophat_in2.fastqsanger"/> + <param name="genomeSource" value="history"/> + <param name="ownFile" value="tophat_in1.fasta"/> + <param name="sPaired" value="single"/> + <param name="sSettingsType" value="full"/> + <param name="library_type" value="FR Unstranded"/> + <param name="anchor_length" value="8"/> + <param name="splice_mismatches" value="0"/> + <param name="min_intron_length" value="70"/> + <param name="max_intron_length" value="500000"/> + <param name="max_multihits" value="40"/> + <param name="min_segment_intron" value="50" /> + <param name="max_segment_intron" value="500000" /> + <param name="seg_mismatches" value="2"/> + <param name="seg_length" value="25"/> + <param name="allow_indel_search" value="Yes"/> + <param name="max_insertion_length" value="3"/> + <param name="max_deletion_length" value="3"/> + <param name="use_junctions" value="Yes" /> + <param name="use_annotations" value="No" /> + <param name="use_juncs" value="No" /> + <param name="no_novel_juncs" value="No" /> + <param name="use_search" value="Yes" /> + <param name="min_closure_exon" value="50" /> + <param name="min_closure_intron" value="50" /> + <param name="max_closure_intron" value="5000" /> + <param name="use_search" value="Yes" /> + <param name="min_coverage_intron" value="50" /> + <param name="max_coverage_intron" value="20000" /> + <param name="microexon_search" value="Yes" /> + <output name="insertions" file="tophat_out3i.bed" /> + <output name="deletions" file="tophat_out3d.bed" /> + <output name="junctions" file="tophat_out3j.bed" /> + <output name="accepted_hits" file="tophat_out3h.bam" compare="sim_size" /> + </test> + <!-- Test base-space paired-end reads with user-supplied reference fasta and full parameters --> + <test> + <!-- TopHat commands: + tophat -o tmp_dir -r 20 -p 1 -a 8 -m 0 -i 70 -I 500000 -F 0.15 -g 40 +coverage-search +min-coverage-intron 50 +max-coverage-intro 20000 +segment-mismatches 2 +segment-length 25 +microexon-search tophat_in1 test-data/tophat_in2.fastqsanger test-data/tophat_in3.fastqsanger + Replace the + with double-dash + Rename the files in tmp_dir appropriately + --> + <param name="input1" ftype="fastqsanger" value="tophat_in2.fastqsanger"/> + <param name="genomeSource" value="indexed"/> + <param name="index" value="tophat_test"/> + <param name="sPaired" value="paired"/> + <param name="input2" ftype="fastqsanger" value="tophat_in3.fastqsanger"/> + <param name="mate_inner_distance" value="20"/> + <param name="pSettingsType" value="full"/> + <param name="library_type" value="FR Unstranded"/> + <param name="mate_std_dev" value="20"/> + <param name="anchor_length" value="8"/> + <param name="splice_mismatches" value="0"/> + <param name="min_intron_length" value="70"/> + <param name="max_intron_length" value="500000"/> + <param name="max_multihits" value="40"/> + <param name="min_segment_intron" value="50" /> + <param name="max_segment_intron" value="500000" /> + <param name="seg_mismatches" value="2"/> + <param name="seg_length" value="25"/> + <param name="allow_indel_search" value="No"/> + <param name="use_junctions" value="Yes" /> + <param name="use_annotations" value="No" /> + <param name="use_juncs" value="No" /> + <param name="no_novel_juncs" value="No" /> + <param name="use_search" value="No" /> + <param name="microexon_search" value="Yes" /> + <output name="junctions" file="tophat_out4j.bed" /> + <output name="accepted_hits" file="tophat_out4h.bam" compare="sim_size" /> + </test> + </tests> + + <help> +**Tool Overview** + +Tool_ allows for simply but throroughly checking of the quality of mapping. + +.. _Tool: http://qualimap.bioinfo.cipf.es// + +------ + +**Know what you are doing** + +.. class:: warningmark + +Know what you are doing by reading the `documentation`__ and experimenting. + +.. __: http://tophat.cbcb.umd.edu/manual.html + +------ + +**Input formats** + +Tool accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files. + +------ + +**Outputs** + +Tool produces two output files: + +- junctions -- A UCSC BED_ track of junctions reported by TopHat. Each junction consists of two connected BED blocks, where each block is as long as the maximal overhang of any read spanning the junction. The score is the number of alignments spanning the junction. +- accepted_hits -- A list of read alignments in BAM_ format. + +.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 +.. _BAM: http://samtools.sourceforge.net/ + +Two other possible outputs, depending on the options you choose, are insertions and deletions, both of which are in BED format. + +------- + +**Tool settings** + +All of the options have a default value. You can change any of them. Some of the options in Tophat have been implemented here. + +------ + +**Tool parameter list** + +This is a list of implemented Tophat options:: + + -r This is the expected (mean) inner distance between mate pairs. For, example, for paired end runs with fragments + selected at 300bp, where each end is 50bp, you should set -r to be 200. There is no default, and this parameter + is required for paired end runs. + + </help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bamqc_wrapper.pl Tue Feb 12 04:48:36 2013 -0500 @@ -0,0 +1,142 @@ +#!/usr/bin/perl +# bamqc_wrapper.pl +# Joachim Jacob - joachim.jacob@gmail.com - 2013 + +use strict; +use File::Temp 'tempdir'; +use File::Basename; +use Log::Log4perl qw(:easy); + +# ---------------------- Prepping Logging -----------------------------# +######################################################################## +# Log levels: $DEBUG $INFO $WARN $ERROR $FATAL +# ConversionPattern: %d %-5p %F{1} [%M] (line %L): %m%n%n +my $log_conf = q/ + log4perl.category = ERROR, Screen + log4perl.appender.Screen.stderr=1 + log4perl.appender.Screen.layout=Log::Log4perl::Layout::PatternLayout + log4perl.appender.Screen.layout.ConversionPattern = %d %-5p %m%n + log4perl.appender.Screen = Log::Log4perl::Appender::Screen +/; + +Log::Log4perl::init( \$log_conf ); +my $logger = get_logger(); + +# ----------------- Getting parameters file ---------------------------# +######################################################################## +my ($configfile) = @ARGV; +my (%para); +open(CONFIG,"<$configfile"); +while (<CONFIG>) { + if (/(\S+)==(.+)$/){ $para{ $1 } = $2 ; } +} +close(CONFIG); + +=Excerpt Config parameters + + ## first we pass some galaxy environment variables + galtemp==${__new_file_path__} + + bam==$bam + c==$c + hm==$hm + nr==$nr + #if $customgtf.upload=="yes" + gff==$customgtf.gff + os==$customgtf.os + p==$customgtf.p + #end if + +=cut + +for my $para (keys %para){ + INFO "$para\tset to\t$para{$para}"; +} + + +# ---------------------- Prepping temp dir ----------------------------# +######################################################################## +# within the temporary directory of Galaxy, we create a temporary dir + +my $galtemp = $para{'galtemp'}; +DEBUG "\nReceived Galaxy temporary directory:\n$galtemp"; + +my $tempdir = File::Temp->tempdir('tmpXXXXX', DIR => $galtemp, CLEANUP => 1); +mkdir "$tempdir", 077 unless -d "$tempdir"; +INFO "\nTemporary directory:\n$tempdir"; + + +# ---------------------- Output files ---------------------------------# +######################################################################## +# Two possible scenarios +# - output is written to the file at the output location in config file +# - output is moved to the output file location + +# open (SAMOUTPUT, ">$para{'output_sam'}") +# or ( ERROR "Could not open output file for writing! (file:$para{'output_sam'}" and die ); +# print SAMOUTPUT "Here comes sam\n"; +# close (SAMOUTPUT); + + +# -------------------- Assembling command ----------------------------# +######################################################################## +my $outdir="bamqc_output"; +my $qualimap_path=""; # for tool_dependency later on +my $command = $qualimap_path."qualimap bamqc --outdir $outdir"; # this will ultimately be executed + +# based on parameters, which are called by $para{'parametername'} +# the command is assembled +$command .= " -bam $para{'bam'} -hm $para{'hm'} -nr $para{'nr'}"; +if($para{'c'}){$command .= " $para{'c'} "; } +if ( $para{'gff'} ){ + $command .= " -gff $para{'gff'} -p $para{'p'} "; + if($para{'os'}) { + $command .= $para{'os'}; + } +} + +$command .= " -nt 4 -outformat HTML"; + + +# --------------------- Executing command ----------------------------# +######################################################################## +run_process($command, "qualimap bamqc", "."); + + +# ----------------- Postprocessing command ---------------------------# +######################################################################## +# Hackish !! + +my $librarydir = dirname($para{'bamqc_result'}); +my $jobworkdir = dirname($para{'outputdir'}); +my $files_path_name = basename($para{'outputdir'}); +INFO "\n Librarydir = $librarydir \n jobworkdir = $jobworkdir \n files_path_name = $files_path_name \n"; +system("mv $jobworkdir/bamqc_output $librarydir/$files_path_name") == 0 or die "Could not move output data to $librarydir\n"; +system("rm -rf $para{'bamqc_result'}") == 0 or die "Could not post-process the results...\n"; +system("ln -s $librarydir/$files_path_name/qualimapReport.html $para{'bamqc_result'}") == 0 or die "Could not move output data to $librarydir\n"; + + +# --------------------------- Exiting --------------------------------# +######################################################################## +exit 0; + + + +### Subroutines ### +######################################################################## +sub run_process { + my ($command, $name, $tempdir)= @_; + my $logger = get_logger(); + INFO "\nProcess to launch:\n $command\nIn directory\n$tempdir\n"; + system("cd $tempdir; $command 2>/dev/null") == 0 or die "$name failed\nExit status $?\nCommand: $command"; + if ($? == -1) { + print "failed to execute: $!\n"; + } elsif ($? & 127) { + printf "child died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; + } else { + printf "$name executed successfully\n", $? >> 8; + } +} + + +