Mercurial > repos > ucsb-phylogenetics > ucsb_phylogenetics
changeset 4:5b23f3eb3f09 draft
Added sequence gap remover and Trimming Reads.
author | ucsb-phylogenetics |
---|---|
date | Sun, 24 Jun 2012 16:02:29 -0400 |
parents | 3db16e8335e1 |
children | eb55013e0f28 |
files | ucsb_phylogenetics/gap-rem/gap-rem.xml ucsb_phylogenetics/gap-rem/gap_rem_README.txt ucsb_phylogenetics/gap-rem/seqfill.pl ucsb_phylogenetics/phylobayes/pb.xml ucsb_phylogenetics/phylobayes/phylobayes_wrapper.tar.bz2 ucsb_phylogenetics/phylomatic/phylomatic.xml ucsb_phylogenetics/trimmingreads/TrimmingReads.pl ucsb_phylogenetics/trimmingreads/TrimmingReads.xml ucsb_phylogenetics/trimmingreads/TrimmingReads_README.txt |
diffstat | 9 files changed, 586 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/gap-rem/gap-rem.xml Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,29 @@ +<tool id="gap-rem" name="Sequence Gap Remover" version="1.0.1"> + <description>Removes gap in sequences</description> + <command interpreter="perl">seqfill.pl $file $question_mark $hyphen $N $usePart $pfile</command> + <inputs> + <param format="phylipnon" name="file" type="data" label="File" /> + <param type ="boolean" name="usePart" truevalue="true" falsevalue="false" label="Also use Partition File -- if unchecked, it will be ignored." /> + <param format="txt" name="pfile" type="data" label="Partition file" /> + <param type="boolean" name="question_mark" truevalue="true" falsevalue="false" label="'?' signifies gap" /> + <param type="boolean" name="hyphen" truevalue="true" falsevalue="false" label="'-' signifies gap" /> + <param type="boolean" name="N" truevalue="true" falsevalue="false" label="'N' signifies gap"/> + </inputs> + + <outputs> + <data from_work_dir="out.phy" format="phylipnon" name="out1" /> + <data from_work_dir="partOut.txt" format="txt" name="out2" /> + </outputs> + + <help> + http://labs.eemb.ucsb.edu/oakley/todd/ + + Sequence Gap Remover + + Takes an input phylip file and removes any specified gap characters that exist in the same + columns of containing sequences. + + Output will be a text file. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/gap-rem/gap_rem_README.txt Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,30 @@ +Sequence Gap Remover + http://labs.eemb.ucsb.edu/oakley/todd/ + + Sequence Gap Remover + + Takes an input phylip file and removes any specified gap characters that exist in the same + columns of containing sequences. + + Output will be a text file. + +Original Perl and Galaxy script implemented by: Roger Ngo and Todd H. Oakley, UCSB + +Sequence Gap Remover will remove any common gaps within a sequence in a phylip file. + +The package includes: + +* gap-rem.xml - the Galaxy tool +* seqfill.pl - the Perl script +* gap_rem_README.txt - the documentation file + +To install, copy the seqfill.pl and gap-rem.xml to a folder in the /galaxy-dist/tools/ +directory and add the XML file to tool_conf.xml. + +Restart the Galaxy instance with: + +./run.sh --stop-daemon + +and then + +./run.sh --reload --daemon \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/gap-rem/seqfill.pl Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,147 @@ +#!/usr/bin/perl + +# Sequence Gap Remover + +# Todd H. Oakley and Roger Ngo, UCSB + +# 2012 + +# Removes gaps in sequences from a phylip file. + +my $file = $ARGV[0]; +my $q_mark = $ARGV[1]; +my $hyphen = $ARGV[2]; +my $N = $ARGV[3]; +my $usePartFile = $ARGV[4]; +my $partFile = $ARGV[5]; + +my $out = "out.phylipnon"; # output file + +open(FILE, $file); + my @speciesNames; + my @sequenceLines; + + my @currentLineContent; + + my $i = 0; + while($currentLine = <FILE>) { + chomp($currentLine); + @currentLineContent = split(/\t/, $currentLine); + $speciesNames[$i] = $currentLineContent[0]; + $sequenceLines[$i] = $currentLineContent[1]; + $i++; + } + + my $dataInfo = $speciesNames[1]; # gets num of species and sequence length + my @numbers = split(/ /, $dataInfo); + + my $numberOfSpecies = $numbers[0]; + my $sequenceLength = $numbers[1]; + +close(FILE); + +open(OUT, '>'.$out); + my @columnData; # this will have $sequenceLength elements + for($j = 0; $j < $numberOfSpecies+2; $j++) { + for($k = 0; $k < $sequenceLength; $k++) { + $currChar = substr($sequenceLines[$j], $k, 1); + $columnData[$k] = $columnData[$k].$currChar; + } + } + + # mark locations that will be removed + my @flagMap; + for($i = 0; $i < $sequenceLength; $i++) { + $flagMap[$i] = 0; + } + my $index = 0; + foreach $el(@columnData) { + my $tot = 0; + my $q_mark_occur = 0; + my $hyphen_occur = 0; + my $N_occur = 0; + + if($q_mark eq "true") { + $q_mark_occur = ($el =~ tr/?//); + } + if($hyphen eq "true") { + $hyphen_occur = ($el =~ tr/-//); + } + if($N eq "true") { + $N_occur = ($el =~ tr/N//); + } + + $tot = $q_mark_occur + $hyphen_occur + $N_occur; + if($tot == $numberOfSpecies) { + $flagMap[$index] = 1; + } + $index++; + } + + my $newSequenceLength = $sequenceLength; + foreach $el(@flagMap) { + if($el == 1) { + $newSequenceLength--; + } + } + + print OUT $speciesNames[0]."\n"; + print OUT $numberOfSpecies." ".$newSequenceLength."\n"; + for($i = 2; $i < $numberOfSpecies+3; $i++) { + print OUT $speciesNames[$i]."\t"; + for($j = 0; $j < $sequenceLength; $j++) { + if($flagMap[$j] == 0) { + my $character = substr($sequenceLines[$i], $j, 1); + print OUT $character; + } + } + print OUT "\n"; + } + +close(OUT); + +my $partOut = "partOut.txt"; + +if($usePartFile eq "true") { + # update the partition file + open(PART, $partFile); + my @data; + my @ranges; + my @names; + $i = 0; + while($currentLine = <PART>) { + @data = split(/=/, $currentLine); + $names[$i] = $data[0]; + $ranges[$i] = $data[1]; + $i++; + } + close(PART); + + my $firstFlag = 1; + open(PARTOUT, '>'.$partOut); + $j = 0; + my $newLower; + foreach $el(@ranges) { + print PARTOUT $names[$j]." = "; + @lowerUpper = split(/-/, $el); + if($firstFlag == 1) { + $newLower = $lowerUpper[0]; + $firstFlag = 0; + } + my $currUpper = $lowerUpper[1]; + my $newUpper = $currUpper; + + + + for($i = $currLower; $i < $currUpper; $i++) { + if($flagMap[$i] == 1) { + $newUpper--; + } + } + + print PARTOUT $newLower." - ".$newUpper."\n"; + $newLower = $newUpper + 1; + $j++; + } + close(PARTOUT); +}
--- a/ucsb_phylogenetics/phylobayes/pb.xml Thu Jun 21 17:48:46 2012 -0400 +++ b/ucsb_phylogenetics/phylobayes/pb.xml Sun Jun 24 16:02:29 2012 -0400 @@ -1,5 +1,8 @@ -<tool id="phylobayes" name="Phylobayes"> +<tool id="phylobayes" name="Phylobayes" version="1.0.1"> <description>Runs phylobayes</description> + <requirements> + <requirement type="binary">Phylobayes 3.3b</requirement> + </requirements> <command interpreter="perl">pb.pl $filename $nchain $cycles $discrepancies $effectivesize $burnin $sampleInterval</command> <inputs> <param name="filename" type="data" format="txt" label="Input file (ali)" />
--- a/ucsb_phylogenetics/phylomatic/phylomatic.xml Thu Jun 21 17:48:46 2012 -0400 +++ b/ucsb_phylogenetics/phylomatic/phylomatic.xml Sun Jun 24 16:02:29 2012 -0400 @@ -1,5 +1,8 @@ -<tool id="phylomatic" name="Phylomatic"> +<tool id="phylomatic" name="Phylomatic" version="1.0.1"> <description>Run Phylomatic</description> + <requirements> + <requirement type="binary">Phylocom Phylomatic</requirement> + </requirements> <command interpreter="perl">./phylomatic.pl $input1 $input2</command> <inputs> <param name="input1" type="data" format="txt" label="Phylogenetic Tree" />
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads.pl Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,310 @@ +#! /usr/bin/perl + +use strict; +use warnings; +use Getopt::Long; +use File::Basename; +use List::Util qw(sum min max); + +my $seqFormat = "a"; # 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+; +my $subVal; + +# Parameter variables +my $file; +my $helpAsked; +my $rTrimBases = 0; +my $lTrimBases = 0; +my $qCutOff = 0; +my $lenCutOff = -1; +my $outFile = ""; +my $isQualTrimming = 1; + +GetOptions( + "i=s" => \$file, + "h|help" => \$helpAsked, + "l|leftTrimBases=i" => \$lTrimBases, + "o|outputFile=s" => \$outFile, + "r|rightTrimBases=i" => \$rTrimBases, + "q|qualCutOff=i" => \$qCutOff, + "n|lenCutOff=i" => \$lenCutOff, + ); +if(defined($helpAsked)) { + prtUsage(); + exit; +} +if(!defined($file)) { + prtError("No input files are provided"); +} + +my ($fileName, $filePath) = fileparse($file); +$outFile = $file . "_trimmed" if($outFile eq ""); + +open(I, "$file") or die "Can not open file $file\n"; +open(O, ">$outFile") or die "Can not create file $outFile\n"; +my $tmpLine = <I>; +close(I); +if($tmpLine =~ /^@/) { + print "Input read/sequence format: FASTQ\n"; + if($lTrimBases == 0 && $rTrimBases == 0 && $qCutOff == 0 && $lenCutOff == -1) { + print "Trimming parameters are not set.\nNothing to do.\nExiting...\n"; + unlink($outFile); + exit; + } + print "Checking FASTQ variant: File $file...\n"; + my $nLines = checkFastQFormat($file, 1); + if($seqFormat == 1) { + $subVal = 33; + print "Input FASTQ file format: Sanger\n"; + } + if($seqFormat == 2) { + $subVal = 64; + print "Input FASTQ file format: Solexa\n"; + } + if($seqFormat == 3) { + $subVal = 64; + print "Input FASTQ file format: Illumina 1.3+\n"; + } + if($seqFormat == 4) { + $subVal = 64; + print "Input FASTQ file format: Illumina 1.5+\n"; + } + if($seqFormat == 5) { + $subVal = 33; + print "Input FASTQ file format: Illumina 1.8+\n"; + } + if($lTrimBases != 0 || $rTrimBases != 0) { + print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end"; + $isQualTrimming = 0; + } + else { + print "Trimming based on PHRED quality score (< $qCutOff)"; + } + print " followed by length filtering (< $lenCutOff bp)\n"; + + open(I, "$file") or die "Can not open file $file\n"; + my $c = 0; + my $currId1 = ""; + my $currId2 = ""; + my $currSeq = ""; + my $currQual = ""; + + + while(my $line = <I>) { + chomp $line; + $c++; + if($c == 5) { + $c = 1; + } + if($c == 1) { + $currId1 = $line; + } + if($c == 3) { + $currId2 = $line; + } + if($isQualTrimming == 0) { + if($c == 2) { + $currSeq = trimSeq($line); + } + elsif($c == 4) { + $currQual = trimSeq($line); + print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff); + } + } + else { + if($c == 4) { + $currQual = trimSeq4Qual($line); + if(defined($currQual)) { + my $len = length $currQual; + $currSeq =~ /^(.{$len})/; + $currSeq = $1; + print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff); + } + } + elsif($c == 2) { + $currSeq = $line; + } + } + } + print "Trimmed file is generated: $outFile\n"; +} +else { + print "Input read/sequence format: FASTA\n"; + if($qCutOff != 0) { + print "Warning: Quality trimming can not be performed for FASTA files\n"; + $qCutOff = 0; + } + if($lTrimBases == 0 && $rTrimBases == 0 && $lenCutOff == -1) { + print "Trimming parameters are not set.\nNothing to do.\nExiting...\n"; + unlink($outFile); + exit; + } + print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end followed by length filtering (< $lenCutOff bp)\n"; + open(I, "$file") or die "Can not open file $file\n"; + my $prevFastaSeqId = ""; + my $fastaSeqId = ""; + my $fastaSeq = ""; + + while(my $line = <I>) { + chomp $line; + if($line =~ /^>/) { + $prevFastaSeqId = $fastaSeqId; + $fastaSeqId = $line; + if($fastaSeq ne "") { + my $outSeq = trimSeq($fastaSeq); + print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff); + } + $fastaSeq = ""; + } + else { + $fastaSeq .= $line; + } + } + if($fastaSeq ne "") { + $prevFastaSeqId = $fastaSeqId; + my $outSeq = trimSeq($fastaSeq); + print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff); + } + print "Trimmed read/sequence file is generated: $outFile\n"; +} +close(O); +close(I); + + +sub trimSeq { + my $seq = $_[0]; + $seq =~ /^.{$lTrimBases}(.+).{$rTrimBases}$/; + return $1; +} + +sub trimSeq4Qual { + my $qual = $_[0]; + my @ASCII = unpack("C*", $qual); + my $trimCount = 0; + for(my $i=@ASCII; $i>0; $i--) { + my $val = $ASCII[$i-1]; + $val -= $subVal; + if($val < $qCutOff) { + $trimCount++; + } + else { + last; + } + } + $qual =~ /^(.+).{$trimCount}$/; + return $1; +} + +sub formatSeq { + my $seq = $_[0]; + my $newSeq = ""; + my $ch = 60; + for(my $i=0; $i<length $seq; $i+=$ch) { + $newSeq .= substr($seq, $i, $ch) . "\n"; + } + chomp($newSeq); # To remove \n at the end of the whole sequence.. + return $newSeq; +} + +sub prtHelp { + print "\n$0 options:\n\n"; + print "### Input reads/sequences (FASTQ/FASTA) (Required)\n"; + print " -i <Read/Sequence file>\n"; + print " File containing reads/sequences in either FASTQ or FASTA format\n"; + print "\n"; + print "### Other options [Optional]\n"; + print " -h | -help\n"; + print " Prints this help\n"; + print "--------------------------------- Trimming Options ---------------------------------\n"; + print " -l | -leftTrimBases <Integer>\n"; + print " Number of bases to be trimmed from left end (5' end)\n"; + print " default: 0\n"; + print " -r | -rightTrimBases <Integer>\n"; + print " Number of bases to be trimmed from right end (3' end)\n"; + print " default: 0\n"; + print " -q | -qualCutOff <Integer> (Only for FASTQ files)\n"; + print " Cut-off PHRED quality score for trimming reads from right end (3' end)\n"; + print " For eg.: -q 20, will trim bases having PHRED quality score less than 20 at 3' end of the read\n"; + print " Note: Quality trimming can be performed only if -l and -r are not used\n"; + print " default: 0 (i.e. quality trimming is OFF)\n"; + print " -n | -lenCutOff <Integer>\n"; + print " Read length cut-off\n"; + print " Reads shorter than given length will be discarded\n"; + print " default: -1 (i.e. length filtering is OFF)\n"; + print "--------------------------------- Output Options ---------------------------------\n"; + print " -o | -outputFile <Output file name>\n"; + print " Output will be stored in the given file\n"; + print " default: By default, output file will be stored where the input file is\n"; + print "\n"; +} + +sub prtError { + my $msg = $_[0]; + print STDERR "+======================================================================+\n"; + printf STDERR "|%-70s|\n", " Error:"; + printf STDERR "|%-70s|\n", " $msg"; + print STDERR "+======================================================================+\n"; + prtUsage(); + exit; +} + +sub prtUsage { + print "\nUsage: perl $0 <options>\n"; + prtHelp(); +} + +sub checkFastQFormat { # Takes FASTQ file as an input and if the format is incorrect it will print error and exit, otherwise it will return the number of lines in the file. + my $file = $_[0]; + my $isVariantIdntfcntOn = $_[1]; + my $lines = 0; + open(F, "<$file") or die "Can not open file $file\n"; + my $counter = 0; + my $minVal = 1000; + my $maxVal = 0; + while(my $line = <F>) { + $lines++; + $counter++; + next if($line =~ /^\n$/); + if($counter == 1 && $line !~ /^\@/) { + prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file"); + } + if($counter == 3 && $line !~ /^\+/) { + prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file"); + } + if($counter == 4 && $lines < 1000000) { + chomp $line; + my @ASCII = unpack("C*", $line); + $minVal = min(min(@ASCII), $minVal); + $maxVal = max(max(@ASCII), $maxVal); + } + if($counter == 4) { + $counter = 0; + } + } + close(F); + my $tseqFormat = 0; + if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) { + $tseqFormat = 1; + } + elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) { + $tseqFormat = 4; # Illumina 1.5+ + } + elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) { + $tseqFormat = 3; # Illumina 1.3+ + } + elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) { + $tseqFormat = 2; # Solexa + } + elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) { + $tseqFormat = 5; # Illumina 1.8+ + } + if($isVariantIdntfcntOn) { + $seqFormat = $tseqFormat; + } + else { + if($tseqFormat != $seqFormat) { + print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n"; + } + } + return $lines; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads.xml Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,33 @@ +<tool id="trimming_reads" name="TrimmingReads"> + <description>Runs TrimmingReads</description> + <command interpreter="perl">TrimmingReads.pl -i $input -l $l_int -o out.fastq -r $r_int -q $q_int -n $len_int</command> + <inputs> + <param name="input" label="Input file" type="data" format="fastq"/> + <param name="l_int" label="Left Trim Bases" type="integer" value="0" /> + <param name="r_int" label="Right Trim Bases" type="integer" value="0" /> + <param name="q_int" label="Qual Cut Off" type="integer" value="0"/> + <param name="len_int" label="Length Cut Off" type="integer" value="0"/> + </inputs> + <outputs> + <data from_work_dir="out.fastq" format="fastq" /> + </outputs> + + <help> + + Trimming Reads is a part of the NGS QC Toolkit. + + This tool trims the reads/sequences and their quality scores (in case of FASTQ file) in two ways. + First, it trims fixed (user-specified) number of bases from 5’ and/or 3’ end of the reads and corresponding qualities from the input FASTQ file. + Second, it trims low quality bases from 3’ end of the read using user-defined threshold value of quality score. + Input to this tool is either FASTQ or FASTA format file. + Options are provided to specify the number of bases to be trimmed and the quality threshold for quality based trimming. + (http://59.163.192.90:8080/ngsqctoolkit/NGSQCToolkitv2.2.3_manual.pdf) + + http://59.163.192.90:8080/ngsqctoolkit/ + + http://www.plosone.org/article/info:doi/10.1371/journal.pone.0030619 + Patel RK, Jain M (2012). NGS QC Toolkit: A toolkit for quality control of next generation sequencing data. PLoS ONE, 7(2): e30619. + + Left Trim + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads_README.txt Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,29 @@ +Trimming Reads is a part of NGS QC Toolkit. + +Original script written by: Mukesh Jain, and Ravi Patel + +Galaxy tool wrapper written by: Roger Ngo and Todd H. Oakley, UCSB + +Required files included in this package: + +* TrimmingReads.xml - Galaxy tool file for the Perl script +* TrimmingReads_README.txt - Documentation file + +Dependencies: + +TrimmingReads.pl from the NGS QC Toolkit is required to be installed +in the Galaxy user account. + +Installation and Configuration Instructions: + +1. Copy TrimmingReads.xml to a folder in /galaxy-dist/tools/ +2. Add the XML tool to tool_conf.xml in /galaxy-dist/ +3. Add the TrimmingReads.pl to your path variable in .bashrc +4. Type: source .bashrc +5. Restart Galaxy using: + +./run.sh --stop-reload + +and then + +./run.sh --reload --daemon