# HG changeset patch # User ucsb-phylogenetics # Date 1340568149 14400 # Node ID 5b23f3eb3f0989de863a3e2e2aca48fb7f943cc7 # Parent 3db16e8335e10830435082db681126c8ca3f5d55 Added sequence gap remover and Trimming Reads. diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/gap-rem/gap-rem.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/gap-rem/gap-rem.xml Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,29 @@ + + Removes gap in sequences + seqfill.pl $file $question_mark $hyphen $N $usePart $pfile + + + + + + + + + + + + + + + + http://labs.eemb.ucsb.edu/oakley/todd/ + + Sequence Gap Remover + + Takes an input phylip file and removes any specified gap characters that exist in the same + columns of containing sequences. + + Output will be a text file. + + + diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/gap-rem/gap_rem_README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/gap-rem/gap_rem_README.txt Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,30 @@ +Sequence Gap Remover + http://labs.eemb.ucsb.edu/oakley/todd/ + + Sequence Gap Remover + + Takes an input phylip file and removes any specified gap characters that exist in the same + columns of containing sequences. + + Output will be a text file. + +Original Perl and Galaxy script implemented by: Roger Ngo and Todd H. Oakley, UCSB + +Sequence Gap Remover will remove any common gaps within a sequence in a phylip file. + +The package includes: + +* gap-rem.xml - the Galaxy tool +* seqfill.pl - the Perl script +* gap_rem_README.txt - the documentation file + +To install, copy the seqfill.pl and gap-rem.xml to a folder in the /galaxy-dist/tools/ +directory and add the XML file to tool_conf.xml. + +Restart the Galaxy instance with: + +./run.sh --stop-daemon + +and then + +./run.sh --reload --daemon \ No newline at end of file diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/gap-rem/seqfill.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/gap-rem/seqfill.pl Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,147 @@ +#!/usr/bin/perl + +# Sequence Gap Remover + +# Todd H. Oakley and Roger Ngo, UCSB + +# 2012 + +# Removes gaps in sequences from a phylip file. + +my $file = $ARGV[0]; +my $q_mark = $ARGV[1]; +my $hyphen = $ARGV[2]; +my $N = $ARGV[3]; +my $usePartFile = $ARGV[4]; +my $partFile = $ARGV[5]; + +my $out = "out.phylipnon"; # output file + +open(FILE, $file); + my @speciesNames; + my @sequenceLines; + + my @currentLineContent; + + my $i = 0; + while($currentLine = ) { + chomp($currentLine); + @currentLineContent = split(/\t/, $currentLine); + $speciesNames[$i] = $currentLineContent[0]; + $sequenceLines[$i] = $currentLineContent[1]; + $i++; + } + + my $dataInfo = $speciesNames[1]; # gets num of species and sequence length + my @numbers = split(/ /, $dataInfo); + + my $numberOfSpecies = $numbers[0]; + my $sequenceLength = $numbers[1]; + +close(FILE); + +open(OUT, '>'.$out); + my @columnData; # this will have $sequenceLength elements + for($j = 0; $j < $numberOfSpecies+2; $j++) { + for($k = 0; $k < $sequenceLength; $k++) { + $currChar = substr($sequenceLines[$j], $k, 1); + $columnData[$k] = $columnData[$k].$currChar; + } + } + + # mark locations that will be removed + my @flagMap; + for($i = 0; $i < $sequenceLength; $i++) { + $flagMap[$i] = 0; + } + my $index = 0; + foreach $el(@columnData) { + my $tot = 0; + my $q_mark_occur = 0; + my $hyphen_occur = 0; + my $N_occur = 0; + + if($q_mark eq "true") { + $q_mark_occur = ($el =~ tr/?//); + } + if($hyphen eq "true") { + $hyphen_occur = ($el =~ tr/-//); + } + if($N eq "true") { + $N_occur = ($el =~ tr/N//); + } + + $tot = $q_mark_occur + $hyphen_occur + $N_occur; + if($tot == $numberOfSpecies) { + $flagMap[$index] = 1; + } + $index++; + } + + my $newSequenceLength = $sequenceLength; + foreach $el(@flagMap) { + if($el == 1) { + $newSequenceLength--; + } + } + + print OUT $speciesNames[0]."\n"; + print OUT $numberOfSpecies." ".$newSequenceLength."\n"; + for($i = 2; $i < $numberOfSpecies+3; $i++) { + print OUT $speciesNames[$i]."\t"; + for($j = 0; $j < $sequenceLength; $j++) { + if($flagMap[$j] == 0) { + my $character = substr($sequenceLines[$i], $j, 1); + print OUT $character; + } + } + print OUT "\n"; + } + +close(OUT); + +my $partOut = "partOut.txt"; + +if($usePartFile eq "true") { + # update the partition file + open(PART, $partFile); + my @data; + my @ranges; + my @names; + $i = 0; + while($currentLine = ) { + @data = split(/=/, $currentLine); + $names[$i] = $data[0]; + $ranges[$i] = $data[1]; + $i++; + } + close(PART); + + my $firstFlag = 1; + open(PARTOUT, '>'.$partOut); + $j = 0; + my $newLower; + foreach $el(@ranges) { + print PARTOUT $names[$j]." = "; + @lowerUpper = split(/-/, $el); + if($firstFlag == 1) { + $newLower = $lowerUpper[0]; + $firstFlag = 0; + } + my $currUpper = $lowerUpper[1]; + my $newUpper = $currUpper; + + + + for($i = $currLower; $i < $currUpper; $i++) { + if($flagMap[$i] == 1) { + $newUpper--; + } + } + + print PARTOUT $newLower." - ".$newUpper."\n"; + $newLower = $newUpper + 1; + $j++; + } + close(PARTOUT); +} diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/phylobayes/pb.xml --- a/ucsb_phylogenetics/phylobayes/pb.xml Thu Jun 21 17:48:46 2012 -0400 +++ b/ucsb_phylogenetics/phylobayes/pb.xml Sun Jun 24 16:02:29 2012 -0400 @@ -1,5 +1,8 @@ - + Runs phylobayes + + Phylobayes 3.3b + pb.pl $filename $nchain $cycles $discrepancies $effectivesize $burnin $sampleInterval diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/phylobayes/phylobayes_wrapper.tar.bz2 Binary file ucsb_phylogenetics/phylobayes/phylobayes_wrapper.tar.bz2 has changed diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/phylomatic/phylomatic.xml --- a/ucsb_phylogenetics/phylomatic/phylomatic.xml Thu Jun 21 17:48:46 2012 -0400 +++ b/ucsb_phylogenetics/phylomatic/phylomatic.xml Sun Jun 24 16:02:29 2012 -0400 @@ -1,5 +1,8 @@ - + Run Phylomatic + + Phylocom Phylomatic + ./phylomatic.pl $input1 $input2 diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/trimmingreads/TrimmingReads.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads.pl Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,310 @@ +#! /usr/bin/perl + +use strict; +use warnings; +use Getopt::Long; +use File::Basename; +use List::Util qw(sum min max); + +my $seqFormat = "a"; # 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+; +my $subVal; + +# Parameter variables +my $file; +my $helpAsked; +my $rTrimBases = 0; +my $lTrimBases = 0; +my $qCutOff = 0; +my $lenCutOff = -1; +my $outFile = ""; +my $isQualTrimming = 1; + +GetOptions( + "i=s" => \$file, + "h|help" => \$helpAsked, + "l|leftTrimBases=i" => \$lTrimBases, + "o|outputFile=s" => \$outFile, + "r|rightTrimBases=i" => \$rTrimBases, + "q|qualCutOff=i" => \$qCutOff, + "n|lenCutOff=i" => \$lenCutOff, + ); +if(defined($helpAsked)) { + prtUsage(); + exit; +} +if(!defined($file)) { + prtError("No input files are provided"); +} + +my ($fileName, $filePath) = fileparse($file); +$outFile = $file . "_trimmed" if($outFile eq ""); + +open(I, "$file") or die "Can not open file $file\n"; +open(O, ">$outFile") or die "Can not create file $outFile\n"; +my $tmpLine = ; +close(I); +if($tmpLine =~ /^@/) { + print "Input read/sequence format: FASTQ\n"; + if($lTrimBases == 0 && $rTrimBases == 0 && $qCutOff == 0 && $lenCutOff == -1) { + print "Trimming parameters are not set.\nNothing to do.\nExiting...\n"; + unlink($outFile); + exit; + } + print "Checking FASTQ variant: File $file...\n"; + my $nLines = checkFastQFormat($file, 1); + if($seqFormat == 1) { + $subVal = 33; + print "Input FASTQ file format: Sanger\n"; + } + if($seqFormat == 2) { + $subVal = 64; + print "Input FASTQ file format: Solexa\n"; + } + if($seqFormat == 3) { + $subVal = 64; + print "Input FASTQ file format: Illumina 1.3+\n"; + } + if($seqFormat == 4) { + $subVal = 64; + print "Input FASTQ file format: Illumina 1.5+\n"; + } + if($seqFormat == 5) { + $subVal = 33; + print "Input FASTQ file format: Illumina 1.8+\n"; + } + if($lTrimBases != 0 || $rTrimBases != 0) { + print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end"; + $isQualTrimming = 0; + } + else { + print "Trimming based on PHRED quality score (< $qCutOff)"; + } + print " followed by length filtering (< $lenCutOff bp)\n"; + + open(I, "$file") or die "Can not open file $file\n"; + my $c = 0; + my $currId1 = ""; + my $currId2 = ""; + my $currSeq = ""; + my $currQual = ""; + + + while(my $line = ) { + chomp $line; + $c++; + if($c == 5) { + $c = 1; + } + if($c == 1) { + $currId1 = $line; + } + if($c == 3) { + $currId2 = $line; + } + if($isQualTrimming == 0) { + if($c == 2) { + $currSeq = trimSeq($line); + } + elsif($c == 4) { + $currQual = trimSeq($line); + print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff); + } + } + else { + if($c == 4) { + $currQual = trimSeq4Qual($line); + if(defined($currQual)) { + my $len = length $currQual; + $currSeq =~ /^(.{$len})/; + $currSeq = $1; + print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff); + } + } + elsif($c == 2) { + $currSeq = $line; + } + } + } + print "Trimmed file is generated: $outFile\n"; +} +else { + print "Input read/sequence format: FASTA\n"; + if($qCutOff != 0) { + print "Warning: Quality trimming can not be performed for FASTA files\n"; + $qCutOff = 0; + } + if($lTrimBases == 0 && $rTrimBases == 0 && $lenCutOff == -1) { + print "Trimming parameters are not set.\nNothing to do.\nExiting...\n"; + unlink($outFile); + exit; + } + print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end followed by length filtering (< $lenCutOff bp)\n"; + open(I, "$file") or die "Can not open file $file\n"; + my $prevFastaSeqId = ""; + my $fastaSeqId = ""; + my $fastaSeq = ""; + + while(my $line = ) { + chomp $line; + if($line =~ /^>/) { + $prevFastaSeqId = $fastaSeqId; + $fastaSeqId = $line; + if($fastaSeq ne "") { + my $outSeq = trimSeq($fastaSeq); + print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff); + } + $fastaSeq = ""; + } + else { + $fastaSeq .= $line; + } + } + if($fastaSeq ne "") { + $prevFastaSeqId = $fastaSeqId; + my $outSeq = trimSeq($fastaSeq); + print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff); + } + print "Trimmed read/sequence file is generated: $outFile\n"; +} +close(O); +close(I); + + +sub trimSeq { + my $seq = $_[0]; + $seq =~ /^.{$lTrimBases}(.+).{$rTrimBases}$/; + return $1; +} + +sub trimSeq4Qual { + my $qual = $_[0]; + my @ASCII = unpack("C*", $qual); + my $trimCount = 0; + for(my $i=@ASCII; $i>0; $i--) { + my $val = $ASCII[$i-1]; + $val -= $subVal; + if($val < $qCutOff) { + $trimCount++; + } + else { + last; + } + } + $qual =~ /^(.+).{$trimCount}$/; + return $1; +} + +sub formatSeq { + my $seq = $_[0]; + my $newSeq = ""; + my $ch = 60; + for(my $i=0; $i\n"; + print " File containing reads/sequences in either FASTQ or FASTA format\n"; + print "\n"; + print "### Other options [Optional]\n"; + print " -h | -help\n"; + print " Prints this help\n"; + print "--------------------------------- Trimming Options ---------------------------------\n"; + print " -l | -leftTrimBases \n"; + print " Number of bases to be trimmed from left end (5' end)\n"; + print " default: 0\n"; + print " -r | -rightTrimBases \n"; + print " Number of bases to be trimmed from right end (3' end)\n"; + print " default: 0\n"; + print " -q | -qualCutOff (Only for FASTQ files)\n"; + print " Cut-off PHRED quality score for trimming reads from right end (3' end)\n"; + print " For eg.: -q 20, will trim bases having PHRED quality score less than 20 at 3' end of the read\n"; + print " Note: Quality trimming can be performed only if -l and -r are not used\n"; + print " default: 0 (i.e. quality trimming is OFF)\n"; + print " -n | -lenCutOff \n"; + print " Read length cut-off\n"; + print " Reads shorter than given length will be discarded\n"; + print " default: -1 (i.e. length filtering is OFF)\n"; + print "--------------------------------- Output Options ---------------------------------\n"; + print " -o | -outputFile \n"; + print " Output will be stored in the given file\n"; + print " default: By default, output file will be stored where the input file is\n"; + print "\n"; +} + +sub prtError { + my $msg = $_[0]; + print STDERR "+======================================================================+\n"; + printf STDERR "|%-70s|\n", " Error:"; + printf STDERR "|%-70s|\n", " $msg"; + print STDERR "+======================================================================+\n"; + prtUsage(); + exit; +} + +sub prtUsage { + print "\nUsage: perl $0 \n"; + prtHelp(); +} + +sub checkFastQFormat { # Takes FASTQ file as an input and if the format is incorrect it will print error and exit, otherwise it will return the number of lines in the file. + my $file = $_[0]; + my $isVariantIdntfcntOn = $_[1]; + my $lines = 0; + open(F, "<$file") or die "Can not open file $file\n"; + my $counter = 0; + my $minVal = 1000; + my $maxVal = 0; + while(my $line = ) { + $lines++; + $counter++; + next if($line =~ /^\n$/); + if($counter == 1 && $line !~ /^\@/) { + prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file"); + } + if($counter == 3 && $line !~ /^\+/) { + prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file"); + } + if($counter == 4 && $lines < 1000000) { + chomp $line; + my @ASCII = unpack("C*", $line); + $minVal = min(min(@ASCII), $minVal); + $maxVal = max(max(@ASCII), $maxVal); + } + if($counter == 4) { + $counter = 0; + } + } + close(F); + my $tseqFormat = 0; + if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) { + $tseqFormat = 1; + } + elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) { + $tseqFormat = 4; # Illumina 1.5+ + } + elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) { + $tseqFormat = 3; # Illumina 1.3+ + } + elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) { + $tseqFormat = 2; # Solexa + } + elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) { + $tseqFormat = 5; # Illumina 1.8+ + } + if($isVariantIdntfcntOn) { + $seqFormat = $tseqFormat; + } + else { + if($tseqFormat != $seqFormat) { + print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n"; + } + } + return $lines; +} + diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/trimmingreads/TrimmingReads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads.xml Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,33 @@ + + Runs TrimmingReads + TrimmingReads.pl -i $input -l $l_int -o out.fastq -r $r_int -q $q_int -n $len_int + + + + + + + + + + + + + + Trimming Reads is a part of the NGS QC Toolkit. + + This tool trims the reads/sequences and their quality scores (in case of FASTQ file) in two ways. + First, it trims fixed (user-specified) number of bases from 5� and/or 3� end of the reads and corresponding qualities from the input FASTQ file. + Second, it trims low quality bases from 3� end of the read using user-defined threshold value of quality score. + Input to this tool is either FASTQ or FASTA format file. + Options are provided to specify the number of bases to be trimmed and the quality threshold for quality based trimming. + (http://59.163.192.90:8080/ngsqctoolkit/NGSQCToolkitv2.2.3_manual.pdf) + + http://59.163.192.90:8080/ngsqctoolkit/ + + http://www.plosone.org/article/info:doi/10.1371/journal.pone.0030619 + Patel RK, Jain M (2012). NGS QC Toolkit: A toolkit for quality control of next generation sequencing data. PLoS ONE, 7(2): e30619. + + Left Trim + + diff -r 3db16e8335e1 -r 5b23f3eb3f09 ucsb_phylogenetics/trimmingreads/TrimmingReads_README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads_README.txt Sun Jun 24 16:02:29 2012 -0400 @@ -0,0 +1,29 @@ +Trimming Reads is a part of NGS QC Toolkit. + +Original script written by: Mukesh Jain, and Ravi Patel + +Galaxy tool wrapper written by: Roger Ngo and Todd H. Oakley, UCSB + +Required files included in this package: + +* TrimmingReads.xml - Galaxy tool file for the Perl script +* TrimmingReads_README.txt - Documentation file + +Dependencies: + +TrimmingReads.pl from the NGS QC Toolkit is required to be installed +in the Galaxy user account. + +Installation and Configuration Instructions: + +1. Copy TrimmingReads.xml to a folder in /galaxy-dist/tools/ +2. Add the XML tool to tool_conf.xml in /galaxy-dist/ +3. Add the TrimmingReads.pl to your path variable in .bashrc +4. Type: source .bashrc +5. Restart Galaxy using: + +./run.sh --stop-reload + +and then + +./run.sh --reload --daemon