Mercurial > repos > ucsb-phylogenetics > ucsb_phylogenetics

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/gap-rem/gap-rem.xml	Sun Jun 24 16:02:29 2012 -0400
@@ -0,0 +1,29 @@
+<tool id="gap-rem" name="Sequence Gap Remover" version="1.0.1">
+	<description>Removes gap in sequences</description>
+	<command interpreter="perl">seqfill.pl $file $question_mark $hyphen $N $usePart $pfile</command>
+	<inputs>
+		<param format="phylipnon" name="file" type="data" label="File" />
+		<param type ="boolean" name="usePart" truevalue="true" falsevalue="false" label="Also use Partition File -- if unchecked, it will be ignored." />
+		<param format="txt" name="pfile" type="data" label="Partition file" />
+		<param type="boolean" name="question_mark" truevalue="true" falsevalue="false" label="'?' signifies gap" />
+		<param type="boolean" name="hyphen" truevalue="true" falsevalue="false" label="'-' signifies gap" />
+		<param type="boolean" name="N" truevalue="true" falsevalue="false" label="'N' signifies gap"/>
+	</inputs>
+
+	<outputs>
+		<data from_work_dir="out.phy" format="phylipnon" name="out1" />
+		<data from_work_dir="partOut.txt" format="txt" name="out2" />
+	</outputs>
+
+	<help>
+	http://labs.eemb.ucsb.edu/oakley/todd/
+
+	Sequence Gap Remover
+
+	Takes an input phylip file and removes any specified gap characters that exist in the same
+	columns of containing sequences.
+
+	Output will be a text file.
+	</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/gap-rem/gap_rem_README.txt	Sun Jun 24 16:02:29 2012 -0400
@@ -0,0 +1,30 @@
+Sequence Gap Remover
+	http://labs.eemb.ucsb.edu/oakley/todd/
+
+	Sequence Gap Remover
+
+	Takes an input phylip file and removes any specified gap characters that exist in the same
+	columns of containing sequences.
+
+	Output will be a text file.
+
+Original Perl and Galaxy script implemented by: Roger Ngo and Todd H. Oakley, UCSB
+
+Sequence Gap Remover will remove any common gaps within a sequence in a phylip file.
+
+The package includes:
+
+* gap-rem.xml - the Galaxy tool
+* seqfill.pl - the Perl script
+* gap_rem_README.txt - the documentation file
+
+To install, copy the seqfill.pl and gap-rem.xml to a folder in the /galaxy-dist/tools/
+directory and add the XML file to tool_conf.xml.
+
+Restart the Galaxy instance with:
+
+./run.sh --stop-daemon
+
+and then
+
+./run.sh --reload --daemon
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/gap-rem/seqfill.pl	Sun Jun 24 16:02:29 2012 -0400
@@ -0,0 +1,147 @@
+#!/usr/bin/perl
+
+# Sequence Gap Remover
+
+# Todd H. Oakley and Roger Ngo, UCSB
+
+# 2012
+
+# Removes gaps in sequences from a phylip file.
+
+my $file = $ARGV[0];
+my $q_mark = $ARGV[1];
+my $hyphen = $ARGV[2];
+my $N = $ARGV[3];
+my $usePartFile = $ARGV[4];
+my $partFile = $ARGV[5];
+
+my $out = "out.phylipnon";	# output file
+
+open(FILE, $file);
+	my @speciesNames;
+	my @sequenceLines;
+
+	my @currentLineContent;
+
+	my $i = 0;
+	while($currentLine = <FILE>) {
+		chomp($currentLine);
+		@currentLineContent = split(/\t/, $currentLine);
+		$speciesNames[$i] = $currentLineContent[0];
+		$sequenceLines[$i] = $currentLineContent[1];
+		$i++;
+	}
+
+	my $dataInfo = $speciesNames[1];	# gets num of species and sequence length
+	my @numbers = split(/ /, $dataInfo);
+
+	my $numberOfSpecies = $numbers[0];
+	my $sequenceLength = $numbers[1];
+
+close(FILE);
+
+open(OUT, '>'.$out);
+	my @columnData;		# this will have $sequenceLength elements
+	for($j = 0; $j < $numberOfSpecies+2; $j++) {
+		for($k = 0; $k < $sequenceLength; $k++) {
+			$currChar = substr($sequenceLines[$j], $k, 1);
+			$columnData[$k] = $columnData[$k].$currChar;
+		}
+	}
+
+	# mark locations that will be removed
+	my @flagMap;
+	for($i = 0; $i < $sequenceLength; $i++) {
+		$flagMap[$i] = 0;
+	}
+	my $index = 0;
+	foreach $el(@columnData) {
+		my $tot = 0;
+		my $q_mark_occur = 0;
+		my $hyphen_occur = 0;
+		my $N_occur = 0;
+
+		if($q_mark eq "true") {
+			$q_mark_occur = ($el =~ tr/?//);
+		}
+		if($hyphen eq "true") {
+			$hyphen_occur = ($el =~ tr/-//);
+		}
+		if($N eq "true") {
+			$N_occur = ($el =~ tr/N//);
+		}
+
+		$tot = $q_mark_occur + $hyphen_occur + $N_occur;
+		if($tot == $numberOfSpecies) {
+			$flagMap[$index] = 1;
+		}
+		$index++;
+	}
+
+	my $newSequenceLength = $sequenceLength;
+	foreach $el(@flagMap) {
+		if($el == 1) {
+			$newSequenceLength--;
+		}
+	}
+
+	print OUT $speciesNames[0]."\n";
+	print OUT $numberOfSpecies." ".$newSequenceLength."\n";
+	for($i = 2; $i < $numberOfSpecies+3; $i++) {
+		print OUT $speciesNames[$i]."\t";
+		for($j = 0; $j < $sequenceLength; $j++) {
+			if($flagMap[$j] == 0) {
+				my $character = substr($sequenceLines[$i], $j, 1);
+				print OUT $character;
+			}
+		}
+		print OUT "\n";
+	}
+
+close(OUT);
+
+my $partOut = "partOut.txt";
+
+if($usePartFile eq "true") {
+	# update the partition file
+	open(PART, $partFile);
+		my @data;
+		my @ranges;
+		my @names;
+		$i = 0;
+		while($currentLine = <PART>) {
+			@data = split(/=/, $currentLine);
+			$names[$i] = $data[0];
+			$ranges[$i] = $data[1];
+			$i++;
+		}
+	close(PART);
+
+	my $firstFlag = 1;
+	open(PARTOUT, '>'.$partOut);
+		$j = 0;
+		my $newLower;
+		foreach $el(@ranges) {
+			print PARTOUT $names[$j]." = ";
+			@lowerUpper = split(/-/, $el);
+			if($firstFlag == 1) {
+				$newLower = $lowerUpper[0];
+				$firstFlag = 0;
+			}
+			my $currUpper = $lowerUpper[1];
+			my $newUpper = $currUpper;
+
+
+
+			for($i = $currLower; $i < $currUpper; $i++) {
+				if($flagMap[$i] == 1) {
+					$newUpper--;
+				}
+			}
+
+			print PARTOUT $newLower." - ".$newUpper."\n";
+			$newLower = $newUpper + 1;
+			$j++;
+		}
+	close(PARTOUT);
+}
--- a/ucsb_phylogenetics/phylobayes/pb.xml	Thu Jun 21 17:48:46 2012 -0400
+++ b/ucsb_phylogenetics/phylobayes/pb.xml	Sun Jun 24 16:02:29 2012 -0400
@@ -1,5 +1,8 @@
-<tool id="phylobayes" name="Phylobayes">
+<tool id="phylobayes" name="Phylobayes" version="1.0.1">
 	<description>Runs phylobayes</description>
+	<requirements>
+		<requirement type="binary">Phylobayes 3.3b</requirement>
+	</requirements>
 	<command interpreter="perl">pb.pl $filename $nchain $cycles $discrepancies $effectivesize $burnin $sampleInterval</command>
 	<inputs>
 		<param name="filename" type="data" format="txt" label="Input file (ali)" />
Binary file ucsb_phylogenetics/phylobayes/phylobayes_wrapper.tar.bz2 has changed
--- a/ucsb_phylogenetics/phylomatic/phylomatic.xml	Thu Jun 21 17:48:46 2012 -0400
+++ b/ucsb_phylogenetics/phylomatic/phylomatic.xml	Sun Jun 24 16:02:29 2012 -0400
@@ -1,5 +1,8 @@
-<tool id="phylomatic" name="Phylomatic">
+<tool id="phylomatic" name="Phylomatic" version="1.0.1">
 	<description>Run Phylomatic</description>
+	<requirements>
+		<requirement type="binary">Phylocom Phylomatic</requirement>
+	</requirements>
 	<command interpreter="perl">./phylomatic.pl $input1 $input2</command>
 	<inputs>
 		<param name="input1" type="data" format="txt" label="Phylogenetic Tree" />
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads.pl	Sun Jun 24 16:02:29 2012 -0400
@@ -0,0 +1,310 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+use List::Util qw(sum min max);
+
+my $seqFormat = "a";		# 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+;
+my $subVal;
+
+# Parameter variables
+my $file;
+my $helpAsked;
+my $rTrimBases = 0;
+my $lTrimBases = 0;
+my $qCutOff = 0;
+my $lenCutOff = -1;
+my $outFile = "";
+my $isQualTrimming = 1;
+
+GetOptions(
+			"i=s" => \$file,
+			"h|help" => \$helpAsked,
+			"l|leftTrimBases=i" => \$lTrimBases,
+			"o|outputFile=s" => \$outFile,
+			"r|rightTrimBases=i" => \$rTrimBases,
+			"q|qualCutOff=i" => \$qCutOff,
+			"n|lenCutOff=i" => \$lenCutOff,
+		  );
+if(defined($helpAsked)) {
+	prtUsage();
+	exit;
+}
+if(!defined($file)) {
+	prtError("No input files are provided");
+}
+
+my ($fileName, $filePath) = fileparse($file);
+$outFile = $file . "_trimmed" if($outFile eq "");
+
+open(I, "$file") or die "Can not open file $file\n";
+open(O, ">$outFile") or die "Can not create file $outFile\n";
+my $tmpLine = <I>;
+close(I);
+if($tmpLine =~ /^@/) {
+	print "Input read/sequence format: FASTQ\n";
+	if($lTrimBases == 0 && $rTrimBases == 0 && $qCutOff == 0 && $lenCutOff == -1) {
+		print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
+		unlink($outFile);
+		exit;
+	}
+	print "Checking FASTQ variant: File $file...\n";
+	my $nLines = checkFastQFormat($file, 1);
+	if($seqFormat == 1) {
+		$subVal = 33;
+		print "Input FASTQ file format: Sanger\n";
+	}
+	if($seqFormat == 2) {
+		$subVal = 64;
+		print "Input FASTQ file format: Solexa\n";
+	}
+	if($seqFormat == 3) {
+		$subVal = 64;
+		print "Input FASTQ file format: Illumina 1.3+\n";
+	}
+	if($seqFormat == 4) {
+		$subVal = 64;
+		print "Input FASTQ file format: Illumina 1.5+\n";
+	}
+	if($seqFormat == 5) {
+		$subVal = 33;
+		print "Input FASTQ file format: Illumina 1.8+\n";
+	}
+	if($lTrimBases != 0 || $rTrimBases != 0) {
+		print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end";
+		$isQualTrimming = 0;
+	}
+	else {
+		print "Trimming based on PHRED quality score (< $qCutOff)";
+	}
+	print " followed by length filtering (< $lenCutOff bp)\n";
+
+	open(I, "$file") or die "Can not open file $file\n";
+	my $c = 0;
+	my $currId1 = "";
+	my $currId2 = "";
+	my $currSeq = "";
+	my $currQual = "";
+
+
+	while(my $line = <I>) {
+		chomp $line;
+        $c++;
+        if($c == 5) {
+                $c = 1;
+        }
+        if($c == 1) {
+        	$currId1 = $line;
+        }
+        if($c == 3) {
+        	$currId2 = $line;
+        }
+        if($isQualTrimming == 0) {
+	        if($c == 2) {
+	        	$currSeq = trimSeq($line);
+	        }
+	        elsif($c == 4) {
+	        	$currQual = trimSeq($line);
+	        	print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff);
+	        }
+        }
+        else {
+	        if($c == 4) {
+				$currQual = trimSeq4Qual($line);
+				if(defined($currQual)) {
+					my $len = length $currQual;
+					$currSeq =~ /^(.{$len})/;
+					$currSeq = $1;
+		        	print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff);
+				}
+	        }
+	        elsif($c == 2) {
+	        	$currSeq = $line;
+	        }
+        }
+	}
+	print "Trimmed file is generated: $outFile\n";
+}
+else {
+	print "Input read/sequence format: FASTA\n";
+	if($qCutOff != 0) {
+		print "Warning: Quality trimming can not be performed for FASTA files\n";
+		$qCutOff = 0;
+	}
+	if($lTrimBases == 0 && $rTrimBases == 0 && $lenCutOff == -1) {
+		print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
+		unlink($outFile);
+		exit;
+	}
+	print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end followed by length filtering (< $lenCutOff bp)\n";
+	open(I, "$file") or die "Can not open file $file\n";
+	my $prevFastaSeqId = "";
+	my $fastaSeqId = "";
+	my $fastaSeq = "";
+
+	while(my $line = <I>) {
+		chomp $line;
+		if($line =~ /^>/) {
+			$prevFastaSeqId = $fastaSeqId;
+			$fastaSeqId = $line;
+			if($fastaSeq ne "") {
+				my $outSeq = trimSeq($fastaSeq);
+				print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff);
+			}
+			$fastaSeq = "";
+		}
+		else {
+			$fastaSeq .= $line;
+		}
+	}
+	if($fastaSeq ne "") {
+		$prevFastaSeqId = $fastaSeqId;
+		my $outSeq = trimSeq($fastaSeq);
+		print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff);
+	}
+	print "Trimmed read/sequence file is generated: $outFile\n";
+}
+close(O);
+close(I);
+
+
+sub trimSeq {
+	my $seq = $_[0];
+	$seq =~ /^.{$lTrimBases}(.+).{$rTrimBases}$/;
+	return $1;
+}
+
+sub trimSeq4Qual {
+	my $qual = $_[0];
+	my @ASCII = unpack("C*", $qual);
+	my $trimCount = 0;
+	for(my $i=@ASCII; $i>0; $i--) {
+		my $val = $ASCII[$i-1];
+		$val -= $subVal;
+		if($val < $qCutOff) {
+			$trimCount++;
+		}
+		else {
+			last;
+		}
+	}
+	$qual =~ /^(.+).{$trimCount}$/;
+	return $1;
+}
+
+sub formatSeq {
+	my $seq = $_[0];
+    my $newSeq = "";
+    my $ch = 60;
+    for(my $i=0; $i<length $seq; $i+=$ch) {
+        $newSeq .= substr($seq, $i, $ch) . "\n";
+    }
+    chomp($newSeq);         # To remove \n at the end of the whole sequence..
+    return $newSeq;
+}
+
+sub prtHelp {
+	print "\n$0 options:\n\n";
+	print "### Input reads/sequences (FASTQ/FASTA) (Required)\n";
+	print "  -i <Read/Sequence file>\n";
+	print "    File containing reads/sequences in either FASTQ or FASTA format\n";
+	print "\n";
+	print "### Other options [Optional]\n";
+	print "  -h | -help\n";
+	print "    Prints this help\n";
+	print "--------------------------------- Trimming Options ---------------------------------\n";
+	print "  -l | -leftTrimBases <Integer>\n";
+	print "    Number of bases to be trimmed from left end (5' end)\n";
+	print "    default: 0\n";
+	print "  -r | -rightTrimBases <Integer>\n";
+	print "    Number of bases to be trimmed from right end (3' end)\n";
+	print "    default: 0\n";
+	print "  -q | -qualCutOff <Integer> (Only for FASTQ files)\n";
+	print "    Cut-off PHRED quality score for trimming reads from right end (3' end)\n";
+	print "      For eg.: -q 20, will trim bases having PHRED quality score less than 20 at 3' end of the read\n";
+	print "      Note: Quality trimming can be performed only if -l and -r are not used\n";
+	print "    default: 0 (i.e. quality trimming is OFF)\n";
+	print "  -n | -lenCutOff <Integer>\n";
+	print "    Read length cut-off\n";
+	print "    Reads shorter than given length will be discarded\n";
+	print "    default: -1 (i.e. length filtering is OFF)\n";
+	print "--------------------------------- Output Options ---------------------------------\n";
+	print "  -o | -outputFile <Output file name>\n";
+	print "    Output will be stored in the given file\n";
+	print "    default: By default, output file will be stored where the input file is\n";
+	print "\n";
+}
+
+sub prtError {
+	my $msg = $_[0];
+	print STDERR "+======================================================================+\n";
+	printf STDERR "|%-70s|\n", "  Error:";
+	printf STDERR "|%-70s|\n", "       $msg";
+	print STDERR "+======================================================================+\n";
+	prtUsage();
+	exit;
+}
+
+sub prtUsage {
+	print "\nUsage: perl $0 <options>\n";
+	prtHelp();
+}
+
+sub checkFastQFormat {				# Takes FASTQ file as an input and if the format is incorrect it will print error and exit, otherwise it will return the number of lines in the file.
+	my $file = $_[0];
+	my $isVariantIdntfcntOn = $_[1];
+	my $lines = 0;
+	open(F, "<$file") or die "Can not open file $file\n";
+	my $counter = 0;
+	my $minVal = 1000;
+	my $maxVal = 0;
+	while(my $line = <F>) {
+		$lines++;
+		$counter++;
+		next if($line =~ /^\n$/);
+		if($counter == 1 && $line !~ /^\@/) {
+			prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+		}
+		if($counter == 3 && $line !~ /^\+/) {
+			prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+		}
+		if($counter == 4 && $lines < 1000000) {
+			chomp $line;
+			my @ASCII = unpack("C*", $line);
+			$minVal = min(min(@ASCII), $minVal);
+			$maxVal = max(max(@ASCII), $maxVal);
+		}
+		if($counter == 4) {
+			$counter = 0;
+		}
+	}
+	close(F);
+	my $tseqFormat = 0;
+	if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) {
+		$tseqFormat = 1;
+	}
+	elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) {
+		$tseqFormat = 4;			# Illumina 1.5+
+	}
+	elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) {
+		$tseqFormat = 3;			# Illumina 1.3+
+	}
+	elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) {
+		$tseqFormat = 2;			# Solexa
+	}
+	elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) {
+		$tseqFormat = 5;			# Illumina 1.8+
+	}
+	if($isVariantIdntfcntOn) {
+		$seqFormat = $tseqFormat;
+	}
+	else {
+		if($tseqFormat != $seqFormat) {
+			print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n";
+		}
+	}
+	return $lines;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads.xml	Sun Jun 24 16:02:29 2012 -0400
@@ -0,0 +1,33 @@
+<tool id="trimming_reads" name="TrimmingReads">
+	<description>Runs TrimmingReads</description>
+	<command interpreter="perl">TrimmingReads.pl -i $input -l $l_int -o out.fastq -r $r_int -q $q_int -n $len_int</command>
+	<inputs>
+		<param name="input" label="Input file" type="data" format="fastq"/>
+		<param name="l_int" label="Left Trim Bases" type="integer" value="0" />
+		<param name="r_int" label="Right Trim Bases" type="integer" value="0" />
+		<param name="q_int" label="Qual Cut Off" type="integer" value="0"/>
+		<param name="len_int" label="Length Cut Off" type="integer" value="0"/>
+	</inputs>
+	<outputs>
+		<data from_work_dir="out.fastq" format="fastq" />
+	</outputs>
+
+	<help>
+
+		Trimming Reads is a part of the NGS QC Toolkit.
+
+		This tool trims the reads/sequences and their quality scores (in case of FASTQ file) in two ways.
+		First, it trims fixed (user-specified) number of bases from 5� and/or 3� end of the reads and corresponding qualities from the input FASTQ file.
+		Second, it trims low quality bases from 3� end of the read using user-defined threshold value of quality score.
+		Input to this tool is either FASTQ or FASTA format file.
+		Options are provided to specify the number of bases to be trimmed and the quality threshold for quality based trimming.
+		(http://59.163.192.90:8080/ngsqctoolkit/NGSQCToolkitv2.2.3_manual.pdf)
+
+		http://59.163.192.90:8080/ngsqctoolkit/
+
+		http://www.plosone.org/article/info:doi/10.1371/journal.pone.0030619
+		Patel RK, Jain M (2012). NGS QC Toolkit: A toolkit for quality control of next generation sequencing data. PLoS ONE, 7(2): e30619.
+
+		Left Trim
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/trimmingreads/TrimmingReads_README.txt	Sun Jun 24 16:02:29 2012 -0400
@@ -0,0 +1,29 @@
+Trimming Reads is a part of NGS QC Toolkit.
+
+Original script written by: Mukesh Jain, and Ravi Patel
+
+Galaxy tool wrapper written by: Roger Ngo and Todd H. Oakley, UCSB
+
+Required files included in this package:
+
+* TrimmingReads.xml - Galaxy tool file for the Perl script
+* TrimmingReads_README.txt - Documentation file
+
+Dependencies:
+
+TrimmingReads.pl from the NGS QC Toolkit is required to be installed
+in the Galaxy user account.
+
+Installation and Configuration Instructions:
+
+1. Copy TrimmingReads.xml to a folder in /galaxy-dist/tools/
+2. Add the XML tool to tool_conf.xml in /galaxy-dist/
+3. Add the TrimmingReads.pl to your path variable in .bashrc
+4. Type: source .bashrc
+5. Restart Galaxy using:
+
+./run.sh --stop-reload
+
+and then
+
+./run.sh --reload --daemon