# HG changeset patch
# User ucsb-phylogenetics
# Date 1347132814 14400
# Node ID 798d8401d42064fb7f6d5a62e17ae4cf7ed64c81
# Parent c83d7e34cc88744518102270e61f1b6653165812
Uploaded
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/NJst/NJst.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/NJst/NJst.sh Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,4 @@
+#First call perl script which reads trees and writes
+/home/galaxy/galaxy-dist/tools/Rtools/makeNJst.pl $1 $2 > Rnjst.R 2>log.txt
+
+R --vanilla < Rnjst.R 2>log.txt
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/NJst/NJst.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/NJst/NJst.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,19 @@
+
+ Estimate species tree with NJst from table of tree names and newick trees
+ NJst.sh $input $output
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/NJst/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/NJst/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Produces species tree from input of multiple gene trees
+
+(Liu, Yu 2011)
+
+BioPerl is required to be installed.
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/NJst/makeNJst.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/NJst/makeNJst.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,87 @@
+#!/usr/bin/perl
+
+#This script generates an R script to call NJst
+#input is a tab-delimited table: treename<TAB>newick tree
+use strict;
+use Bio::TreeIO;
+
+my $filename = $ARGV[0];   # tab-delimited table: tree name, newick tree
+my $outfile = $ARGV[1];    # file name the generated R script writes its result to
+open(my $treetable, '<', $filename) or die "Couldn't open input file $filename: $!\n";
+
+#Collect the newick string (second column) of every row of the table
+my @genetrees;
+while (my $line = <$treetable>) {
+    chomp $line;
+    my @splitline = split(/\t/, $line);
+    #skip blank rows and rows without a tree column
+    next unless defined $splitline[1] && length $splitline[1];
+    push(@genetrees, $splitline[1]);
+}
+close $treetable;
+
+#tree2spList() needs one tree to read the taxon names from
+die "No trees found in $filename\n" unless @genetrees;
+
+#All output goes to STDOUT as R code
+print "require(phybase);\n";
+#genetrees vector: one single-quoted newick string per input row
+print "genetrees<-c(" . join(", ", map { "'$_'" } @genetrees) . ")\n";
+
+#Taxon names are read from the last tree only
+#NOTE(review): presumably every gene tree holds the same taxa - confirm
+print "taxaname<-c(";
+my $spnum = tree2spList($genetrees[-1]); #prints the names, returns their count
+print ")\nspname<-taxaname\n";
+#square matrix of zeros, one row and column per name, with 1 on the diagonal
+print "species.structure<-matrix(0,$spnum,$spnum)\n";
+print "diag(species.structure)<-1\n";
+print "\n";
+print "result<-NJst(genetrees,taxaname,spname,species.structure)\n";
+print "write(result, file='$outfile')\n";
+
+
+
+
+
+#This script requires phybase R package
+#NJst is a function used as follows
+# genetrees<-c("(A:0.004,(B:0.003,(C:0.002,(D:0.001,E:0.001)
+# :0.001):0.001):0.001);","(A:0.004,(B:0.003,(E:0.002,(D:0.001,C:0.001):0.001):0.001):0.001);","(A:0.004,(B:0.003,(C:0.002,(D:0.001,E:0.001):0.001):0.001):0.001);")
+# taxaname<-c("A","B","C","D","E")
+# spname<-taxaname
+# species.structure<-matrix(0, 5, 5)
+# diag(species.structure)<-1
+#
+# NJst(genetrees,taxaname, spname, species.structure)
+
+
+
+sub tree2spList {
+    #Prints the leaf (taxon) names of one newick tree on STDOUT as a comma
+    #separated list of single-quoted strings and returns the number of names.
+    #The argument is the newick tree itself (a string), not a file name.
+    my $treestring = shift;
+    die "tree2spList: no tree given\n" unless defined $treestring;
+
+    #Open the string as an in-memory file so that Bio::TreeIO can read it
+    open(my $stringfh, "<", \$treestring)
+        or die "tree2spList: can't read tree string: $!\n";
+
+    my $input = Bio::TreeIO->new(-format => 'newick', -fh => $stringfh);
+    my $tree = $input->next_tree;
+    die "tree2spList: could not parse tree '$treestring'\n" unless defined $tree;
+
+    my @names;
+    foreach my $leaf ($tree->get_leaf_nodes) {
+        my $treespecies = $leaf->id;
+        $treespecies =~ s/^\s+|\s+$//g; #Trim leading and trailing whitespace
+        push(@names, "'$treespecies'");
+    }
+    close $stringfh;
+
+    #Joining the quoted names puts a comma between them but none at the ends
+    print join(",", @names);
+
+    return scalar @names;
+} #end of tree2spList subroutine
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/PDPairs/PD.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/PDPairs/PD.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,76 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+use Bio::TreeIO;
+use Bio::Tree::Tree;
+
+##Need to add error checking
+ #1. check if input is tree file and species pairs table
+ #2. check for misspelling of species in species pairs - report when sp not found
+ #3. Throw ERROR back into Galaxy for users to diagnose problem
+##Want to add
+ #1. Option for dividing in half - for divergence times
+ #2. Option to switch orientation of table? Rows be dif trees cols be diff pairs ??
+
+###this script will find the phylogenetic distance between two species
+#input is a tree, output filename, and table with pairwise distances
+#usage:
+#PD.pl pairs_table tree_file output_file yes|no
+# parse in newick/new hampshire format
+my @species1;
+my @species2;
+
+#Fourth argument: 'yes' halves every distance (divergence time), 'no' leaves it as is
+my $half=$ARGV[3];
+my $divtimebool;
+if(defined $half && $half eq 'yes'){
+    $divtimebool=1;
+}elsif(defined $half && $half eq 'no'){
+    $divtimebool=0;
+}else{
+    die "Argument must contain yes or no for divergence times\n";
+}
+my $outfile = $ARGV[2];
+open(OUT, '>', $outfile) or die("Couldn't open output file $ARGV[2]\n");
+
+#Read the species pairs: one pair per line, the two names separated by a tab
+my $pairsfile = $ARGV[0];
+open(PAIRS, '<', $pairsfile) or die("Couldn't open input file $ARGV[0]\n");
+while (<PAIRS>) {
+    chomp;
+    next unless /\S/; #ignore blank lines
+    my ($sp1, $sp2) = split("\t");
+    die "Line $. of $pairsfile does not hold two tab separated species\n"
+        unless defined $sp2;
+    push(@species1, $sp1);
+    push(@species2, $sp2);
+}
+close(PAIRS);
+
+my $treefile = $ARGV[1];
+die("Couldn't open tree file $treefile\n") unless -r $treefile;
+
+#One output row per pair: both names, then one distance column per tree in the file
+for(my $i=0; $i < @species1; $i++){
+    print OUT $species1[$i]."\t".$species2[$i];
+    #The tree file is parsed again for each pair so that every row covers all trees
+    my $treeio = Bio::TreeIO->new('-format' => 'newick',
+                                  '-file' => $treefile);
+    while(my $tree = $treeio->next_tree){
+        my $node1 = $tree->find_node(-id => $species1[$i]);
+        my $node2 = $tree->find_node(-id => $species2[$i]);
+        #A misspelled name yields no node; report it instead of failing inside BioPerl
+        die "Species $species1[$i] not found in tree\n" unless defined $node1;
+        die "Species $species2[$i] not found in tree\n" unless defined $node2;
+        my $distances = $tree->distance(-nodes => [$node1,$node2]);
+        #Divide by 2 for divergence times when requested
+        $distances = $distances/2 if $divtimebool==1;
+        print OUT "\t".$distances;
+    }
+    print OUT "\n";
+}
+
+close(OUT);
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/PDPairs/PDpairs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/PDPairs/PDpairs.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,28 @@
+
+ Pairwise distance between taxa in a phylogenetic tree
+
+ PD.pl $intable $intree $outtable $half
+
+
+
+
+
+
+
+
+
+
+Input a table of species pairs::
+
+ species1 species2
+ species2 species4
+
+And a Newick format phylogeny with branch lengths
+
+Output is a table of the species pairs followed by a column of pairwise distance for each tree in the file::
+
+ species1 species2 1.104
+ species2 species4 2.119
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/PDPairs/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/PDPairs/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,3 @@
+Calculates phylogenetic distances for pairs of species on a phylogeny
+
+Tools developed by Oakley et al
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/SHtest/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/SHtest/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Uses RAxML to compute an SHtest to compare trees
+
+(Stamatakis 2006)
+
+RAxML is required to be installed
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/SHtest/SHtest.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/SHtest/SHtest.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,47 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+#raxml.pl Galaxy wrapper calls raxml from raxml.xml
+
+##For debugging command line pass, uncomment next
+#for (my $i=0; $i < @ARGV; $i++){
+# print "Parameter #$i ".$ARGV[$i]."\n\n";
+#}
+#exit;
+
+my $datatype = shift(@ARGV); #0 datatype, used directly as the RAxML model (-m)
+my $data_file= shift(@ARGV); #1 input a phylip file
+my $part_file = shift(@ARGV); #2 optional partition file, 'None' when absent
+my $best_tree = shift(@ARGV); #3 best tree for SH comparison
+my $alt_trees = shift(@ARGV); #4 Alternative tree(s) for SH comparison
+
+defined $alt_trees
+    or die "Usage: SHtest.pl datatype data_file part_file best_tree alt_trees\n";
+
+#BUILD THE FULL RAXML COMMAND LINE AS A LIST OF ARGUMENTS
+#One argument per element lets system() start RAxML without a shell, so
+#file names containing spaces or shell characters are passed through intact.
+
+#First CALL RAXML THROUGH PATH
+my @build_command = ("raxmlHPC-PTHREADS-SSE3");
+#Add SH Test Option and Thread number for PThreads (4 threads)
+push(@build_command, "-f", "h", "-T", "4");
+#Next add call to input phylip file
+push(@build_command, "-s", $data_file);
+#model is passed directly with xml
+push(@build_command, "-m", $datatype);
+#Add call to partition file name
+push(@build_command, "-q", $part_file) unless $part_file eq 'None';
+#Next add call to input best tree file
+push(@build_command, "-t", $best_tree);
+#Next add call to input alternative tree(s) file
+push(@build_command, "-z", $alt_trees);
+#name output files galaxy
+push(@build_command, "-n", "SH");
+
+print "Galaxy COMMAND BUILD WAS: @build_command\n";
+
+#Run raxml; a non-zero exit status is reported instead of being ignored
+system(@build_command) == 0 or die "raxml failed (status $?)\n";
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/SHtest/SHtest.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/SHtest/SHtest.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,31 @@
+
+ Use RAxML to calculate SHtest to compare trees
+
+ raxml
+
+
+ SHtest.pl $datatype $data_file $part_file $best_tree $alt_trees
+ > $raxml_log
+ 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ raxml Home Page:
+ http://www.exelixis-lab.org/software.html
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/aliscorecut/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/aliscorecut/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+ALICUT/SCORE
+------------
+Needs a modified ALICUT perl script in PATH.
+Script is called Aliscore.02.pl and is at:
+http://zfmk.de/web/Forschung/Abteilungen/AG_Wgele/Software/Aliscore/index.en.html
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/gblocks/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/gblocks/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,7 @@
+gblocks
+
+Implements gblocks to prune ambiguous alignments
+
+(Talavera, Castresana 2007)
+
+gblocks package required to be installed
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/gblocks/gblocks.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/gblocks/gblocks.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,30 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+#gblocks.pl [fasta file]
+
+my $infile=shift(@ARGV);      #alignment handed to Gblocks
+my $datatype=shift(@ARGV);    #passed to Gblocks unchanged
+my $gaps=shift(@ARGV);        #passed to Gblocks unchanged
+my $size=shift(@ARGV);        #value for the Gblocks -b4 option
+my $outfileloc=shift(@ARGV);  #where galaxy expects the Gblocks output
+my $htmlfileloc=shift(@ARGV); #where galaxy expects the .htm report
+
+use File::Copy qw(copy);
+
+defined $htmlfileloc
+    or die "Usage: gblocks.pl infile datatype gaps size outfile htmlfile\n";
+
+#List form of system() runs Gblocks without a shell, so unusual characters in
+#the arguments are passed through untouched. The exit status is deliberately
+#not checked (NOTE(review): Gblocks reportedly exits non-zero on success).
+system("Gblocks", $infile, $datatype, $gaps, "-b4=$size");
+
+#Gblocks requires output from $input.fas to be written to $input.fas-gb
+#Copy that file to gout where galaxy expects to find the output
+my $outfile = $infile."-gb";
+my $htmlfile = $outfile.".htm";
+copy($outfile, $outfileloc) or die "Couldn't copy $outfile to $outfileloc: $!\n";
+copy($htmlfile, $htmlfileloc) or die "Couldn't copy $htmlfile to $htmlfileloc: $!\n";
+exit;
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/gblocks/gblocks.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/gblocks/gblocks.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,61 @@
+
+ Convert Aligned FASTA to phylip Extended
+
+ gblocks.pl $input $datatype $gaps $Block $out_file $html_file > $screen
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+About Gblocks
+
+Version 0.91b, January 2002
+
+Copyright (c) Jose Castresana
+
+Gblocks is a computer program written in ANSI C language that eliminates poorly aligned
+positions and divergent regions of an alignment of DNA or protein sequences. These
+positions may not be homologous or may have been saturated by multiple substitutions and it
+is convenient to eliminate them prior to phylogenetic analysis. Gblocks selects blocks in a
+similar way as it is usually done by hand but following a reproducible set of conditions.
+The selected blocks must fulfill certain requirements with respect to the lack of large
+segments of contiguous nonconserved positions, lack of gap positions and high conservation
+of flanking positions, making the final alignment more suitable for phylogenetic analysis.
+Gblocks outputs several files to visualize the selected blocks. The use of a program such
+as Gblocks reduces the necessity of manually editing multiple alignments, makes the
+automation of phylogenetic analysis of large data sets feasible and, finally, facilitates
+the reproduction of the alignments and subsequent phylogenetic analysis by other
+researchers. Gblocks is very fast in processing alignments and it is therefore highly
+suitable for large-scale phylogenetic analyses.
+
+Several parameters can be modified to make the selection of blocks more or less stringent.
+In general, a relaxed selection of blocks is better for short alignments, whereas a
+stringent selection is more adequate for longer ones. Be aware that the default options of
+Gblocks are stringent.
+
+ Talavera, G., and Castresana, J. (2007). Improvement of phylogenies after removing
+divergent and ambiguously aligned blocks from protein sequence alignments. Systematic
+Biology 56, 564-577.
+
+ Castresana, J. (2000). Selection of conserved blocks from multiple alignments for their
+use in phylogenetic analysis. Molecular Biology and Evolution 17, 540-552.
+
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/get_orfs_or_cdss/get_orfs_or_cdss.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/get_orfs_or_cdss/get_orfs_or_cdss.py Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+"""Find ORFs in a nucleotide sequence file.
+
+get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
+
+Takes ten command line options, input sequence filename, format, genetic
+code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
+minimum length (in amino acids), strand (both, forward, reverse), output
+nucleotide filename, and output protein filename.
+
+This tool is a short Python script which requires Biopython. If you use
+this tool in scientific work leading to a publication, please cite the
+Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2011 by Peter Cock, The James Hutton Institute
+(formerly SCRI), Dundee, UK. All rights reserved.
+
+See accompanying text file for licence details (MIT/BSD style).
+
+This is version 0.0.1 of the script.
+"""
+import sys
+import re
+
+def stop_err(msg, err=1):
+ sys.stderr.write(msg.rstrip() + "\n")
+ sys.exit(err)
+
+try:
+ from Bio.Seq import Seq, reverse_complement, translate
+ from Bio.SeqRecord import SeqRecord
+ from Bio import SeqIO
+ from Bio.Data import CodonTable
+except ImportError:
+ stop_err("Missing Biopython library")
+
+#Parse Command Line
+try:
+ input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:]
+except ValueError:
+ stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+
+try:
+ table = int(table)
+except ValueError:
+ stop_err("Expected integer for genetic code table, got %s" % table)
+
+try:
+ table_obj = CodonTable.ambiguous_generic_by_id[table]
+except KeyError:
+ stop_err("Unknown codon table %i" % table)
+
+if ftype not in ["CDS", "ORF"]:
+ stop_err("Expected CDS or ORF, got %s" % ftype)
+
+if ends not in ["open", "closed"]:
+ stop_err("Expected open or closed for end treatment, got %s" % ends)
+
+try:
+ min_len = int(min_len)
+except ValueError:
+ stop_err("Expected integer for min_len, got %s" % min_len)
+
+if seq_format.lower()=="sff":
+ seq_format = "sff-trim"
+elif seq_format.lower()=="fasta":
+ seq_format = "fasta"
+elif seq_format.lower().startswith("fastq"):
+ seq_format = "fastq"
+else:
+ stop_err("Unsupported file type %r" % seq_format)
+
+print "Genetic code table %i" % table
+print "Minimum length %i aa" % min_len
+#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
+
+starts = sorted(table_obj.start_codons)
+assert "NNN" not in starts
+re_starts = re.compile("|".join(starts))
+
+stops = sorted(table_obj.stop_codons)
+assert "NNN" not in stops
+re_stops = re.compile("|".join(stops))
+
+def start_chop_and_trans(s, strict=True):
+ """Returns offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in stops, s
+ assert len(s) % 3 == 0
+ for match in re_starts.finditer(s):
+ #Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, table, cds=True)
+ else:
+ #Use when missing stop codon,
+ t = "M" + translate(n[3:], table, to_stop=True)
+ return start, n, t
+ return None, None, None
+
+def break_up_frame(s):
+ """Returns offset, nuc, protein."""
+ start = 0
+ for match in re_stops.finditer(s):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ if ftype=="CDS":
+ offset, n, t = start_chop_and_trans(n)
+ else:
+ offset = 0
+ t = translate(n, table, to_stop=True)
+ if n and len(t) >= min_len:
+ yield start + offset, n, t
+ start = index
+ if ends == "open":
+ #No stop codon, Biopython's strict CDS translate will fail
+ n = s[start:]
+ #Ensure we have whole codons
+ #TODO - Try appending N instead?
+ #TODO - Do the next four lines more elegantly
+ if len(n) % 3:
+ n = n[:-1]
+ if len(n) % 3:
+ n = n[:-1]
+ if ftype=="CDS":
+ offset, n, t = start_chop_and_trans(n, strict=False)
+ else:
+ offset = 0
+ t = translate(n, table, to_stop=True)
+ if n and len(t) >= min_len:
+ yield start + offset, n, t
+
+
+def get_all_peptides(nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ #TODO - Refactor to use a generator function (in start order)
+ #rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+ if strand != "reverse":
+ for frame in range(0,3):
+ for offset, n, t in break_up_frame(nuc_seq[frame:]):
+ start = frame + offset #zero based
+ answer.append((start, start + len(n), +1, n, t))
+ if strand != "forward":
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0,3) :
+ for offset, n, t in break_up_frame(rc[frame:]):
+ start = full_len - frame - offset #zero based
+ answer.append((start, start + len(n), -1, n ,t))
+ answer.sort()
+ return answer
+
+def get_top_peptides(nuc_seq):
+ """Returns all peptides of max length."""
+ values = list(get_all_peptides(nuc_seq))
+ if not values:
+ raise StopIteration
+ max_len = max(len(x[-1]) for x in values)
+ for x in values:
+ if len(x[-1]) == max_len:
+ yield x
+
+def get_one_peptide(nuc_seq):
+ """Returns first (left most) peptide with max length."""
+ values = list(get_top_peptides(nuc_seq))
+ if not values:
+ raise StopIteration
+ yield values[0]
+
+if mode == "all":
+ get_peptides = get_all_peptides
+elif mode == "top":
+ get_peptides = get_top_peptides
+elif mode == "one":
+ get_peptides = get_one_peptide
+
+in_count = 0
+out_count = 0
+if out_nuc_file == "-":
+ out_nuc = sys.stdout
+else:
+ out_nuc = open(out_nuc_file, "w")
+if out_prot_file == "-":
+ out_prot = sys.stdout
+else:
+ out_prot = open(out_prot_file, "w")
+for record in SeqIO.parse(input_file, seq_format):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
+ out_count += 1
+ if f_strand == +1:
+ loc = "%i..%i" % (f_start+1, f_end)
+ else:
+ loc = "complement(%i..%i)" % (f_start+1, f_end)
+ descr = "length %i aa, %i bp, from %s of %s" \
+ % (len(t), len(n), loc, record.description)
+ r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
+ t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+ in_count += 1
+if out_nuc is not sys.stdout:
+ out_nuc.close()
+if out_prot is not sys.stdout:
+ out_prot.close()
+
+print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/get_orfs_or_cdss/get_orfs_or_cdss.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/get_orfs_or_cdss/get_orfs_or_cdss.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,75 @@
+Galaxy tool to find ORFs or simple CDSs
+=======================================
+
+This tool is copyright 2011 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+This tool is a short Python script (using Biopython library functions)
+to search nucleotide sequences for open reading frames (ORFs) or coding
+sequences (CDSs) where the first potential start codon is used. See the
+help text in the XML file for more information.
+
+There are just two files to install:
+
+* get_orfs_or_cdss.py (the Python script)
+* get_orfs_or_cdss.xml (the Galaxy tool definition)
+
+The suggested location is in the Galaxy folder tools/filters next to the tool
+for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL.
+
+You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
+tool. One suggested location is in the filters section. Simply add the line:
+
+
+
+You will also need to install Biopython 1.54 or later. If you want to run
+the unit tests, include this line in tools_conf.xml.sample and the sample
+FASTA files under the test-data directory. That's it.
+
+
+History
+=======
+
+v0.0.1 - Initial version.
+
+
+Developers
+==========
+
+This script and related tools are being developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder:
+
+tar -czf get_orfs_or_cdss.tar.gz tools/filters/get_orfs_or_cdss.*
+
+Check this worked:
+
+$ tar -tzf get_orfs_or_cdss.tar.gz
+filter/get_orfs_or_cdss.py
+filter/get_orfs_or_cdss.txt
+filter/get_orfs_or_cdss.xml
+
+
+Licence (MIT/BSD style)
+=======================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/get_orfs_or_cdss/get_orfs_or_cdss.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/get_orfs_or_cdss/get_orfs_or_cdss.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,149 @@
+
+ e.g. to get peptides from ESTs
+
+get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file > $stdout 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Bio
+
+
+
+**What it does**
+
+Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ
+and Standard Flowgram Format (SFF) are supported), and searches each sequence
+for open reading frames (ORFs) or potential coding sequences (CDSs) of the
+given minimum length. These are returned as FASTA files of nucleotides and
+protein sequences.
+
+You can choose to have all the ORFs/CDSs above the minimum length for each
+sequence (similar to the EMBOSS getorf tool), those with the longest length
+equal, or the first ORF/CDS with the longest length (in the special case
+where a sequence encodes two or more long ORFs/CDSs of the same length). The
+last option is a reasonable choice when the input sequences represent EST or
+mRNA sequences, where only one ORF/CDS is expected.
+
+Note that if no ORFs/CDSs in a sequence match the criteria, there will be no
+output for that sequence.
+
+Also note that the ORFs/CDSs are assigned modified identifiers to distinguish
+them from the original full length sequences, by appending a suffix.
+
+The start and stop codons are taken from the `NCBI Genetic Codes
+<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_.
+When searching for ORFs, the sequences will run from stop codon to stop
+codon, and any start codons are ignored. When searching for CDSs, the first
+potential start codon will be used, giving the longest possible CDS within
+each ORF, and thus the longest possible protein sequence. This is useful
+for things like BLAST or domain searching, but since this may not be the
+correct start codon, it may not be appropriate for signal peptide detection
+etc.
+
+**Example Usage**
+
+Given some EST sequences (Sanger capillary reads) assembled into unigenes,
+or a transcriptome assembly from some RNA-Seq, each of your nucleotide
+sequences should (barring sequencing, assembly errors, frame-shifts etc)
+encode one protein as a single ORF/CDS, which you wish to extract (and
+perhaps translate into amino acids).
+
+If your RNA-Seq data was strand specific, and assembled taking this into
+account, you should only search for ORFs/CDSs on the forward strand.
+
+**Citation**
+
+This tool uses Biopython. If you use this tool in scientific work leading
+to a publication, please cite the Biopython application note (and Galaxy
+too of course):
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+Installed by Sabrina.
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/hmmbuild/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/hmmbuild/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,4 @@
+HMMBUILD/SEARCH
+---------------
+Needs HMMER package available in PATH
+Get v3.0 here: http://hmmer.janelia.org/
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/hmmsearch/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/hmmsearch/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,4 @@
+HMMBUILD/SEARCH
+---------------
+Needs HMMER package available in PATH
+Get v3.0 here: http://hmmer.janelia.org/
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/iAssembler/iAssembler2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/iAssembler/iAssembler2.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,23 @@
+
+ Assembly of transcriptomes.
+
+ iAssembler_wrapper2.pl -i $input -e $maxlength -h $minoverlap -p $minpercent
+
+
+
+
+
+
+
+
+
+
+
+ iAssembler is a standalone package to assemble ESTs generated using Sanger and/or Roche-454 pyrosequencing technologies into contigs. The pipeline gives much higher accuracy in EST assembly than other existing assemblers by employing an iterative assembly strategy and automated error corrections of mis-assemblies. iAssembler first performs iterative assemblies using MIRA and CAP3 (default: four cycles of MIRA assemblies followed by one CAP3 assembly) to correct assembly errors (mostly sequences derived from the same transcript fail to be assembled together) which occur frequently in just one round of assembly. The program then performs post-assembly quality checking by 1) aligning each EST sequence to its corresponding unigene sequence to identify mis-assemblies; and 2) performing all-versus-all pair-wise sequence alignments of unigenes to identify sequences derived from same transcripts that fail to be assembled together. The identified mis-assemblies are then corrected by the program automatically.
+
+http://bioinfo.bti.cornell.edu/tool/iAssembler/
+
+Citation:
+Zheng Y, Zhao L, Gao J, Fei Z. (2011) iAssembler: a package for de novo assembly of Roche-454/Sanger transcriptome sequences. BMC Bioinformatics 12:453
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/iAssembler/iAssembler_README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/iAssembler/iAssembler_README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,73 @@
+iAssembler tools for Galaxy
+
+ iAssembler is a standalone package to assemble ESTs generated using Sanger and/or Roche-454 pyrosequencing technologies into contigs.
+ The pipeline gives much higher accuracy in EST assembly than other existing assemblers by employing an iterative assembly strategy and automated
+ error corrections of mis-assemblies.
+
+ iAssembler first performs iterative assemblies using MIRA and CAP3 (default: four cycles of MIRA assemblies followed by one CAP3 assembly) to correct
+ assembly errors (mostly sequences derived from the same transcript fail to be assembled together) which occur frequently in just one round of assembly.
+
+ The program then performs post-assembly quality checking by
+ 1) aligning each EST sequence to its corresponding unigene sequence to identify mis-assemblies; and
+ 2) performing all-versus-all pair-wise sequence alignments of unigenes to identify sequences derived from same transcripts that fail to be assembled together.
+
+ The identified mis-assemblies are then corrected by the program automatically.
+
+ http://bioinfo.bti.cornell.edu/tool/iAssembler/
+
+ Citation:
+ Zheng Y, Zhao L, Gao J, Fei Z. (2011) iAssembler: a package for de novo assembly of Roche-454/Sanger transcriptome sequences. BMC Bioinformatics 12:453
+
+Galaxy XML and Perl wrapper script written by: Roger Ngo, Sam Min and Todd H. Oakley, UCSB
+
+Included files in this package:
+
+* iAssembler2.xml - Galaxy XML tool for iAssembler 1.3
+* iAssembler_wrapper2.pl - Wrapper script for Galaxy XML tool
+* increment.txt - File required by iAssembler_wrapper.pl
+* iAssembler_README - Documentation file
+
+Note: iAssembler.pl MUST BE modified in lines 254-258 due to a bug preventing the program from working in the Galaxy platform.
+
+Pre-Installation:
+
+iAssembler 1.3 must be installed on the Galaxy user account. In order for the Galaxy tool wrapper to work, the iAssembler.pl
+script must be modified on lines 254-258.
+
+FROM:
+
+my $version_file = $working_dir."/mira_version";
+system("$program_bin_dir/mira | head > $version_file");
+
+TO:
+
+my $version_file = $working_dir."/mira_version";
+system("$program_bin_dir/mira > $working_dir/out");
+system("head $working_dir/out > $version_file");
+
+
+Installation Instructions:
+
+1. Copy the iAssembler folder to a directory in your Galaxy user account.
+
+2. Copy iAssembler.xml, iAssembler_wrapper.pl and increment.txt to a folder in /galaxy-dist/tools/
+
+Note: increment.txt and iAssembler_wrapper.pl MUST be in the same directory.
+
+3. In iAssembler_wrapper.pl, modify the $iAssemblerBinPath to point to the iAssembler 1.3 directory in your
+Galaxy user account and $iAssemblerToolPath to the path of the wrapper.
+
+By default they have already been assigned as:
+
+my $iAssemblerBinPath = '/labdata/nfs/galaxy/pkgs/iAssembler';
+my $iAssemblerToolPath = '/labdata/nfs/galaxy/galaxy-dist/tools/iAssembler';
+
+4. Add the Galaxy tool information to tool_conf.xml in /galaxy-dist/
+
+5. Restart Galaxy using
+
+./run.sh --stop-daemon
+
+and then
+
+./run.sh --reload --daemon
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/iAssembler/iAssembler_wrapper2.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/iAssembler/iAssembler_wrapper2.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+
+# Galaxy wrapper for iAssembler.
+# Wrapper script written by: Roger Ngo, Sam Min and Todd H. Oakley, UCSB
+#
+# Usage: iAssembler_wrapper2.pl -i $input -e $maxlength -h $minoverlap -p $minpercent
+# The flags are not parsed: values are read from argument positions 1, 3, 5, 7.
+# Output: unigene_seq.fasta and the iAssembler stderr log ("log") in the
+# directory the wrapper was started from.
+
+use warnings;
+use strict;
+use Cwd;
+use Fcntl qw(:flock SEEK_SET);
+use File::Copy qw(copy);
+use File::Path qw(rmtree);
+
+my $dir = getcwd();
+
+my $iAssemblerBinPath = '/home/galaxy/pkgs/iAssembler';
+my $iAssemblerToolPath = '/home/galaxy/galaxy-dist/tools/iAssembler';
+
+my $input=$ARGV[1];
+my $maxlength=$ARGV[3];
+my $minoverlap=$ARGV[5];
+my $minpercent=$ARGV[7];
+die "Usage: $0 -i input -e maxlength -h minoverlap -p minpercent\n"
+    unless defined $minpercent;
+
+# Take a job number from increment.txt and store the next one. The file stays
+# locked for the whole read-modify-write so that two jobs started at the same
+# time cannot receive the same number (and then share, or delete, each
+# other's input.N.* files).
+my $counterFile = "$iAssemblerToolPath/increment.txt";
+open(my $file, '+<', $counterFile) or die "Cannot open $counterFile: $!\n";
+flock($file, LOCK_EX) or die "Cannot lock $counterFile: $!\n";
+my $increment = <$file>;
+$increment = defined $increment ? int($increment) : 0;
+my $temp = $increment;
+seek($file, 0, SEEK_SET) or die "Cannot rewind $counterFile: $!\n";
+truncate($file, 0) or die "Cannot truncate $counterFile: $!\n";
+print {$file} $increment + 1;
+close($file) or die "Cannot update $counterFile: $!\n";
+
+# iAssembler runs inside its own directory on a private copy of the input.
+my $jobName = "input.$temp.fasta";
+copy($input, "$iAssemblerBinPath/$jobName")
+    or die "Cannot copy $input to $iAssemblerBinPath/$jobName: $!\n";
+
+chdir($iAssemblerBinPath) or die "Cannot chdir to $iAssemblerBinPath: $!\n";
+
+# Run iAssembler without a shell (list-form system). As before, its standard
+# output is discarded and its standard error is written to "log".
+open(my $savedOut, '>&', \*STDOUT) or die "Cannot save STDOUT: $!\n";
+open(my $savedErr, '>&', \*STDERR) or die "Cannot save STDERR: $!\n";
+open(STDOUT, '>', '/dev/null') or die "Cannot silence STDOUT: $!\n";
+open(STDERR, '>', "$dir/log") or die "Cannot write $dir/log: $!\n";
+system("$iAssemblerBinPath/iAssembler.pl", '-i', $jobName, '-e', $maxlength,
+       '-h', $minoverlap, '-p', $minpercent);
+my $status = $?;
+open(STDOUT, '>&', $savedOut) or die "Cannot restore STDOUT: $!\n";
+open(STDERR, '>&', $savedErr) or die "Cannot restore STDERR: $!\n";
+
+chdir($dir) or die "Cannot chdir back to $dir: $!\n";
+
+# Fetch the result, then always remove this job's files from the shared directory.
+my $copied = copy("$iAssemblerBinPath/${jobName}_output/unigene_seq.fasta",
+                  'unigene_seq.fasta');
+my $copyError = $!;
+rmtree([ glob("$iAssemblerBinPath/input.$temp.*") ]);
+
+die "iAssembler.pl failed (wait status $status); see $dir/log\n" if $status != 0;
+die "Cannot copy unigene_seq.fasta: $copyError\n" unless $copied;
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/iAssembler/increment.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/iAssembler/increment.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,1 @@
+0
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/muscle/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/muscle/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,4 @@
+MUSCLE
+------
+Needs MUSCLE installed in PATH.
+Get Muscle at: http://www.drive5.com/muscle/
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/muscle/muscle.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/muscle/muscle.py Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,150 @@
+import os
+import optparse
+import subprocess
+from multiprocessing import Pool
+
+directory = ""
+results = "results.data"
+extension = ".fs"
+aligned_extension = ".afa"
+
+
+def unescape(string):
+    """Replace ``__xx__`` escape tokens with the characters they stand for.
+
+    The pairs are applied in the fixed order listed below, so the result no
+    longer depends on dictionary ordering and no Python 2-only dict method
+    is needed.
+    """
+    mapped_chars = (
+        ('>', '__gt__'), ('<', '__lt__'),
+        ("'", '__sq__'), ('"', '__dq__'),
+        ('[', '__ob__'), (']', '__cb__'),
+        ('{', '__oc__'), ('}', '__cc__'),
+        ('@', '__at__'),
+        ('\n', '__cn__'), ('\r', '__cr__'), ('\t', '__tc__'),
+        ('#', '__pd__'),
+    )
+
+    for char, token in mapped_chars:
+        string = string.replace(token, char)
+
+    return string
+
+
+def isTabular(file):
+    """Return False if any line of `file` starts with '>', otherwise True."""
+    with open(file) as handle:
+        if any(row[0] == '>' for row in handle):
+            return False
+    return True
+
+
+def toData(text):
+    # Fold FASTA text into one tab-separated row per record.
+    # NOTE(review): the space->tab step is assumed to apply to header lines
+    # only; the original nesting was lost - confirm.
+    rows = ''
+    for entry in text.split('\n'):
+        if '>' in entry:
+            entry = ('\n' + entry.replace('> ', "") + '\t').replace(" ", "\t")
+        rows += entry
+    return rows[1:]  # drops the newline added before the first header
+
+def toDataSingle(text):
+    """Return `text` with one newline appended.
+
+    The text is no longer sliced with [1:]: nothing is prepended here, so that
+    slice removed the first character (for FASTA text, the '>' of the header).
+    """
+    return text + '\n'
+
+def muscle(input):
+    """Run `muscle` on one working-directory file, then print the cwd."""
+    path = directory + os.sep + input
+    # The alignment is written next to the input, with aligned_extension added.
+    subprocess.call(['muscle', "-in", path, "-out", path + aligned_extension])
+    # Leftover diagnostic: echoes the current directory to standard output.
+    subprocess.call(['pwd'])
+
+
+class Sequence:
+    """One whitespace-separated row: species, family, name, ..., sequence."""
+
+    def __init__(self, string):
+        fields = string.split()
+        self.species, self.family, self.name = fields[0], fields[1], fields[2]
+        self.header = ' '.join(fields[:-1])  # every field except the sequence
+        self.sequence = fields[-1]
+        self.string = string
+
+    def printFASTA(self):
+        return '> %s\n%s\n' % (self.header, self.sequence)
+
+
+def saveMulti(tabFile):
+    """Append each row of `tabFile`, as FASTA, to the file of its family."""
+    with open(tabFile) as table:
+        for record in (Sequence(row) for row in table):
+            with open(directory + os.sep + record.family + extension, "a") as out:
+                out.write(record.printFASTA())
+
+
+def saveSingle(fastaFile):
+    # Append the whole input to "fasta" + extension, opening the output once
+    # instead of once per input line.
+    with open(fastaFile) as source:
+        with open(directory + os.sep + "fasta" + extension, "a") as out:
+            out.writelines(source)
+
+
+def main():
+    """Split the input, align every piece with muscle, merge the alignments.
+
+    Work files are written to <directory option>/data, which must not exist
+    yet; the merged result is appended to the `results` file in that folder.
+    """
+    usage = """%prog [options]
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option('-d', '--directory',
+                      metavar="PATH",
+                      dest='path',
+                      default='.',
+                      help='Path to working directory.')
+    parser.add_option('-i', '--in',
+                      dest='input',
+                      action='store',
+                      type='string',
+                      metavar="FILE",
+                      help='Name of input data.')
+
+    options, args = parser.parse_args()
+
+    global directory
+    inputFile = unescape(options.input)
+    directory = unescape(options.path) + os.sep + "data"
+
+    os.mkdir(directory)
+
+    # Tabular input is split into one file per family; FASTA input is copied
+    # as a single file. The same choice selects the output converter below.
+    tabular = isTabular(inputFile)
+    if tabular:
+        saveMulti(inputFile)
+        convert = toData
+    else:
+        saveSingle(inputFile)
+        convert = toDataSingle
+
+    pool = Pool()
+    pending = [name for name in os.listdir(directory) if name.lower().endswith(extension)]
+    pool.map(muscle, pending)
+
+    aligned = [name for name in os.listdir(directory) if name.lower().endswith(aligned_extension)]
+    with open(directory + os.sep + results, "a") as merged:
+        for name in aligned:
+            with open(directory + os.sep + name, "r") as part:
+                merged.write(convert(part.read()) + "\n")
+
+if __name__ == '__main__':
+ main()
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/muscle/muscle.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/muscle/muscle.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,25 @@
+
+ MUSCLE: Multiple sequence alignment. Input can be fasta or phytab format.
+
+ muscle
+
+
+ muscle.py -i $data > $muscle_stdout 2>&1
+
+
+
+
+
+
+
+
+
+
+
+ **MUSCLE v3.8**
+
+ Runs MUSCLE on EvolMAP data.
+
+ See MUSCLE help: http://www.drive5.com/muscle/muscle_userguide3.8.html
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/mview/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/mview/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,6 @@
+converts an aligned sequences file in fasta format to html for visualization
+
+(Brown, Leroy, Sander 1998)
+
+
+mview package required to be installed
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/mview/mview.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/mview/mview.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,21 @@
+#!/usr/bin/perl
+
+# Galaxy wrapper: render an aligned FASTA file as HTML with mview.
+# Usage: mview.pl <alignment> [dna]
+# mview's output is copied to standard output.
+use strict;
+use warnings;
+
+my $input = $ARGV[0];
+my $dna = $ARGV[1];
+die "Usage: mview.pl <alignment> [dna]\n" unless defined $input;
+
+my @command = ('mview', '-in', 'pearson');
+# Only the exact word 'dna' selects mview's -DNA option.
+push @command, '-DNA' if defined $dna && $dna eq 'dna';
+push @command, '-bold', '-coloring', 'group', '-html', 'head', $input;
+
+# List-form pipe open: the file name goes straight to mview, not through a shell.
+open(my $mview, '-|', @command) or die "Cannot run mview: $!\n";
+print while <$mview>;
+close($mview) or die "mview failed (wait status $?)\n";
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/mview/mview.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/mview/mview.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,27 @@
+
+ View multiple sequence alignment in html
+ mview
+ mview.pl $input $dna > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts an aligned sequence file in FASTA format to HTML for visualizing the alignment in Galaxy.
+
+--------
+
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_LB_pruner/LB_prunerG.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_LB_pruner/LB_prunerG.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,71 @@
+#!/usr/bin/perl -w
+use strict;
+use Bio::TreeIO;
+use Bio::Tree::Tree;
+
+# Report the tips of a newick tree that sit on unusually long branches.
+# Usage: LB_prunerG.pl <newick tree file> <multiplier> <output file>
+#
+# The cutoff is <multiplier> times the average branch length, computed as the
+# total branch length divided by the number of nodes. Each output row is
+#   tip id <tab> tree file <tab> branch length <tab> average branch length
+# Tips reported because of a long internal branch get 999 as branch length.
+
+my $infile = $ARGV[0];
+my $multiplier = $ARGV[1];
+my $outfile = $ARGV[2];
+die "Usage: $0 <newick tree file> <multiplier> <output file>\n"
+    unless defined $outfile;
+
+# parse in newick/new hampshire format
+my $treeio = Bio::TreeIO->new(-format => 'newick',
+                              -file   => $infile);
+my $tree = $treeio->next_tree;
+die "No tree could be read from $infile\n" unless defined $tree;
+
+open(my $out, '>', $outfile) or die "Cannot write $outfile: $!\n";
+
+my $total_length = $tree->total_branch_length;
+my @taxa = $tree->get_nodes;
+die "Tree in $infile has no nodes\n" unless @taxa;
+my $ave_node_len = $total_length / @taxa;
+my $cutoff = $multiplier * $ave_node_len;
+my @leafs = $tree->get_leaf_nodes;
+
+# Tips whose own branch is longer than the cutoff.
+for my $node (@leafs) {
+    my $length = $node->branch_length;
+    if (defined $length && $length > $cutoff) {
+        print {$out} $node->id."\t".$infile."\t".$length."\t$ave_node_len\n";
+    }
+}
+
+# Tips below an internal branch that is longer than the cutoff. %in_clade
+# keeps each tip once even when several nested long branches lead to it, so
+# the "more than half" test below counts tips, not repeats.
+my @LB_clade;
+my %in_clade;
+for my $node (@taxa) {
+    my $length = $node->branch_length;
+    next unless $length && $length > $cutoff;
+    for my $child ($node->get_all_Descendents) {
+        next unless $child->is_Leaf;
+        next if $in_clade{$child->id}++;
+        push(@LB_clade, $child->id."\t".$infile."\t999\t$ave_node_len");
+    }
+}
+
+if (@LB_clade > @leafs/2) {
+    # More than half in LBA clade - remove those NOT in LBA clade
+    for my $node (@leafs) {
+        # Exact id lookup; a regex match would also treat "sp1" as present
+        # when only "sp10" was collected.
+        next if $in_clade{$node->id};
+        print {$out} $node->id."\t".$infile."\t999\t$ave_node_len\n";
+    }
+}
+elsif (@LB_clade) {
+    print {$out} join("\n", @LB_clade)."\n";
+}
+
+close($out) or die "Cannot finish writing $outfile: $!\n";
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_LB_pruner/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_LB_pruner/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,2 @@
+Identify genes on very long branches.
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_LB_pruner/phytab_LB_pruner.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_LB_pruner/phytab_LB_pruner.py Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,141 @@
+import os
+import optparse
+import subprocess
+from multiprocessing import Pool
+
+directory = ""
+results = "results.data"
+extension = ""
+aligned_extension = ".tab"
+datatype = ""
+
+perlpath = "/home/galaxy/galaxy-dist/tools/ucsb_phylogenetics/"
+
+def unescape(string):
+    """Replace ``__xx__`` escape tokens with the characters they stand for.
+
+    The pairs are applied in the fixed order listed below, so the result no
+    longer depends on dictionary ordering and no Python 2-only dict method
+    is needed.
+    """
+    mapped_chars = (
+        ('>', '__gt__'), ('<', '__lt__'),
+        ("'", '__sq__'), ('"', '__dq__'),
+        ('[', '__ob__'), (']', '__cb__'),
+        ('{', '__oc__'), ('}', '__cc__'),
+        ('@', '__at__'),
+        ('\n', '__cn__'), ('\r', '__cr__'), ('\t', '__tc__'),
+        ('#', '__pd__'),
+    )
+
+    for char, token in mapped_chars:
+        string = string.replace(token, char)
+
+    return string
+
+
+def isTabular(file):
+    """Return False if any line of `file` starts with '>', otherwise True."""
+    with open(file) as handle:
+        if any(row[0] == '>' for row in handle):
+            return False
+    return True
+
+#def toData(text, name):
+# name = name.replace("fasta", "") #file name has fasta when fasta file called
+# text = name.replace(".fs.tre", "") + "\t" + text.replace(" " , "")
+# return text
+
+
+def toData(text, name):
+    # Keep only the lines of `text` that contain a tab, remove every "./data/"
+    # from them and end each with a newline. `name` is not used.
+    # NOTE(review): lines without a tab are assumed to be dropped; the original
+    # nesting of the append was lost - confirm.
+    kept = [row.replace("./data/", "") + "\n"
+            for row in text.split('\n') if '\t' in row]
+    return ''.join(kept)
+
+def LB_pruner(input):
+    # Run LB_prunerG.pl on one tree file; its report is <file> + aligned_extension.
+    tree_path = directory + os.sep + input
+    subprocess.call(['perl', perlpath + 'LB_prunerG.pl', tree_path, indata, tree_path + aligned_extension])
+
+class Sequence:
+    """One input row: a name followed by its tree, split on whitespace."""
+    def __init__(self, string):
+        columns = string.split()
+        self.name, self.tree = columns[0], columns[1]
+        self.string = string
+
+    def printFASTA(self):  # despite the name, returns the tree line, not FASTA
+        return '%s\n' % self.tree
+
+def saveMulti(tabFile):
+    """Append each row's tree to the work file named after that row."""
+    with open(tabFile) as table:
+        for record in (Sequence(row) for row in table):
+            with open(directory + os.sep + record.name + extension, "a") as out:
+                out.write(record.printFASTA())
+
+def saveSingle(fastaFile):
+    # Append the whole input to "fasta" + extension, opening the output once
+    # instead of once per input line.
+    with open(fastaFile) as source:
+        with open(directory + os.sep + "fasta" + extension, "a") as out:
+            out.writelines(source)
+
+def main():
+    """Write each input tree to its own file, screen it, merge the reports.
+    Work files go to <directory option>/data, which must not exist yet.
+    """
+    usage = """%prog [options]
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option('-d', '--directory',
+                      metavar="PATH",
+                      dest='path',
+                      default='.',
+                      help='Path to working directory.')
+    parser.add_option('-i', '--in',
+                      dest='input',
+                      action='store',
+                      type='string',
+                      metavar="FILE",
+                      help='Name of input data.')
+    parser.add_option('-m', '--mult',
+                      dest='datatype',
+                      action='store',
+                      type='string',
+                      help='Multiplier')
+
+    options, args = parser.parse_args()
+
+    # Module-level values read by LB_pruner() when the pool runs it.
+    global directory
+    global indata
+    inputFile = unescape(options.input)
+    directory = unescape(options.path) + os.sep + "data"
+    indata = unescape(options.datatype)  # the multiplier, kept as a string
+
+    os.mkdir(directory)
+
+    if isTabular(inputFile):
+        saveMulti(inputFile)
+    else:
+        saveSingle(inputFile)
+
+    pool = Pool()
+    # extension is empty in this module, so every file in the folder matches.
+    pending = [name for name in os.listdir(directory) if name.lower().endswith(extension)]
+    pool.map(LB_pruner, pending)
+
+    reports = [name for name in os.listdir(directory) if name.lower().endswith(aligned_extension)]
+    with open(directory + os.sep + results, "a") as merged:
+        for name in reports:
+            with open(directory + os.sep + name, "r") as part:
+                merged.write(toData(part.read(), name))
+
+if __name__ == '__main__':
+ main()
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_LB_pruner/phytab_LB_pruner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_LB_pruner/phytab_LB_pruner.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,30 @@
+
+ LB_pruner: Identify genes on very long branches.
+
+ LB_prunerG.pl
+
+
+ phytab_LB_pruner.py -i $data -m $multiplier
+
+
+
+
+
+
+
+
+
+
+
+
+Input a table as follows:::
+ name newick_tree;
+ name2 newick_tree;
+ name3 newick_tree;
+Enter a value for M, the multiplier. LB pruner will find the average of all branch
+lengths in each newick tree. If any branch is longer than M times the average, that gene will
+be written to the output file. If an internal branch is longer than M times the average,
+then all members of that clade are written to the output file, with 999 as the length of
+each branch. Third column is branch length, fourth column is average BL for tree.
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_clearcut/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_clearcut/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,6 @@
+Generate Neighbor Joining phylogeny. Input can be fasta or phytab format.
+
+clearcut -- (Evans, Sheneman, Foster 2006)
+
+
+Requires clearcut to be installed
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_clearcut/phytab_clearcut.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_clearcut/phytab_clearcut.py Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,143 @@
+import os
+import optparse
+import subprocess
+from multiprocessing import Pool
+
+directory = ""
+results = "results.data"
+extension = ".fs"
+aligned_extension = ".tre"
+datatype = ""
+
+def unescape(string):
+    """Replace ``__xx__`` escape tokens with the characters they stand for.
+
+    The pairs are applied in the fixed order listed below, so the result no
+    longer depends on dictionary ordering and no Python 2-only dict method
+    is needed.
+    """
+    mapped_chars = (
+        ('>', '__gt__'), ('<', '__lt__'),
+        ("'", '__sq__'), ('"', '__dq__'),
+        ('[', '__ob__'), (']', '__cb__'),
+        ('{', '__oc__'), ('}', '__cc__'),
+        ('@', '__at__'),
+        ('\n', '__cn__'), ('\r', '__cr__'), ('\t', '__tc__'),
+        ('#', '__pd__'),
+    )
+
+    for char, token in mapped_chars:
+        string = string.replace(token, char)
+
+    return string
+
+
+def isTabular(file):
+    """Return False if any line of `file` starts with '>', otherwise True."""
+    with open(file) as handle:
+        if any(row[0] == '>' for row in handle):
+            return False
+    return True
+
+def toData(text, name):
+    # Row = file name without "fasta" and ".fs.tre", a tab, then the tree text
+    # with every space removed (file name has fasta when fasta file called).
+    return name.replace("fasta", "").replace(".fs.tre", "") + "\t" + text.replace(" ", "")
+
+#
+#def toData(text):
+# text = text.split('\n')
+# result = ''
+# for line in text:
+# if '>' in line:
+# line = '\n' + line.replace('>', "") + '\t'
+# line = line.replace(" ", "\t")
+# result += line
+# return result[1:] # Index past the first newline char
+
+def clearcut(input):
+    # Run clearcut on one alignment; the tree goes to <file> + aligned_extension.
+    src = directory + os.sep + input
+    subprocess.call(['clearcut', "--in=" + src, "--out=" + src + aligned_extension, "--alignment", "-k", indata])
+
+class Sequence:
+    """One whitespace-separated row: species, family, name, ..., sequence."""
+
+    def __init__(self, string):
+        fields = string.split()
+        self.species, self.family, self.name = fields[0], fields[1], fields[2]
+        self.header = ' '.join(fields[:-1])  # every field except the sequence
+        self.sequence = fields[-1]
+        self.string = string
+
+    def printFASTA(self):
+        return '>%s\n%s\n' % (self.header, self.sequence)
+
+def saveMulti(tabFile):
+    """Append each row of `tabFile`, as FASTA, to the file of its family."""
+    with open(tabFile) as table:
+        for record in (Sequence(row) for row in table):
+            with open(directory + os.sep + record.family + extension, "a") as out:
+                out.write(record.printFASTA())
+
+def saveSingle(fastaFile):
+    # Append the whole input to "fasta" + extension, opening the output once
+    # instead of once per input line.
+    with open(fastaFile) as source:
+        with open(directory + os.sep + "fasta" + extension, "a") as out:
+            out.writelines(source)
+
+def main():
+    """Split the input, build a tree per piece with clearcut, merge the trees.
+    Work files go to <directory option>/data, which must not exist yet.
+    """
+    usage = """%prog [options]
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option('-d', '--directory',
+                      metavar="PATH",
+                      dest='path',
+                      default='.',
+                      help='Path to working directory.')
+    parser.add_option('-i', '--in',
+                      dest='input',
+                      action='store',
+                      type='string',
+                      metavar="FILE",
+                      help='Name of input data.')
+    parser.add_option('-t', '--type',
+                      dest='datatype',
+                      action='store',
+                      type='string',
+                      help='-P for protein. -D for DNA.')
+
+    options, args = parser.parse_args()
+
+    # Module-level values read by clearcut() when the pool runs it.
+    global directory
+    global indata
+    inputFile = unescape(options.input)
+    directory = unescape(options.path) + os.sep + "data"
+    indata = "-" + unescape(options.datatype)  # a dash is put before the value
+
+    os.mkdir(directory)
+
+    if isTabular(inputFile):
+        saveMulti(inputFile)
+    else:
+        saveSingle(inputFile)
+
+    pool = Pool()
+    pending = [name for name in os.listdir(directory) if name.lower().endswith(extension)]
+    pool.map(clearcut, pending)
+
+    # One output row per tree file: name <tab> tree, see toData().
+    trees = [name for name in os.listdir(directory) if name.lower().endswith(aligned_extension)]
+    with open(directory + os.sep + results, "a") as merged:
+        for name in trees:
+            with open(directory + os.sep + name, "r") as part:
+                merged.write(toData(part.read(), name))
+
+if __name__ == '__main__':
+ main()
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_clearcut/phytab_clearcut.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_clearcut/phytab_clearcut.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,25 @@
+
+ clearcut: Generate Neighbor Joining phylogeny. Input can be fasta or phytab format.
+
+ clearcut
+
+
+ phytab_clearcut.py -i $data -t $datatype > $clearcut_stdout 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_prank/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_prank/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Implements PRANK phylogeny aware multiple sequence alignment
+
+(Loytynoja, Goldman 2008)
+
+PRANK Package required to be installed.
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_prank/phytab_prank.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_prank/phytab_prank.py Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,135 @@
+import os
+import optparse
+import subprocess
+from multiprocessing import Pool
+
+directory = ""
+results = "results.data"
+extension = ".fs"
+aligned_extension = ".afa"
+output_extension = ".afa.2.fas"
+
+
+def unescape(string):
+    """Replace ``__xx__`` escape tokens with the characters they stand for.
+
+    The pairs are applied in the fixed order listed below, so the result no
+    longer depends on dictionary ordering and no Python 2-only dict method
+    is needed.
+    """
+    mapped_chars = (
+        ('>', '__gt__'), ('<', '__lt__'),
+        ("'", '__sq__'), ('"', '__dq__'),
+        ('[', '__ob__'), (']', '__cb__'),
+        ('{', '__oc__'), ('}', '__cc__'),
+        ('@', '__at__'),
+        ('\n', '__cn__'), ('\r', '__cr__'), ('\t', '__tc__'),
+        ('#', '__pd__'),
+    )
+
+    for char, token in mapped_chars:
+        string = string.replace(token, char)
+
+    return string
+
+
+def isTabular(file):
+    """Return False if any line of `file` starts with '>', otherwise True."""
+    with open(file) as handle:
+        if any(row[0] == '>' for row in handle):
+            return False
+    return True
+
+
+def toData(text):
+    # Fold FASTA text into one tab-separated row per record.
+    # NOTE(review): the __XX__ -> tab step is assumed to apply to header lines
+    # only; the original nesting was lost - confirm.
+    rows = ''
+    for entry in text.split('\n'):
+        if '>' in entry:
+            entry = ('\n' + entry.replace('>__XX__', "") + '\t').replace("__XX__", "\t")
+        rows += entry
+    return rows[1:]  # drops the newline added before the first header
+
+
+def prank(input):
+    """Print the cwd, then run `prank` on one working-directory file."""
+    path = directory + os.sep + input
+    subprocess.call(['pwd'])  # leftover diagnostic output
+    # main() collects files ending in output_extension, not aligned_extension.
+    subprocess.call(['prank', "-d=" + path, "-o=" + path + aligned_extension, "-quiet"])
+
+class Sequence:
+    """One whitespace-separated row: species, family, name, ..., sequence."""
+    def __init__(self, string):
+        fields = string.split()
+        self.species, self.family, self.name = fields[0], fields[1], fields[2]
+        # prank replaces space with _ so can't join with spaces like muscle does
+        self.header = '__XX__'.join(fields[:-1])
+        self.sequence = fields[-1]
+        self.string = string
+
+    def printFASTA(self):
+        return '>__XX__%s\n%s\n' % (self.header, self.sequence)
+
+
+def saveMulti(tabFile):
+    """Append each row of `tabFile`, as FASTA, to the file of its family."""
+    with open(tabFile) as table:
+        for record in (Sequence(row) for row in table):
+            with open(directory + os.sep + record.family + extension, "a") as out:
+                out.write(record.printFASTA())
+
+
+def saveSingle(fastaFile):
+    # Append the whole input to "fasta" + extension, opening the output once
+    # instead of once per input line.
+    with open(fastaFile) as source:
+        with open(directory + os.sep + "fasta" + extension, "a") as out:
+            out.writelines(source)
+
+
+def main():
+    """Split the input, align every piece with prank, merge the alignments."""
+    usage = """%prog [options]
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option('-d', '--directory',
+                      metavar="PATH",
+                      dest='path',
+                      default='.',
+                      help='Path to working directory.')
+    parser.add_option('-i', '--in',
+                      dest='input',
+                      action='store',
+                      type='string',
+                      metavar="FILE",
+                      help='Name of input data.')
+
+    options, args = parser.parse_args()
+
+    global directory
+    inputFile = unescape(options.input)
+    directory = unescape(options.path) + os.sep + "data"
+
+    os.mkdir(directory)
+
+    if isTabular(inputFile):
+        saveMulti(inputFile)
+    else:
+        saveSingle(inputFile)
+
+    pool = Pool()
+    pending = [name for name in os.listdir(directory) if name.lower().endswith(extension)]
+    pool.map(prank, pending)
+
+    # Only files ending in output_extension are merged into the results file.
+    aligned = [name for name in os.listdir(directory) if name.lower().endswith(output_extension)]
+    with open(directory + os.sep + results, "a") as merged:
+        for name in aligned:
+            with open(directory + os.sep + name, "r") as part:
+                merged.write(toData(part.read()) + "\n")
+
+if __name__ == '__main__':
+ main()
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/phytab_prank/phytab_prank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/phytab_prank/phytab_prank.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,20 @@
+
+ prank: Multiple sequence alignment. Input can be fasta or phytab format.
+
+ prank
+
+
+ phytab_prank.py -i $data > $prank_stdout 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/prottest/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/prottest/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,8 @@
+prottest
+
+Selection of best-fit models of protein evolution
+
+(Abascal, Zardoya, Posada 2005)
+
+
+ProtTest package required to be installed.
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/prottest/prottest.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/prottest/prottest.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,25 @@
+
+ Selection of best-fit models of protein evolution.
+
+ prottest
+
+
+ prottest_wrapper.pl -i $input -o $output
+
+
+
+
+
+
+
+
+
+
+ ProtTest is a bioinformatic tool for the selection of the most appropriate model of protein evolution (among the set of candidate models) for the data at hand. ProtTest makes this selection by finding the model with the smallest Akaike Information Criterion (AIC) or Bayesian Information Criterion (BIC) score. At the same time, ProtTest obtains model-averaged estimates of different parameters (Posada and Buckley 2004) and calculates the importance of each of these parameters. ProtTest differs from its nucleotide homolog Modeltest (Posada and Crandall 1998) in that it does not include likelihood ratio tests (many models implemented in ProtTest are not nested).
+
+http://darwin.uvigo.es/software/prottest.html
+
+Citation:
+Abascal F, Zardoya R, Posada, D. 2005. ProtTest: Selection of best-fit models of protein evolution. Bioinformatics: 21(9):2104-2105.
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/prottest/prottest_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/prottest/prottest_wrapper.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,24 @@
+#!/usr/bin/perl
+
+# Galaxy wrapper for ProtTest.
+# Usage: prottest_wrapper.pl -i <input> -o <output>
+# The flags are not parsed: values are read from argument positions 1 and 3.
+
+use warnings;
+use strict;
+
+#ProtTest directory placed in main user path. Also, changed runProtTest
+#script to include full path of jar file
+my $prottestPath='/home/galaxy/pkgs/ProtTest2.4';
+
+my $input=$ARGV[1];
+my $output=$ARGV[3];
+die "Usage: $0 -i <input> -o <output>\n" unless defined $output;
+
+# List-form system: the file names reach runProtTest as-is, not through a shell.
+system("$prottestPath/runProtTest", '-i', $input, '-o', $output);
+if ($? == -1) {
+    die "Cannot run $prottestPath/runProtTest: $!\n";
+}
+# Report the tool's failure through this wrapper's own exit status.
+exit($? & 127 ? 1 : $? >> 8);
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/prune_taxa/Prune_taxa.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/prune_taxa/Prune_taxa.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,17 @@
+
+ Pruning taxa from a tree or multiple trees
+
+ java -jar /home/galaxy/galaxy-dist/tool-data/shared/jars/phyutility.jar -pr -in $input1 -out $output -names $taxonlist 2>&1
+
+
+
+
+
+
+
+
+
+ Calls phyutility.jar -pr to prune taxa from a trees file.
+ The taxa named in the taxon list are removed from the tree, or from every tree when the file holds multiple trees.
+
+
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/prune_taxa/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/prune_taxa/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Removing taxa from a tree or multiple trees
+
+(Smith, Dunn 2008)
+
+Prune taxa requires Phyloinformatic Utility to be installed. (phyutility.jar)
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/raxml/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/raxml/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Implements maximum likelihood (ML) search for optimal phylogeny
+
+(Stamatakis 2006)
+
+This tool requires the RAxML package to be installed on the system.
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/raxml/raxml.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/raxml/raxml.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,91 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+#raxml.pl Galaxy wrapper calls raxml from raxml.xml
+#xml file contains:
+#raxml.pl [GTR|CAT] [PROT|DNA] [protmodel] [morphmodel] [phylip file] [partition] [constraint] [best_tree?] [invar?] [#bootreps] [seed] [Long?] [outgroup]
+
+##For debugging command line pass, uncomment next
+#for (my $i=0; $i < @ARGV; $i++){
+#	print "Parameter #$i ".$ARGV[$i]."\n\n";
+#}
+#exit;
+
+my $rate_het=shift(@ARGV);          #0 rate heterogeneity: GTR (adds GAMMA) or CAT
+my $datatype = shift(@ARGV);        #1 datatype: PROT or DNA
+my $protmodel = shift(@ARGV);       #2 which protein model
+my $morphmodel = shift(@ARGV);      #3 which morphology multistate model
+my $data_file= shift(@ARGV);        #4 input a phylip file
+my $part_file = shift(@ARGV);       #5 optional partition file ('None' to skip)
+my $constraint_tree = shift(@ARGV); #6 optional constraint tree ('None' to skip)
+my $find_best = shift(@ARGV);       #7 if ML find ML tree as well as bootstrapping
+my $invar = shift(@ARGV);           #8 if INVAR include invariant site parameter in model
+my $nboots = shift(@ARGV);          #9 Number of bootstrap reps
+my $seed = shift(@ARGV);            #10 Bootstrap random seed
+my $long = shift(@ARGV);            #11 'Long' selects the multi-threaded binary
+my $outgroup = shift(@ARGV);        #12 Specify the outgroup (optional)
+
+die "raxml.pl: expected at least 12 arguments, see usage at top of file\n"
+    unless defined $long;
+
+# From shell pipeline
+# raxmlHPC-PTHREADS7.2.6 -T $processors -f a -s $data_name.data -q $data_name.part -m $model -n $data_name -N 100 -x 1234567890 -o Limulus_polyphemus
+# cp RAxML_bestTree.$data_name $data_nameBootBest.tre
+# cp RAxML_bipartitions.$data_name $data_nameBoot.tre
+
+#BUILD THE RAXML COMMAND AS A LIST, ONE ELEMENT PER ARGUMENT, so that it can
+#be run without a shell and no value is ever re-parsed as shell syntax
+my @command;
+#Multi-threaded binary with 8 threads for long runs, MPI binary otherwise
+if($long eq 'Long'){
+    @command = ('raxmlHPC-PTHREADS-SSE3', '-T', '8');
+}else{
+    @command = ('raxmlHPC-MPI-SSE3');
+}
+#Check if find best tree is desired
+push @command, '-f', 'a' if $find_best eq "ML";
+#Next add call to input phylip file
+push @command, '-s', $data_file;
+#Add call to partition file name
+push @command, '-q', $part_file unless $part_file eq 'None';
+
+#Build substitution model: PROT or GTR, then GAMMA or CAT, then optional I,
+#then the protein matrix name for protein data
+my $model;
+if($datatype eq "PROT"){
+    $model = "PROT";
+}elsif($datatype eq "DNA"){
+    $model = "GTR";
+}else{
+    die "raxml.pl: unknown data type '$datatype' (expected PROT or DNA)\n";
+}
+if($rate_het eq "GTR"){
+    $model .= "GAMMA";
+}elsif($rate_het eq "CAT"){
+    $model .= "CAT";
+}
+$model .= "I" if $invar eq "INVAR";
+$model .= $protmodel if $datatype eq "PROT";
+push @command, '-m', $model;
+#Add multistate morphology model
+push @command, '-K', $morphmodel;
+#check constraint tree
+push @command, '-g', $constraint_tree unless $constraint_tree eq 'None';
+#N Bootstraps
+push @command, '-N', $nboots;
+#Bootstrap seed
+push @command, '-x', $seed;
+#Parsimony seed
+push @command, '-p', '1234567';
+#name output files galaxy
+push @command, '-n', 'galaxy';
+#Outgroup (skipped when none is passed)
+push @command, '-o', $outgroup if defined $outgroup && length $outgroup;
+
+print "Galaxy COMMAND BUILD WAS: @command\n";
+
+#Call raxml and pass its exit status on to the caller
+system(@command);
+die "raxml.pl: cannot run $command[0]: $!\n" if $? == -1;
+exit($? & 127 ? 1 : $? >> 8);
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/raxml/raxml.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/raxml/raxml.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,64 @@
+
+ Maximum Likelihood Analysis
+
+ raxml
+
+
+ raxml.pl $GAMMA $PROT $protmodel $morphmodel $data_file $part_file $constraint
+ $ML $INVAR $Boot $seed Long $Out > $raxml_log
+ 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ raxml Home Page:
+ http://www.exelixis-lab.org/software.html
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/scythe/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/scythe/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,4 @@
+Scythe
+------
+Needs the Scythe binary in PATH.
+Get it at: https://github.com/vsbuffalo/scythe
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/scythe/scythe.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/scythe/scythe.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,38 @@
+
+ Scythe - A very simple adapter trimmer.
+
+ scythe
+
+
+ scythe -a $adapter -p $prior $quality -o trimmed_sequences.fastq $sequence > $stdout 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ **Scythe 0.981 BETA**
+ Scythe uses a Naive Bayesian approach to classify contaminant substrings in sequence reads. It considers quality information, which can make it robust in picking out 3'-end adapters, which often include poor quality bases.
+
+ Most next generation sequencing reads have deteriorating quality towards the 3'-end. It's common for a quality-based trimmer to be employed before mapping, assemblies, and analysis to remove these poor quality bases. However, quality-based trimming could remove bases that are helpful in identifying (and removing) 3'-end adapter contaminants. Thus, it is recommended you run Scythe before quality-based trimming, as part of a read quality control pipeline.
+
+ The Bayesian approach Scythe uses compares two likelihood models: the probability of seeing the matches in a sequence given contamination, and not given contamination. Given that the read is contaminated, the probability of seeing a certain number of matches and mismatches is a function of the quality of the sequence. Given the read is not contaminated (and is thus assumed to be random sequence), the probability of seeing a certain number of matches and mismatches is chance. The posterior is calculated across both these likelihood models, and the class (contaminated or not contaminated) with the maximum posterior probability is the class selected.
+
+ See Scythe help: https://github.com/vsbuffalo/scythe
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tab2trees/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tab2trees/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,3 @@
+An RTool that produces phylogeny graphics, one tree per page, from multiple data partitions or data sets
+
+Tools developed by Oakley et al
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tab2trees/makeRtrees.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tab2trees/makeRtrees.pl Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,91 @@
+#!/usr/bin/perl
+
+#This script generates an R script (on STDOUT) that prints trees to a pdf file.
+#Input is a table with one tree per line: tree name, a TAB, then the newick tree.
+#
+#Arguments:
+#  1. input table of tree names and newick trees
+#  2. name of the pdf file the generated R script will write
+#  3. tree type handed to plot() as its type argument
+#  4. 'yes' to plot without tip labels, anything else to show them
+#  5. optional table of species name, TAB, tree name, or 'None'
+use strict;
+use warnings;
+
+my ($filename, $outfile, $treetype, $extiplabels, $labeltaxfile) = @ARGV;
+die "Usage: $0 table out.pdf treetype extips [labelfile|None]\n" if @ARGV < 4;
+
+open my $treefh, '<', $filename or die "Cannot open $filename: $!";
+
+#Return the argument as a single-quoted R string literal, escaping backslashes
+#and single quotes so that names or quoted newick labels cannot break the R code.
+sub r_quote {
+    my ($text) = @_;
+    $text =~ s/([\\'])/\\$1/g;
+    return "'" . $text . "'";
+}
+
+#Split one input line into its TAB separated fields, dropping the line ending
+#(including a trailing carriage return).
+sub split_fields {
+    my ($line) = @_;
+    $line =~ s/\r?\n\z//;
+    return split /\t/, $line;
+}
+
+#Map each tree name to the list of R-quoted species to mark on that tree.
+my %labelhash;
+if (defined $labeltaxfile and $labeltaxfile ne 'None') {
+    open my $labelfh, '<', $labeltaxfile or die "Cannot open $labeltaxfile: $!";
+    while (my $line = <$labelfh>) {
+        my ($speciesname, $treename) = split_fields($line);
+        #lines without a second column name no tree, so there is nothing to label
+        next unless defined $treename;
+        #push autovivifies the list, so first and later species are handled alike
+        push @{ $labelhash{$treename} }, r_quote($speciesname);
+    }
+    close $labelfh;
+}
+
+#'yes' means the tip labels are left out of the plot.
+my $options = $extiplabels eq 'yes'
+    ? ", show.tip.label=FALSE"
+    : ", show.tip.label=TRUE";
+
+print "require(ape);\n";
+print "pdf(file=" . r_quote($outfile) . ");\n";
+
+while (my $line = <$treefh>) {
+    my ($treename, $tree) = split_fields($line);
+    #skip blank or malformed lines that carry no tree
+    next unless defined $tree and length $tree;
+
+    #print the R commands to make tree graphics
+    print "raw_tree <- read.tree(text = " . r_quote($tree) . ");\n";
+    print "raw_tree\$edge.length[ is.na(raw_tree\$edge.length) ] <- 0 \n";
+    print "plot(raw_tree, cex=0.6, type=" . r_quote($treetype) . " $options);\n";
+    print "title(" . r_quote("Tree File: $treename") . ");\n";
+
+    #Add taxon labels, if optional file present and if labels exist for tree
+    next unless exists $labelhash{$treename};
+    print "thetips <- raw_tree\$tip.label \n";
+    print "tolabel <- c(" . join(",", @{ $labelhash{$treename} }) . ")\n";
+    print "labels <- match(tolabel,thetips) \n";
+    print "tiplabels(tip=labels, pch=21, cex=1) \n";
+}
+print "dev.off();\n";
+close $treefh;
+
+#Testing hash arrays
+#my %nums;
+#my $test='odd';
+#for my $n (4,5,6,10) {
+# if ($n % 2) {
+# push @{ $nums{$test} }, $n;
+# } else {
+# push @{ $nums{even} }, $n;
+# }
+#}
+#
+#print join ', ', @{ $nums{even} };
+#print "\n\n";
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tab2trees/phytab2trees.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tab2trees/phytab2trees.sh Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+
+#First call perl script which reads trees and writes the R commands to Rtrees.R; stop if it fails
+/home/galaxy/galaxy-dist/tools/Rtools/makeRtrees.pl "$1" "$2" "$3" "$4" "$5" > Rtrees.R 2>log.txt || exit 1
+
+R --vanilla < Rtrees.R 2>>log.txt
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tab2trees/tab2trees.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tab2trees/tab2trees.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,29 @@
+
+ Create pdf of phylogeny graphics from table of tree names and newick trees
+ phytab2trees.sh $input $output $treetype $extips $labeltax
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tagdust/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tagdust/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,4 @@
+Tagdust
+-------
+Needs Tagdust installed in PATH.
+Get at: http://genome.gsc.riken.jp/osc/english/software/src/tagdust.tgz
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tagdust/tagdust.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tagdust/tagdust.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,35 @@
+
+ TagDust - A program to eliminate artifactual reads from next-generation sequencing data sets.
+
+ tagdust
+
+
+ tagdust -f $false_rate -o reads.clean.fastq -a reads.artifact.fastq $adapter $illumina > $stdout 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ **Tagdust 1.13**
+
+ Tagdust compares sequences used during the preparation of
+ a library to the sequenced reads. A read is annotated as an
+ artifact if a large proportion of its length can be explained by
+ matches to library sequences.
+
+ Tagdust accepts library sequences (e.g. 5' and 3' adaptors) in
+ standard fasta format and reads in either fasta or fastq format.
+
+ See Tagdust help: http://genome.gsc.riken.jp/osc/english/dataresource/
+
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/thinningtrees/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/thinningtrees/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Sub-sample trees from a posterior distribution
+
+(Smith, Dunn 2008)
+
+Thinning Trees requires Phyloinformatic Utility to be installed on the system. (phyutility.jar)
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/thinningtrees/Thinning_trees.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/thinningtrees/Thinning_trees.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,17 @@
+
+ Sub-sample trees from a posterior distribution
+
+ java -jar /home/galaxy/galaxy-dist/tool-data/shared/jars/phyutility.jar -tt $input1 -in $input2 -out $output 2>&1
+
+
+
+
+
+
+
+
+
+ Calls phyutility.jar -tt to sample from a trees file.
+ Trimming (or thinning) trees can be essential if other programs require fewer trees than are present in your files. Phyutility will thin these files to make them more manageable.
+
+
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tree_support/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tree_support/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Calculates support for nodes of a single tree (bootstrap) using a file of multiple trees
+
+(Smith, Dunn 2008)
+
+Tree Support requires Phyloinformatic Utility (phyutility.jar) to be installed on the system.
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/tree_support/tree_support.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/tree_support/tree_support.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,19 @@
+
+ Calculates support for nodes of a single tree (bootstrap) using a file of multiple trees
+
+ phyutility
+
+
+ java -jar /home/galaxy/galaxy-dist/tool-data/shared/jars/phyutility.jar -ts -in $treesfile -tree $besttree -out $outtree
+
+
+
+
+
+
+
+
+
+ Calls phyutility.jar -ts to calculate support for the nodes of a single tree using a file of multiple trees.
+
+
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/vert_tree_format/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/vert_tree_format/README.txt Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,5 @@
+Convert between phylogenetic tree file formats
+
+(Smith, Dunn 2008)
+
+Vert_tree_format requires Phyloinformatic Utility (phyutility.jar) to be installed.
\ No newline at end of file
diff -r c83d7e34cc88 -r 798d8401d420 ucsb_phylogenetics/vert_tree_format/vert_tree_format.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ucsb_phylogenetics/vert_tree_format/vert_tree_format.xml Sat Sep 08 15:33:34 2012 -0400
@@ -0,0 +1,16 @@
+
+ Convert between phylogenetic tree file formats
+
+ java -jar /home/galaxy/galaxy-dist/tool-data/shared/jars/phyutility.jar -vert -in $input -out $output 2>&1
+
+
+
+
+
+
+
+
+ Calls phyutility.jar -vert to convert tree format.
+ The program automatically detects the input tree format, so a newick input is converted to nexus, and vice versa.
+
+