diff getdata/generate_from_phylota.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/generate_from_phylota.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,119 @@
+#!/usr/bin/perl -w
+use strict;
+use LWP::Simple;
+use Bio::SeqIO;
+
+my $infile = $ARGV[0];
+my $database = $ARGV[1];
+my $outfile = $ARGV[2];
+my $treefile = $ARGV[3];
+my $phytabfile = $ARGV[4];
+
+open(IN, "$infile") or exit;
+open(OUT, ">$outfile") or exit;
+open(TREES, ">$treefile") or exit;
+open(DATA, "<$database") or exit;
+my @data=<DATA>;
+close(DATA);
+my @foundtrees;
+my @alltrees;
+while(<IN>){
+        my $line = $_;
+	chomp($line);
+	$line =~ s/ /_/g;
+	print "Finding trees with $line ....";
+	@foundtrees = grep /$line/, @data;
+	my $numtrees = scalar @foundtrees;
+	print " $numtrees Trees\n";
+	if($numtrees == 0){
+		my @genus = split(/_/,$line);
+		@foundtrees = grep /$genus[0]/, @data;
+		print "\tTrying genus $genus[0]";
+		my $numtrees = scalar @foundtrees;
+		print " $numtrees genus Trees\n";
+	}
+	push(@alltrees,@foundtrees);
+}
+
+@alltrees = uniq(@alltrees);
+print TREES @alltrees;
+
+#get fasta files for trees
+for(my $i=0;$i < @alltrees; $i++){
+	my @tablines = split(/\t/,$alltrees[$i]);
+	my @tici = split(/_/, $tablines[0]);
+	my $ti = $tici[0];
+	my $ci = $tici[1];
+	my $addstring = $ti.$ci."_";
+	$ti =~ s/ti//;
+	$ci =~ s/ci//;
+	my $fastafile = getfastafromphylota($ci,$ti);
+	#Add TI_CI_ to each fastaheader
+	$fastafile =~ s/\>/\>$addstring/g;
+	print OUT $fastafile;
+}
+close(IN);
+close(OUT);
+close(TREES);
+
+#Now convert fasta file to phytab file and write
+open(PHYTAB, ">$phytabfile") or exit;
+# open infile fasta file
+my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta');
+my $total=0;
+# grab sequence object
+while (my $seq = $in_obj->next_seq() ) {
+	my $seq_obj = $in_obj;
+	my $sequenceid = $seq->id;
+	my $species_name = $seq->desc;
+	my $fullheader = $sequenceid." ".$species_name;
+	my $sequence = $seq->seq;
+	my @header = split(/_/, $fullheader);
+	my $cluster = $header[0];
+	my $seqgi = $header[1];
+	$seqgi =~ s/gi//;
+	my $seqti = $header[2];
+	$seqti =~ s/ti//;
+	my $seqsp = $header[3];
+	$seqsp = cleansp($seqsp);
+	print "Writing phytab for $seqsp\n";
+	print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n";
+}
+close(PHYTAB);
+
+#remove duplicate lines (trees)
+sub uniq {
+    my %seen = ();
+    my @r = ();
+    foreach my $a (@_) {
+        unless ($seen{$a}) {
+            push @r, $a;
+            $seen{$a} = 1;
+        }
+    }
+    return @r;
+}
+sub cleansp
+{
+	my $seqsp = shift;
+	$seqsp =~ s/ /_/g;
+	$seqsp =~ s/\.//g;
+	$seqsp =~ s/\'//g;
+	$seqsp =~ s/\-//g;
+	return($seqsp);
+}
+sub getfastafromphylota
+{
+	my $ci=shift;
+	my $ti=shift;
+
+	print "Writing: CI:$ci TI:$ti\n";
+
+	my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1';
+	my $content = get $url;
+	die "Couldn't get $url" unless defined $content;
+	$content =~ s/\<html\>\<pre\>//;
+	$content =~ s/\<\/html\>//;
+	$content =~ s/\<\/pre\>//;
+	return($content);
+}