diff getdata/phylota_with_taxid.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/phylota_with_taxid.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,109 @@
+#!/usr/bin/perl -w
+use strict;
+use LWP::Simple;
+use Bio::SeqIO;
+
+my $ti = $ARGV[0];
+my $outfile = $ARGV[1];
+my $phytabfile = $ARGV[2];
+
+open(OUT, ">$outfile") or exit;
+
+my $content = getclustersfromphylota($ti);
+my @weblines = split(/\<\/tr\>/, $content);
+my @ci;
+
+#Parse html from phylota browser to retain just each ci
+foreach(@weblines){
+	if($_ =~ m/getcluster\.cgi/){
+		chomp;
+		$_ =~ s/\&ntype\=1\&db\=184\".+// ;
+		$_ =~ s/(.*?)getcluster\.cgi.+cl\=// ;
+		$_ =~ s/\<\/font\>\<\/td\>// ;
+		chomp;
+		$_ =~ s/^\n// ;
+		push(@ci, $_);
+	}
+}
+
+#get fasta files for trees
+for(my $i=0;$i < @ci; $i++){
+	my $ci = $ci[$i];
+	my $addstring = 'ti'.$ti.'ci'.$ci.'_';
+	my $fastafile = getfastafromphylota($ci,$ti);
+	#Add TI_CI_ to each fastaheader
+	$fastafile =~ s/\>/\>$addstring/g;
+	print OUT $fastafile;
+}
+close(OUT);
+
+#Now convert fasta file to phytab file and write
+open(PHYTAB, ">$phytabfile") or exit;
+# open infile fasta file
+my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta');
+my $total=0;
+# grab sequence object
+while (my $seq = $in_obj->next_seq() ) {
+	my $seq_obj = $in_obj;
+	my $sequenceid = $seq->id;
+	my $species_name = $seq->desc;
+	my $fullheader = $sequenceid." ".$species_name;
+	my $sequence = $seq->seq;
+	my @header = split(/_/, $fullheader);
+	my $cluster = $header[0];
+	my $seqgi = $header[1];
+	$seqgi =~ s/gi//;
+	my $seqti = $header[2];
+	$seqti =~ s/ti//;
+	my $seqsp = $header[3];
+	$seqsp = cleansp($seqsp);
+	print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n";
+}
+close(PHYTAB);
+
+
+
+
+
+
+#**************************************************************
+#sub routines
+
+sub cleansp
+{
+	my $seqsp = shift;
+	$seqsp =~ s/ /_/g;
+	$seqsp =~ s/\.//g;
+	$seqsp =~ s/\'//g;
+	$seqsp =~ s/\-//g;
+	return($seqsp);
+}
+sub getfastafromphylota
+{
+	my $ci=shift;
+	my $ti=shift;
+
+	#print "Writing: CI:$ci TI:$ti\n";
+
+	my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1';
+	my $content = get $url;
+	die "Couldn't get $url" unless defined $content;
+	$content =~ s/\<html\>\<pre\>//;
+	$content =~ s/\<\/html\>//;
+	$content =~ s/\<\/pre\>//;
+	return($content);
+}
+sub getclustersfromphylota
+{
+	my $ti=shift;
+
+	#print "Writing: CI:$ci TI:$ti\n";
+
+	my $url = 'http://phylota.net/cgi-bin/sql_getclusterset.cgi?ti='.$ti.'&ntype=1&piflag=1&dflag=0&db=184';
+	my $content = get $url;
+	die "Couldn't get $url" unless defined $content;
+	$content =~ s/\<html\>\<pre\>//;
+	$content =~ s/\<\/html\>//;
+	$content =~ s/\<\/pre\>//;
+	return($content);
+}