view getdata/phylota_with_taxid.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use Bio::SeqIO;

my $ti = $ARGV[0];
my $outfile = $ARGV[1];
my $phytabfile = $ARGV[2];

open(OUT, ">$outfile") or exit;

my $content = getclustersfromphylota($ti);
my @weblines = split(/\<\/tr\>/, $content);
my @ci;

#Parse html from phylota browser to retain just each ci
foreach(@weblines){
	if($_ =~ m/getcluster\.cgi/){
		chomp;
		$_ =~ s/\&ntype\=1\&db\=184\".+// ;
		$_ =~ s/(.*?)getcluster\.cgi.+cl\=// ;
		$_ =~ s/\<\/font\>\<\/td\>// ;
		chomp;
		$_ =~ s/^\n// ;
		push(@ci, $_);
	}
}

#get fasta files for trees
for(my $i=0;$i < @ci; $i++){
	my $ci = $ci[$i];
	my $addstring = 'ti'.$ti.'ci'.$ci.'_';
	my $fastafile = getfastafromphylota($ci,$ti);
	#Add TI_CI_ to each fastaheader
	$fastafile =~ s/\>/\>$addstring/g;
	print OUT $fastafile;
}
close(OUT);

#Now convert fasta file to phytab file and write
open(PHYTAB, ">$phytabfile") or exit;
# open infile fasta file
my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta');
my $total=0;
# grab sequence object
while (my $seq = $in_obj->next_seq() ) {
	my $seq_obj = $in_obj;
	my $sequenceid = $seq->id;
	my $species_name = $seq->desc;
	my $fullheader = $sequenceid." ".$species_name;
	my $sequence = $seq->seq;
	my @header = split(/_/, $fullheader);
	my $cluster = $header[0];
	my $seqgi = $header[1];
	$seqgi =~ s/gi//;
	my $seqti = $header[2];
	$seqti =~ s/ti//;
	my $seqsp = $header[3];
	$seqsp = cleansp($seqsp);
	print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n";
}
close(PHYTAB);






#**************************************************************
#sub routines

sub cleansp
{
	my $seqsp = shift;
	$seqsp =~ s/ /_/g;
	$seqsp =~ s/\.//g;
	$seqsp =~ s/\'//g;
	$seqsp =~ s/\-//g;
	return($seqsp);
}
sub getfastafromphylota
{
	my $ci=shift;
	my $ti=shift;

	#print "Writing: CI:$ci TI:$ti\n";

	my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1';
	my $content = get $url;
	die "Couldn't get $url" unless defined $content;
	$content =~ s/\<html\>\<pre\>//;
	$content =~ s/\<\/html\>//;
	$content =~ s/\<\/pre\>//;
	return($content);
}
sub getclustersfromphylota
{
	my $ti=shift;

	#print "Writing: CI:$ci TI:$ti\n";

	my $url = 'http://phylota.net/cgi-bin/sql_getclusterset.cgi?ti='.$ti.'&ntype=1&piflag=1&dflag=0&db=184';
	my $content = get $url;
	die "Couldn't get $url" unless defined $content;
	$content =~ s/\<html\>\<pre\>//;
	$content =~ s/\<\/html\>//;
	$content =~ s/\<\/pre\>//;
	return($content);
}