comparison getdata/phylota_with_taxid.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/perl -w
2 use strict;
3 use LWP::Simple;
4 use Bio::SeqIO;
5
# Positional arguments: the id sent to PhyLoTA as the `ti` URL parameter (a
# taxon id, going by the script name), the FASTA output path, and the phytab
# output path.
my $ti = $ARGV[0];
my $outfile = $ARGV[1];
my $phytabfile = $ARGV[2];

# Refuse to run with missing arguments rather than fetching with an undefined
# id and then stopping without any message.
unless (defined $ti && defined $outfile && defined $phytabfile) {
    die "Usage: $0 <ti> <fasta_outfile> <phytab_outfile>\n";
}

# Three-argument open with a lexical handle; a failure is reported with the
# system error instead of exiting silently with a success status.
open(my $fasta_out, '>', $outfile)
    or die "Cannot open '$outfile' for writing: $!\n";

my $content = getclustersfromphylota($ti);
my @weblines = split(/\<\/tr\>/, $content);
my @ci;

#Parse html from phylota browser to retain just each ci
foreach my $row (@weblines) {
    # Only table rows that link to getcluster.cgi carry a cluster id.
    next unless $row =~ m/getcluster\.cgi/;
    chomp $row;
    # Cut from the trailing query parameters to the end of that line ...
    $row =~ s/\&ntype\=1\&db\=184\".+// ;
    # ... then cut everything up to and including "cl=", which leaves the id.
    $row =~ s/(.*?)getcluster\.cgi.+cl\=// ;
    $row =~ s/\<\/font\>\<\/td\>// ;
    chomp $row;
    $row =~ s/^\n// ;
    push(@ci, $row);
}

#get fasta files for trees
foreach my $cluster_id (@ci) {
    my $addstring = 'ti'.$ti.'ci'.$cluster_id.'_';
    my $fastafile = getfastafromphylota($cluster_id,$ti);
    #Add TI_CI_ to each fastaheader
    $fastafile =~ s/\>/\>$addstring/g;
    print {$fasta_out} $fastafile;
}

# Buffered write errors only surface at close, so check it.
close($fasta_out) or die "Error closing '$outfile': $!\n";
39
#Now convert fasta file to phytab file and write
# A three-argument open on an undefined path would create an anonymous
# temporary file, so reject a missing path explicitly.
die "No phytab output file given\n"
    unless defined $phytabfile && length $phytabfile;
open(my $phytab_out, '>', $phytabfile)
    or die "Cannot open '$phytabfile' for writing: $!\n";

# open infile fasta file
my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta');

# grab sequence object
while (my $seq = $in_obj->next_seq() ) {
    my $sequenceid = $seq->id;
    my $species_name = $seq->desc;
    my $sequence = $seq->seq;

    # Treat missing parts as empty strings: the printed row is the same as
    # before, but no "uninitialized value" warnings are raised under -w.
    for my $part ($sequenceid, $species_name, $sequence) {
        $part = '' unless defined $part;
    }

    # Rejoin id and description, then split on underscores. Field 0 is the
    # ti/ci prefix, field 1 the gi field and field 3 the species label
    # (presumably "<prefix>_gi<N>_ti<N>_<species>" -- confirm against the
    # PhyLoTA FASTA output). Field 2 is not written to the phytab file.
    my $fullheader = $sequenceid." ".$species_name;
    my @header = split(/_/, $fullheader);
    my ($cluster, $seqgi, $seqsp) =
        map { defined $_ ? $_ : '' } @header[0, 1, 3];

    $seqgi =~ s/gi//;
    $seqsp = cleansp($seqsp);

    # One tab-separated row per sequence: species, cluster, gi, sequence.
    print {$phytab_out} join("\t", $seqsp, $cluster, $seqgi, $sequence), "\n";
}

# Buffered write errors only surface at close, so check it.
close($phytab_out) or die "Error closing '$phytabfile': $!\n";
63
64
65
66
67
68
69 #**************************************************************
70 #sub routines
71
# Normalize a species label for use as a phytab field.
#
# Every space becomes an underscore, and every period, apostrophe and hyphen
# is deleted. An undefined argument is treated as the empty string, so a
# caller with a missing header field gets '' back instead of a run of
# "uninitialized value" warnings.
#
# Parameters: $seqsp - the raw label (may be undef)
# Returns:    the cleaned string
sub cleansp
{
    my $seqsp = shift;
    return '' unless defined $seqsp;

    # Plain character mapping, so tr/// is enough (no regex needed).
    $seqsp =~ tr/ /_/;
    $seqsp =~ tr/.'\-//d;
    return($seqsp);
}
# Download the FASTA text of one PhyLoTA cluster.
#
# Parameters: cluster id (sent as `cl`), then the `ti` id.
# Returns:    the response body with the first "<html><pre>", "</html>" and
#             "</pre>" occurrences removed.
# Dies when the HTTP fetch returns nothing.
sub getfastafromphylota
{
    my ($ci, $ti) = @_;

    my $url = join('',
        'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti=',
        $ti,
        '&cl=',
        $ci,
        '&ntype=1');

    my $content = get($url);
    if (!defined $content) {
        die "Couldn't get $url";
    }

    # Strip the HTML wrapper the CGI puts around the plain-text payload.
    $content =~ s{\<html\>\<pre\>}{};
    $content =~ s{\<\/html\>}{};
    $content =~ s{\<\/pre\>}{};

    return $content;
}
# Download the PhyLoTA cluster-set page for one `ti` id.
#
# Parameters: the `ti` id.
# Returns:    the response body with the first "<html><pre>", "</html>" and
#             "</pre>" occurrences removed.
# Dies when the HTTP fetch returns nothing.
sub getclustersfromphylota
{
    my ($ti) = @_;

    my $url = join('',
        'http://phylota.net/cgi-bin/sql_getclusterset.cgi?ti=',
        $ti,
        '&ntype=1&piflag=1&dflag=0&db=184');

    my $content = get($url);
    if (!defined $content) {
        die "Couldn't get $url";
    }

    # Strip the HTML wrapper markers from the returned page.
    $content =~ s{\<html\>\<pre\>}{};
    $content =~ s{\<\/html\>}{};
    $content =~ s{\<\/pre\>}{};

    return $content;
}