comparison getdata/generate_from_phylota.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/perl -w
2 use strict;
3 use LWP::Simple;
4 use Bio::SeqIO;
5
6 my $infile = $ARGV[0];
7 my $database = $ARGV[1];
8 my $outfile = $ARGV[2];
9 my $treefile = $ARGV[3];
10 my $phytabfile = $ARGV[4];
11
12 open(IN, "$infile") or exit;
13 open(OUT, ">$outfile") or exit;
14 open(TREES, ">$treefile") or exit;
15 open(DATA, "<$database") or exit;
16 my @data=<DATA>;
17 close(DATA);
18 my @foundtrees;
19 my @alltrees;
20 while(<IN>){
21 my $line = $_;
22 chomp($line);
23 $line =~ s/ /_/g;
24 print "Finding trees with $line ....";
25 @foundtrees = grep /$line/, @data;
26 my $numtrees = scalar @foundtrees;
27 print " $numtrees Trees\n";
28 if($numtrees == 0){
29 my @genus = split(/_/,$line);
30 @foundtrees = grep /$genus[0]/, @data;
31 print "\tTrying genus $genus[0]";
32 my $numtrees = scalar @foundtrees;
33 print " $numtrees genus Trees\n";
34 }
35 push(@alltrees,@foundtrees);
36 }
37
38 @alltrees = uniq(@alltrees);
39 print TREES @alltrees;
40
41 #get fasta files for trees
42 for(my $i=0;$i < @alltrees; $i++){
43 my @tablines = split(/\t/,$alltrees[$i]);
44 my @tici = split(/_/, $tablines[0]);
45 my $ti = $tici[0];
46 my $ci = $tici[1];
47 my $addstring = $ti.$ci."_";
48 $ti =~ s/ti//;
49 $ci =~ s/ci//;
50 my $fastafile = getfastafromphylota($ci,$ti);
51 #Add TI_CI_ to each fastaheader
52 $fastafile =~ s/\>/\>$addstring/g;
53 print OUT $fastafile;
54 }
55 close(IN);
56 close(OUT);
57 close(TREES);
58
59 #Now convert fasta file to phytab file and write
60 open(PHYTAB, ">$phytabfile") or exit;
61 # open infile fasta file
62 my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta');
63 my $total=0;
64 # grab sequence object
65 while (my $seq = $in_obj->next_seq() ) {
66 my $seq_obj = $in_obj;
67 my $sequenceid = $seq->id;
68 my $species_name = $seq->desc;
69 my $fullheader = $sequenceid." ".$species_name;
70 my $sequence = $seq->seq;
71 my @header = split(/_/, $fullheader);
72 my $cluster = $header[0];
73 my $seqgi = $header[1];
74 $seqgi =~ s/gi//;
75 my $seqti = $header[2];
76 $seqti =~ s/ti//;
77 my $seqsp = $header[3];
78 $seqsp = cleansp($seqsp);
79 print "Writing phytab for $seqsp\n";
80 print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n";
81 }
82 close(PHYTAB);
83
84 #remove duplicate lines (trees)
85 sub uniq {
86 my %seen = ();
87 my @r = ();
88 foreach my $a (@_) {
89 unless ($seen{$a}) {
90 push @r, $a;
91 $seen{$a} = 1;
92 }
93 }
94 return @r;
95 }
96 sub cleansp
97 {
98 my $seqsp = shift;
99 $seqsp =~ s/ /_/g;
100 $seqsp =~ s/\.//g;
101 $seqsp =~ s/\'//g;
102 $seqsp =~ s/\-//g;
103 return($seqsp);
104 }
105 sub getfastafromphylota
106 {
107 my $ci=shift;
108 my $ti=shift;
109
110 print "Writing: CI:$ci TI:$ti\n";
111
112 my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1';
113 my $content = get $url;
114 die "Couldn't get $url" unless defined $content;
115 $content =~ s/\<html\>\<pre\>//;
116 $content =~ s/\<\/html\>//;
117 $content =~ s/\<\/pre\>//;
118 return($content);
119 }