Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison getdata/generate_from_phylota.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 use LWP::Simple; | |
4 use Bio::SeqIO; | |
5 | |
6 my $infile = $ARGV[0]; | |
7 my $database = $ARGV[1]; | |
8 my $outfile = $ARGV[2]; | |
9 my $treefile = $ARGV[3]; | |
10 my $phytabfile = $ARGV[4]; | |
11 | |
12 open(IN, "$infile") or exit; | |
13 open(OUT, ">$outfile") or exit; | |
14 open(TREES, ">$treefile") or exit; | |
15 open(DATA, "<$database") or exit; | |
16 my @data=<DATA>; | |
17 close(DATA); | |
18 my @foundtrees; | |
19 my @alltrees; | |
20 while(<IN>){ | |
21 my $line = $_; | |
22 chomp($line); | |
23 $line =~ s/ /_/g; | |
24 print "Finding trees with $line ...."; | |
25 @foundtrees = grep /$line/, @data; | |
26 my $numtrees = scalar @foundtrees; | |
27 print " $numtrees Trees\n"; | |
28 if($numtrees == 0){ | |
29 my @genus = split(/_/,$line); | |
30 @foundtrees = grep /$genus[0]/, @data; | |
31 print "\tTrying genus $genus[0]"; | |
32 my $numtrees = scalar @foundtrees; | |
33 print " $numtrees genus Trees\n"; | |
34 } | |
35 push(@alltrees,@foundtrees); | |
36 } | |
37 | |
38 @alltrees = uniq(@alltrees); | |
39 print TREES @alltrees; | |
40 | |
41 #get fasta files for trees | |
42 for(my $i=0;$i < @alltrees; $i++){ | |
43 my @tablines = split(/\t/,$alltrees[$i]); | |
44 my @tici = split(/_/, $tablines[0]); | |
45 my $ti = $tici[0]; | |
46 my $ci = $tici[1]; | |
47 my $addstring = $ti.$ci."_"; | |
48 $ti =~ s/ti//; | |
49 $ci =~ s/ci//; | |
50 my $fastafile = getfastafromphylota($ci,$ti); | |
51 #Add TI_CI_ to each fastaheader | |
52 $fastafile =~ s/\>/\>$addstring/g; | |
53 print OUT $fastafile; | |
54 } | |
55 close(IN); | |
56 close(OUT); | |
57 close(TREES); | |
58 | |
59 #Now convert fasta file to phytab file and write | |
60 open(PHYTAB, ">$phytabfile") or exit; | |
61 # open infile fasta file | |
62 my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta'); | |
63 my $total=0; | |
64 # grab sequence object | |
65 while (my $seq = $in_obj->next_seq() ) { | |
66 my $seq_obj = $in_obj; | |
67 my $sequenceid = $seq->id; | |
68 my $species_name = $seq->desc; | |
69 my $fullheader = $sequenceid." ".$species_name; | |
70 my $sequence = $seq->seq; | |
71 my @header = split(/_/, $fullheader); | |
72 my $cluster = $header[0]; | |
73 my $seqgi = $header[1]; | |
74 $seqgi =~ s/gi//; | |
75 my $seqti = $header[2]; | |
76 $seqti =~ s/ti//; | |
77 my $seqsp = $header[3]; | |
78 $seqsp = cleansp($seqsp); | |
79 print "Writing phytab for $seqsp\n"; | |
80 print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n"; | |
81 } | |
82 close(PHYTAB); | |
83 | |
84 #remove duplicate lines (trees) | |
85 sub uniq { | |
86 my %seen = (); | |
87 my @r = (); | |
88 foreach my $a (@_) { | |
89 unless ($seen{$a}) { | |
90 push @r, $a; | |
91 $seen{$a} = 1; | |
92 } | |
93 } | |
94 return @r; | |
95 } | |
96 sub cleansp | |
97 { | |
98 my $seqsp = shift; | |
99 $seqsp =~ s/ /_/g; | |
100 $seqsp =~ s/\.//g; | |
101 $seqsp =~ s/\'//g; | |
102 $seqsp =~ s/\-//g; | |
103 return($seqsp); | |
104 } | |
105 sub getfastafromphylota | |
106 { | |
107 my $ci=shift; | |
108 my $ti=shift; | |
109 | |
110 print "Writing: CI:$ci TI:$ti\n"; | |
111 | |
112 my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1'; | |
113 my $content = get $url; | |
114 die "Couldn't get $url" unless defined $content; | |
115 $content =~ s/\<html\>\<pre\>//; | |
116 $content =~ s/\<\/html\>//; | |
117 $content =~ s/\<\/pre\>//; | |
118 return($content); | |
119 } |