Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison getdata/generate_from_phylota.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
| author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
|---|---|
| date | Tue, 11 Mar 2014 12:19:13 -0700 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5b9a38ec4a39 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 use strict; | |
| 3 use LWP::Simple; | |
| 4 use Bio::SeqIO; | |
| 5 | |
| 6 my $infile = $ARGV[0]; | |
| 7 my $database = $ARGV[1]; | |
| 8 my $outfile = $ARGV[2]; | |
| 9 my $treefile = $ARGV[3]; | |
| 10 my $phytabfile = $ARGV[4]; | |
| 11 | |
| 12 open(IN, "$infile") or exit; | |
| 13 open(OUT, ">$outfile") or exit; | |
| 14 open(TREES, ">$treefile") or exit; | |
| 15 open(DATA, "<$database") or exit; | |
| 16 my @data=<DATA>; | |
| 17 close(DATA); | |
| 18 my @foundtrees; | |
| 19 my @alltrees; | |
| 20 while(<IN>){ | |
| 21 my $line = $_; | |
| 22 chomp($line); | |
| 23 $line =~ s/ /_/g; | |
| 24 print "Finding trees with $line ...."; | |
| 25 @foundtrees = grep /$line/, @data; | |
| 26 my $numtrees = scalar @foundtrees; | |
| 27 print " $numtrees Trees\n"; | |
| 28 if($numtrees == 0){ | |
| 29 my @genus = split(/_/,$line); | |
| 30 @foundtrees = grep /$genus[0]/, @data; | |
| 31 print "\tTrying genus $genus[0]"; | |
| 32 my $numtrees = scalar @foundtrees; | |
| 33 print " $numtrees genus Trees\n"; | |
| 34 } | |
| 35 push(@alltrees,@foundtrees); | |
| 36 } | |
| 37 | |
| 38 @alltrees = uniq(@alltrees); | |
| 39 print TREES @alltrees; | |
| 40 | |
| 41 #get fasta files for trees | |
| 42 for(my $i=0;$i < @alltrees; $i++){ | |
| 43 my @tablines = split(/\t/,$alltrees[$i]); | |
| 44 my @tici = split(/_/, $tablines[0]); | |
| 45 my $ti = $tici[0]; | |
| 46 my $ci = $tici[1]; | |
| 47 my $addstring = $ti.$ci."_"; | |
| 48 $ti =~ s/ti//; | |
| 49 $ci =~ s/ci//; | |
| 50 my $fastafile = getfastafromphylota($ci,$ti); | |
| 51 #Add TI_CI_ to each fastaheader | |
| 52 $fastafile =~ s/\>/\>$addstring/g; | |
| 53 print OUT $fastafile; | |
| 54 } | |
| 55 close(IN); | |
| 56 close(OUT); | |
| 57 close(TREES); | |
| 58 | |
| 59 #Now convert fasta file to phytab file and write | |
| 60 open(PHYTAB, ">$phytabfile") or exit; | |
| 61 # open infile fasta file | |
| 62 my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta'); | |
| 63 my $total=0; | |
| 64 # grab sequence object | |
| 65 while (my $seq = $in_obj->next_seq() ) { | |
| 66 my $seq_obj = $in_obj; | |
| 67 my $sequenceid = $seq->id; | |
| 68 my $species_name = $seq->desc; | |
| 69 my $fullheader = $sequenceid." ".$species_name; | |
| 70 my $sequence = $seq->seq; | |
| 71 my @header = split(/_/, $fullheader); | |
| 72 my $cluster = $header[0]; | |
| 73 my $seqgi = $header[1]; | |
| 74 $seqgi =~ s/gi//; | |
| 75 my $seqti = $header[2]; | |
| 76 $seqti =~ s/ti//; | |
| 77 my $seqsp = $header[3]; | |
| 78 $seqsp = cleansp($seqsp); | |
| 79 print "Writing phytab for $seqsp\n"; | |
| 80 print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n"; | |
| 81 } | |
| 82 close(PHYTAB); | |
| 83 | |
| 84 #remove duplicate lines (trees) | |
| 85 sub uniq { | |
| 86 my %seen = (); | |
| 87 my @r = (); | |
| 88 foreach my $a (@_) { | |
| 89 unless ($seen{$a}) { | |
| 90 push @r, $a; | |
| 91 $seen{$a} = 1; | |
| 92 } | |
| 93 } | |
| 94 return @r; | |
| 95 } | |
| 96 sub cleansp | |
| 97 { | |
| 98 my $seqsp = shift; | |
| 99 $seqsp =~ s/ /_/g; | |
| 100 $seqsp =~ s/\.//g; | |
| 101 $seqsp =~ s/\'//g; | |
| 102 $seqsp =~ s/\-//g; | |
| 103 return($seqsp); | |
| 104 } | |
| 105 sub getfastafromphylota | |
| 106 { | |
| 107 my $ci=shift; | |
| 108 my $ti=shift; | |
| 109 | |
| 110 print "Writing: CI:$ci TI:$ti\n"; | |
| 111 | |
| 112 my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1'; | |
| 113 my $content = get $url; | |
| 114 die "Couldn't get $url" unless defined $content; | |
| 115 $content =~ s/\<html\>\<pre\>//; | |
| 116 $content =~ s/\<\/html\>//; | |
| 117 $content =~ s/\<\/pre\>//; | |
| 118 return($content); | |
| 119 } |
