Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff getdata/get_gb.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_gb.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,118 @@ +#!/usr/bin/perl -w +use strict; + +#use FindBin; +#use lib "$FindBin::Bin/lib"; +use Bio::DB::GenBank; +use Bio::SeqIO; + + +my $datafile = $ARGV[0]; +my $datatype = $ARGV[1]; +my $outtype = $ARGV[2]; +my $outfile = $ARGV[3]; +my $manual = $ARGV[4]; +my $mannames = $ARGV[5]; +my $genenames = $ARGV[6]; + + +my $accessions; +my @accnums; +my @newnames; +my $manbin=0; +my @genenames; +my $genebin=0; + +unless($mannames eq ''){ + @newnames = split(/ /,$mannames); + $manbin=1; +} + +unless($genenames eq ''){ + @genenames = split(/ /,$genenames); + $genebin=1; +} + +if($datafile eq 'None'){ + @accnums = split(/ /,$manual); +# if(@accnums != @newnames && $manbin ==1 ){ +# die "Must have the same number of Custom Names as Accession Numbers\n"; +# } +}else{ + open (FILE,"<$datafile") or die "Cannot open file containing accession numbers\n"; + + while (<FILE>) + { + chomp; + next unless ($_); + push(@accnums, $_); + } +} + my $countnames = 0; + foreach (@accnums){ + #Should check input for one word per line and throw error if not, which is not done + + $accessions = $_; + chomp; + if($accessions eq ""){ + die "Put spaces between accession numbers\n"; + } + my $qry_string .= $accessions."[accession]"." "; + + my $GBseq; + my $gb = new Bio::DB::GenBank; + my $query = Bio::DB::Query::GenBank->new + (-query =>$qry_string, + -db =>$datatype); + + my $count; + my $species; + my $seqio; + if($outtype eq "phytab"){ #print phytab format, do not use bioperl as below. + open(OUTFILE, ">>$outfile"); + if( defined ($seqio = $gb->get_Stream_by_query($query)) ){ + # my $seqio = $gb->get_Stream_by_query($query); + while( defined ($GBseq = $seqio->next_seq )) { + my $sequence = $GBseq; # read a sequence object + if($manbin ==1){ #replace GenBank Names with Custom Names + $sequence->id($newnames[$countnames]); + $sequence->desc(''); + $species = $sequence->id; + $countnames++; + }else{ + $species = $sequence->species->binomial; + $species =~ s/ /_/g ; + } + if(@genenames > 0){ + if(@genenames == 1){ + print OUTFILE $species."\t".$genenames[0]."\t".$sequence->accession."\t".$sequence->seq."\n"; + }else{ + print OUTFILE $species."\t".$genenames[$countnames-1]."\t".$sequence->accession."\t".$sequence->seq."\n"; + } + }else{ + print OUTFILE $species."\tNone\t".$sequence->accession."\t".$sequence->seq."\n"; + } + } + }else{ + print "Did not find $accessions\n"; + } + }else{ + my $fh = Bio::SeqIO->newFh(-format=>$outtype, -file=>">>$outfile"); + + if( defined ($seqio = $gb->get_Stream_by_query($query)) ){ + # my $seqio = $gb->get_Stream_by_query($query); + while( defined ($GBseq = $seqio->next_seq )) { + my $sequence = $GBseq; # read a sequence object + if($manbin ==1){ #replace GenBank Names with Custom Names + $sequence->id($newnames[$countnames]); + $sequence->desc(''); + $countnames++; + } + print $fh $sequence; # write a sequence object + } + }else{ + print "Did not find $accessions\n"; + } + } + } +exit;