diff getdata/get_gb.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/get_gb.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,118 @@
+#!/usr/bin/perl -w
+use strict;
+
+#use FindBin;
+#use lib "$FindBin::Bin/lib";
+use Bio::DB::GenBank;
+use Bio::SeqIO;
+
+
+my $datafile = $ARGV[0];
+my $datatype = $ARGV[1];
+my $outtype = $ARGV[2];
+my $outfile = $ARGV[3];
+my $manual = $ARGV[4];
+my $mannames = $ARGV[5];
+my $genenames = $ARGV[6];
+
+
+my $accessions;
+my @accnums;
+my @newnames;
+my $manbin=0;
+my @genenames;
+my $genebin=0;
+
+unless($mannames eq ''){
+	@newnames = split(/ /,$mannames);
+	$manbin=1;
+}
+
+unless($genenames eq ''){
+	@genenames = split(/ /,$genenames);
+	$genebin=1;
+}
+
+if($datafile eq 'None'){
+	@accnums = split(/ /,$manual);
+#	if(@accnums != @newnames && $manbin ==1 ){
+#		die "Must have the same number of Custom Names as Accession Numbers\n";
+#	}
+}else{
+	open (FILE,"<$datafile") or die "Cannot open file containing accession numbers\n";
+
+	while (<FILE>)
+	{
+	        chomp;
+	        next unless ($_);
+		push(@accnums, $_);
+	}
+}
+	my $countnames = 0;
+	foreach (@accnums){
+		#Should check input for one word per line and throw error if not, which is not done
+	
+	        $accessions = $_;
+		chomp;
+		if($accessions eq ""){
+			die "Put spaces between accession numbers\n";
+		}
+	        my $qry_string .= $accessions."[accession]"." ";
+	
+	        my $GBseq;
+	        my $gb = new Bio::DB::GenBank;
+	        my $query = Bio::DB::Query::GenBank->new
+	                (-query   =>$qry_string,
+	                 -db      =>$datatype);
+	
+	        my $count;
+	        my $species;
+		my $seqio;
+		if($outtype eq "phytab"){ #print phytab format, do not use bioperl as below.
+			open(OUTFILE, ">>$outfile");
+			if( defined ($seqio = $gb->get_Stream_by_query($query)) ){
+			#        	my $seqio = $gb->get_Stream_by_query($query);
+	        		while( defined ($GBseq = $seqio->next_seq )) {
+	        		        my $sequence = $GBseq;   # read a sequence object
+					if($manbin ==1){ #replace GenBank Names with Custom Names
+						$sequence->id($newnames[$countnames]);
+						$sequence->desc('');
+						$species = $sequence->id;
+						$countnames++;
+					}else{
+						$species = $sequence->species->binomial;
+						$species =~ s/ /_/g ;
+					}
+					if(@genenames > 0){
+						if(@genenames == 1){
+		        				print OUTFILE $species."\t".$genenames[0]."\t".$sequence->accession."\t".$sequence->seq."\n";
+						}else{
+		        				print OUTFILE $species."\t".$genenames[$countnames-1]."\t".$sequence->accession."\t".$sequence->seq."\n";
+					        }
+					}else{
+		        			print OUTFILE $species."\tNone\t".$sequence->accession."\t".$sequence->seq."\n";
+					}
+	        		}
+			}else{
+				print "Did not find $accessions\n";
+			}
+		}else{
+	        	my $fh = Bio::SeqIO->newFh(-format=>$outtype, -file=>">>$outfile");
+
+			if( defined ($seqio = $gb->get_Stream_by_query($query)) ){
+			#        	my $seqio = $gb->get_Stream_by_query($query);
+	        		while( defined ($GBseq = $seqio->next_seq )) {
+	        		        my $sequence = $GBseq;   # read a sequence object
+					if($manbin ==1){ #replace GenBank Names with Custom Names
+						$sequence->id($newnames[$countnames]);
+						$sequence->desc('');
+						$countnames++;
+					}
+	        		        print $fh $sequence; # write a sequence object
+	        		}
+			}else{
+				print "Did not find $accessions\n";
+			}
+		}
+	}
+exit;