diff ExtractSeqsFromFasta.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ExtractSeqsFromFasta.pl	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,397 @@
+#!/usr/bin/perl -w
+#
+# This script extracts sequences from a multi-sequence FASTA file
+# based on a list of accession numbers / IDs
+#
+# =====================================================
+# $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $
+# $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $
+# $LastChangedRevision: 15 $
+# $LastChangedBy: pieter $
+# =====================================================
+
+#
+# initialise environment
+#
+use strict;
+use Getopt::Std;
+use Log::Log4perl qw(:easy);
+
+my %log_levels = (
+	'ALL'   => $ALL,
+	'TRACE' => $TRACE,
+	'DEBUG' => $DEBUG,
+	'INFO'  => $INFO,
+	'WARN'  => $WARN,
+	'ERROR' => $ERROR,
+	'FATAL' => $FATAL,
+	'OFF'   => $OFF,
+);
+my %match_logic_types = (
+	'literal'   => 1,
+	'regex' 	=> 1,
+);
+
+#
+# Get options.
+#
+my %opts;
+Getopt::Std::getopts('ul:i:o:f:m:', \%opts);
+
+my $input						= $opts{'i'};
+my $output						= $opts{'o'};
+my $filter						= $opts{'f'};
+my $log_level					= $opts{'l'};
+my $match_logic					= $opts{'m'};
+my $ignore_accession_version	= $opts{'u'};
+
+#
+# Configure logging.
+#
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'INFO');
+
+# Reset log level to default if user specified illegal log level.
+$log_level = (
+	defined($log_levels{$log_level})
+	? $log_levels{$log_level}
+	: $log_levels{'INFO'});
+
+#Log::Log4perl->init('log4perl.properties');
+Log::Log4perl->easy_init(
+
+	#{ level    => $log_level,
+	#  file     => ">>ExtractSeqsFromFasta.log",
+	#  layout   => '%F{1}-%L-%M: %m%n' },
+	{
+		level  => $log_level,
+		file   => "STDOUT",
+		layout => '%d L:%L %p> %m%n'
+	},
+);
+my $logger = Log::Log4perl::get_logger();
+
+#
+# Check user input.
+#
+unless (defined($input) && defined($output) && defined($filter)) {
+	_Usage();
+	exit;
+}
+if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) {
+	_Usage();
+	exit;
+}
+if ($input eq $output) {
+	$logger->fatal('Output file is the same as the input file. Select a different file for the output.');
+	exit;
+}
+
+#
+# Check input and filter file.
+#
+unless (-e $input && -f $input && -r $input) {
+	$logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
+	exit;
+}
+unless (-e $filter && -f $filter && -r $filter) {
+	$logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!);
+	exit;
+}
+#
+# Check match logic.
+#
+$match_logic = (defined($match_logic) ? $match_logic : 'literal');
+unless (defined($match_logic_types{$match_logic})) {
+	$logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.');
+	exit;
+}
+
+#
+# Read accession numbers to search the FASTA file(s) for
+#
+my $wanted          = _CreateLookupHash($filter, $match_logic);
+my $seqs_to_extract = scalar(keys(%{$wanted}));
+$logger->info('Number of sequences to search for: ' . $seqs_to_extract);
+
+#
+# Extract seqs
+#
+my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic);
+
+$logger->info('Extracted ' . $counter . ' sequences.');
+$logger->info('Finished!');
+
+#
+##
+### Internal subs.
+##
+#
+
+sub _CreateLookupHash {
+
+	my ($file_path, $match_logic) = @_;
+
+	$logger->info('Parsing ' . $file_path . '...');
+
+	my %wanted;
+	my $file_path_fh;
+
+	eval {
+		open($file_path_fh, "<$file_path");
+	};
+	if ($@) {
+		$logger->fatal('Cannot read ID file: ' . $@);
+		exit;
+	}
+	
+	LINE: while (my $line = <$file_path_fh>) {
+
+		$line =~ s/[\r\n]+//g;
+
+		if ($match_logic eq 'literal') {
+			
+			if ($line =~ m/([a-z0-9_\-\.]+)/i) {
+
+				my $id = $1;
+				
+				if ($ignore_accession_version) {
+						
+					#
+					# Remove version from accession number if it was versioned.
+					#
+					if ($id =~ m/([^\.]+)\.(\d+)/) {
+						
+						$id = $1;
+						
+					}
+				}
+							
+				$logger->debug('Found accession/ID ' . $id);
+				$wanted{$id} = 1;
+			
+			} else {
+				$logger->warn('Accession/ID in unsupported format: ' . $line);
+				next;
+			}
+			
+		} elsif ($match_logic eq 'regex') {
+			
+			if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) {
+				my $regex = $1;
+				$logger->debug('Found regex ' . $regex);
+				$wanted{$regex} = 1;
+			} else {
+				$logger->warn('Regex in unsupported format: ' . $line);
+				next;
+			}
+			
+		}
+	}
+
+	close($file_path_fh);
+	$logger->info('Created ID lookup list.');
+
+	return (\%wanted);
+
+}
+
+sub _ExtractSeqs {
+
+	my ($path_from, $path_to, $wanted, $match_logic) = @_;
+
+	$logger->info('Parsing ' . $path_from . '...');
+
+	my $extracted_seqs = 0;
+	my $found          = 0;
+	my $path_from_fh;
+	my $path_to_fh;
+
+	eval {
+		open($path_from_fh,	"<$path_from");
+	};
+	if ($@) {
+		$logger->fatal('Cannot read FASTA file: ' . $@);
+		exit;
+	}
+	eval {
+		open($path_to_fh,	">$path_to");
+	};
+	if ($@) {
+		$logger->fatal('Cannot write FASTA file: ' . $@);
+		exit;
+	}
+	
+	LINE: while (my $line = <$path_from_fh>) {
+
+		if ($line =~ m/^>/) {
+			$logger->debug('Found header line: ' . $line);
+			$found = 0;
+			my @header_ids;
+
+            #
+            # Check for the presence of some frequently occurring naming schemes:
+            #
+            # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig).
+            # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
+            # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
+            # >jgi|Triad1|1|gw1.3.1.1
+            #
+			if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) {
+
+				#
+				# One or more namespace prefixed IDs.
+				#
+				my $concatenated_namespace_prefixed_ids = $1;
+				$logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids);
+
+				#   database_namespace	= $3 && $5;
+				#   id					= $4 && $6;
+				#   description			= $8;
+				my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids);
+
+				foreach my $prefixed_id (@namespace_prefixed_ids) {
+
+					if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) {
+
+						my $this_id = $2;
+						
+						$logger->debug('Found ID: ' . $this_id);
+						push(@header_ids, $this_id);
+
+					} else {
+
+						$logger->warn(
+							'This should have been an optionally prefixed ID, '
+							  . 'but failed to match the corresponding regex: '
+							  . $prefixed_id);
+
+					}
+				}
+			
+			} elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) {
+
+				#
+				# One or more unprefixed IDs.
+				#
+				my $concatenated_ids = $1;
+				$logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids);
+
+				#   id					= $3 && $4;
+				#   description			= $7;
+				my @unprefixed_ids = split(/[|;]/, $concatenated_ids);
+
+				foreach my $unprefixed_id (@unprefixed_ids) {
+
+					$logger->debug('Found ID: ' . $unprefixed_id);
+					push(@header_ids, $unprefixed_id);
+
+				}
+
+			} else {
+
+            	#
+            	# Something else.
+            	#
+            	# The FASTA header line can basically have any format,
+            	# so this is probably one of the less frequent occurring annotation schemes.
+            	# Therefore we try to see if the IDs we are looking for are present anywhere
+            	# in the header line up until the first white space or new line.
+            	# This may be tricky as we might match other annotation from other proteins
+            	# like a description from a 'best' BLAST hit that contains one of the IDs we
+            	# are looking for. Hence, in such a case this sequence is *not* the one of
+            	# the IDs we looking for, but similar to that one at best.
+            	#
+				if ($line =~ m/>([^\n\r\s]+)/) {
+
+					my $putative_id = $1;
+					$logger->debug('Found putative ID in header: ' . $putative_id);
+					push(@header_ids, $putative_id);
+
+				} else {
+					$logger->warn('Cannot identify IDs in this header: ' . $line);
+				}
+			}
+
+			
+			if ($ignore_accession_version) {
+				
+				for my $inx (0 .. $#header_ids) {
+					
+					if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) {
+							
+						my $this_unversioned_id = $1;
+						my $version_number = $2;
+						$logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.');
+						$header_ids[$inx] = $this_unversioned_id;
+							
+					}
+				}
+			}
+			
+			foreach my $id (@header_ids) {
+				$logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.');
+				
+				if ($match_logic eq 'literal') {
+				
+					if (${$wanted}{$id}) {
+						$logger->info('Literal bingo, preserving sequence with ID ' . $id);
+						$found = 1;
+						$extracted_seqs++;
+						last;
+					}
+				
+				} elsif ($match_logic eq 'regex') {
+					
+					foreach my $regex (keys(%{$wanted})) {
+						
+						if ($id =~ m/$regex/) {
+							$logger->info('Regex bingo, preserving sequence with ID ' . $id);
+							$found = 1;
+							$extracted_seqs++;
+							last;
+						}
+					}
+				}
+			}
+		}
+		if ($found) {
+			print($path_to_fh $line);
+		}
+
+	}
+
+	close($path_from_fh);
+	close($path_to_fh);
+
+	return ($extracted_seqs);
+
+}
+
+sub _Usage {
+
+	print STDERR "\n"
+	  . "extractSeqsFromFasta.pl:\n"
+	  . "   Extracts sequences from a multi-sequence FASTA file.\n" . "\n"
+	  . "Usage:\n" . "\n"
+	  . "   extractSeqsFromFasta.pl options\n" . "\n"
+	  . "Available options are:\n" . "\n"
+	  . "   -i [file]   Input file in FASTA format.\n"
+	  . "   -o [file]   Output file in FASTA format where the extracted files will be saved.\n"
+	  . "   -f [file]   Filter file containing accession numbers or IDs that shoud be extracted from the input.\n"
+	  . "               (One accession/ID per line)\n"
+	  . "   -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n"
+	  . "               matched to those in the FASTA file. Supported logic types:\n"
+	  . "                  literal     for exact matching.\n"
+	  . "                  regex       for fuzzy matching using simple regular expressions (in Perl regex syntax).\n"
+	  . "   -u          Use unversioned accession numbers for matching (only with -m literal).\n"
+	  . "               If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n"
+	  . "               running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n"
+	  . "               but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n"
+	  . "               Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n"
+	  . "   -l [LEVEL]  Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"
+	  . "\n";
+	exit;
+
+}