Mercurial > repos > galaxyp > nbic_fasta

#!/usr/bin/perl -w
#
# This script extracts sequences from a multi-sequence FASTA file
# based on a list of accession numbers / IDs
#
# =====================================================
# $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $
# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $
# $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $
# $LastChangedRevision: 15 $
# $LastChangedBy: pieter $
# =====================================================

#
# initialise environment
#
use strict;
use Getopt::Std;
use Log::Log4perl qw(:easy);

my %log_levels = (
	'ALL'   => $ALL,
	'TRACE' => $TRACE,
	'DEBUG' => $DEBUG,
	'INFO'  => $INFO,
	'WARN'  => $WARN,
	'ERROR' => $ERROR,
	'FATAL' => $FATAL,
	'OFF'   => $OFF,
);
my %match_logic_types = (
	'literal'   => 1,
	'regex' 	=> 1,
);

#
# Get options.
#
my %opts;
Getopt::Std::getopts('ul:i:o:f:m:', \%opts);

my $input						= $opts{'i'};
my $output						= $opts{'o'};
my $filter						= $opts{'f'};
my $log_level					= $opts{'l'};
my $match_logic					= $opts{'m'};
my $ignore_accession_version	= $opts{'u'};

#
# Configure logging.
#
# Provides default if user did not specify log level:
$log_level = (defined($log_level) ? $log_level : 'INFO');

# Reset log level to default if user specified illegal log level.
$log_level = (
	defined($log_levels{$log_level})
	? $log_levels{$log_level}
	: $log_levels{'INFO'});

#Log::Log4perl->init('log4perl.properties');
Log::Log4perl->easy_init(

	#{ level    => $log_level,
	#  file     => ">>ExtractSeqsFromFasta.log",
	#  layout   => '%F{1}-%L-%M: %m%n' },
	{
		level  => $log_level,
		file   => "STDOUT",
		layout => '%d L:%L %p> %m%n'
	},
);
my $logger = Log::Log4perl::get_logger();

#
# Check user input.
#
unless (defined($input) && defined($output) && defined($filter)) {
	_Usage();
	exit;
}
if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) {
	_Usage();
	exit;
}
if ($input eq $output) {
	$logger->fatal('Output file is the same as the input file. Select a different file for the output.');
	exit;
}

#
# Check input and filter file.
#
unless (-e $input && -f $input && -r $input) {
	$logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
	exit;
}
unless (-e $filter && -f $filter && -r $filter) {
	$logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!);
	exit;
}
#
# Check match logic.
#
$match_logic = (defined($match_logic) ? $match_logic : 'literal');
unless (defined($match_logic_types{$match_logic})) {
	$logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.');
	exit;
}

#
# Read accession numbers to search the FASTA file(s) for
#
my $wanted          = _CreateLookupHash($filter, $match_logic);
my $seqs_to_extract = scalar(keys(%{$wanted}));
$logger->info('Number of sequences to search for: ' . $seqs_to_extract);

#
# Extract seqs
#
my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic);

$logger->info('Extracted ' . $counter . ' sequences.');
$logger->info('Finished!');

#
##
### Internal subs.
##
#

sub _CreateLookupHash {

	my ($file_path, $match_logic) = @_;

	$logger->info('Parsing ' . $file_path . '...');

	my %wanted;
	my $file_path_fh;

	eval {
		open($file_path_fh, "<$file_path");
	};
	if ($@) {
		$logger->fatal('Cannot read ID file: ' . $@);
		exit;
	}

	LINE: while (my $line = <$file_path_fh>) {

		$line =~ s/[\r\n]+//g;

		if ($match_logic eq 'literal') {

			if ($line =~ m/([a-z0-9_\-\.]+)/i) {

				my $id = $1;

				if ($ignore_accession_version) {

					#
					# Remove version from accession number if it was versioned.
					#
					if ($id =~ m/([^\.]+)\.(\d+)/) {

						$id = $1;

					}
				}

				$logger->debug('Found accession/ID ' . $id);
				$wanted{$id} = 1;

			} else {
				$logger->warn('Accession/ID in unsupported format: ' . $line);
				next;
			}

		} elsif ($match_logic eq 'regex') {

			if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) {
				my $regex = $1;
				$logger->debug('Found regex ' . $regex);
				$wanted{$regex} = 1;
			} else {
				$logger->warn('Regex in unsupported format: ' . $line);
				next;
			}

		}
	}

	close($file_path_fh);
	$logger->info('Created ID lookup list.');

	return (\%wanted);

}

sub _ExtractSeqs {

	my ($path_from, $path_to, $wanted, $match_logic) = @_;

	$logger->info('Parsing ' . $path_from . '...');

	my $extracted_seqs = 0;
	my $found          = 0;
	my $path_from_fh;
	my $path_to_fh;

	eval {
		open($path_from_fh,	"<$path_from");
	};
	if ($@) {
		$logger->fatal('Cannot read FASTA file: ' . $@);
		exit;
	}
	eval {
		open($path_to_fh,	">$path_to");
	};
	if ($@) {
		$logger->fatal('Cannot write FASTA file: ' . $@);
		exit;
	}

	LINE: while (my $line = <$path_from_fh>) {

		if ($line =~ m/^>/) {
			$logger->debug('Found header line: ' . $line);
			$found = 0;
			my @header_ids;

            #
            # Check for the presence of some frequently occurring naming schemes:
            #
            # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig).
            # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
            # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
            # >jgi|Triad1|1|gw1.3.1.1
            #
			if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) {

				#
				# One or more namespace prefixed IDs.
				#
				my $concatenated_namespace_prefixed_ids = $1;
				$logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids);

				#   database_namespace	= $3 && $5;
				#   id					= $4 && $6;
				#   description			= $8;
				my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids);

				foreach my $prefixed_id (@namespace_prefixed_ids) {

					if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) {

						my $this_id = $2;

						$logger->debug('Found ID: ' . $this_id);
						push(@header_ids, $this_id);

					} else {

						$logger->warn(
							'This should have been an optionally prefixed ID, '
							  . 'but failed to match the corresponding regex: '
							  . $prefixed_id);

					}
				}

			} elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) {

				#
				# One or more unprefixed IDs.
				#
				my $concatenated_ids = $1;
				$logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids);

				#   id					= $3 && $4;
				#   description			= $7;
				my @unprefixed_ids = split(/[|;]/, $concatenated_ids);

				foreach my $unprefixed_id (@unprefixed_ids) {

					$logger->debug('Found ID: ' . $unprefixed_id);
					push(@header_ids, $unprefixed_id);

				}

			} else {

            	#
            	# Something else.
            	#
            	# The FASTA header line can basically have any format,
            	# so this is probably one of the less frequent occurring annotation schemes.
            	# Therefore we try to see if the IDs we are looking for are present anywhere
            	# in the header line up until the first white space or new line.
            	# This may be tricky as we might match other annotation from other proteins
            	# like a description from a 'best' BLAST hit that contains one of the IDs we
            	# are looking for. Hence, in such a case this sequence is *not* the one of
            	# the IDs we looking for, but similar to that one at best.
            	#
				if ($line =~ m/>([^\n\r\s]+)/) {

					my $putative_id = $1;
					$logger->debug('Found putative ID in header: ' . $putative_id);
					push(@header_ids, $putative_id);

				} else {
					$logger->warn('Cannot identify IDs in this header: ' . $line);
				}
			}


			if ($ignore_accession_version) {

				for my $inx (0 .. $#header_ids) {

					if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) {

						my $this_unversioned_id = $1;
						my $version_number = $2;
						$logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.');
						$header_ids[$inx] = $this_unversioned_id;

					}
				}
			}

			foreach my $id (@header_ids) {
				$logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.');

				if ($match_logic eq 'literal') {

					if (${$wanted}{$id}) {
						$logger->info('Literal bingo, preserving sequence with ID ' . $id);
						$found = 1;
						$extracted_seqs++;
						last;
					}

				} elsif ($match_logic eq 'regex') {

					foreach my $regex (keys(%{$wanted})) {

						if ($id =~ m/$regex/) {
							$logger->info('Regex bingo, preserving sequence with ID ' . $id);
							$found = 1;
							$extracted_seqs++;
							last;
						}
					}
				}
			}
		}
		if ($found) {
			print($path_to_fh $line);
		}

	}

	close($path_from_fh);
	close($path_to_fh);

	return ($extracted_seqs);

}

sub _Usage {

	print STDERR "\n"
	  . "extractSeqsFromFasta.pl:\n"
	  . "   Extracts sequences from a multi-sequence FASTA file.\n" . "\n"
	  . "Usage:\n" . "\n"
	  . "   extractSeqsFromFasta.pl options\n" . "\n"
	  . "Available options are:\n" . "\n"
	  . "   -i [file]   Input file in FASTA format.\n"
	  . "   -o [file]   Output file in FASTA format where the extracted files will be saved.\n"
	  . "   -f [file]   Filter file containing accession numbers or IDs that shoud be extracted from the input.\n"
	  . "               (One accession/ID per line)\n"
	  . "   -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n"
	  . "               matched to those in the FASTA file. Supported logic types:\n"
	  . "                  literal     for exact matching.\n"
	  . "                  regex       for fuzzy matching using simple regular expressions (in Perl regex syntax).\n"
	  . "   -u          Use unversioned accession numbers for matching (only with -m literal).\n"
	  . "               If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n"
	  . "               running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n"
	  . "               but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n"
	  . "               Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n"
	  . "   -l [LEVEL]  Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"
	  . "\n";
	exit;

}
author	galaxyp
date	Fri, 10 May 2013 17:15:08 -0400
parents
children