Mercurial > repos > galaxyp > nbic_fasta
diff ExtractSeqsFromFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractSeqsFromFasta.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,397 @@ +#!/usr/bin/perl -w +# +# This script extracts sequences from a multi-sequence FASTA file +# based on a list of accession numbers / IDs +# +# ===================================================== +# $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $ +# $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $ +# $LastChangedRevision: 15 $ +# $LastChangedBy: pieter $ +# ===================================================== + +# +# initialise environment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); +my %match_logic_types = ( + 'literal' => 1, + 'regex' => 1, +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('ul:i:o:f:m:', \%opts); + +my $input = $opts{'i'}; +my $output = $opts{'o'}; +my $filter = $opts{'f'}; +my $log_level = $opts{'l'}; +my $match_logic = $opts{'m'}; +my $ignore_accession_version = $opts{'u'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); + +# Reset log level to default if user specified illegal log level. +$log_level = ( + defined($log_levels{$log_level}) + ? $log_levels{$log_level} + : $log_levels{'INFO'}); + +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + + #{ level => $log_level, + # file => ">>ExtractSeqsFromFasta.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { + level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' + }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +unless (defined($input) && defined($output) && defined($filter)) { + _Usage(); + exit; +} +if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) { + _Usage(); + exit; +} +if ($input eq $output) { + $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); + exit; +} + +# +# Check input and filter file. +# +unless (-e $input && -f $input && -r $input) { + $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); + exit; +} +unless (-e $filter && -f $filter && -r $filter) { + $logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!); + exit; +} +# +# Check match logic. +# +$match_logic = (defined($match_logic) ? $match_logic : 'literal'); +unless (defined($match_logic_types{$match_logic})) { + $logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.'); + exit; +} + +# +# Read accession numbers to search the FASTA file(s) for +# +my $wanted = _CreateLookupHash($filter, $match_logic); +my $seqs_to_extract = scalar(keys(%{$wanted})); +$logger->info('Number of sequences to search for: ' . $seqs_to_extract); + +# +# Extract seqs +# +my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic); + +$logger->info('Extracted ' . $counter . ' sequences.'); +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _CreateLookupHash { + + my ($file_path, $match_logic) = @_; + + $logger->info('Parsing ' . $file_path . '...'); + + my %wanted; + my $file_path_fh; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read ID file: ' . $@); + exit; + } + + LINE: while (my $line = <$file_path_fh>) { + + $line =~ s/[\r\n]+//g; + + if ($match_logic eq 'literal') { + + if ($line =~ m/([a-z0-9_\-\.]+)/i) { + + my $id = $1; + + if ($ignore_accession_version) { + + # + # Remove version from accession number if it was versioned. + # + if ($id =~ m/([^\.]+)\.(\d+)/) { + + $id = $1; + + } + } + + $logger->debug('Found accession/ID ' . $id); + $wanted{$id} = 1; + + } else { + $logger->warn('Accession/ID in unsupported format: ' . $line); + next; + } + + } elsif ($match_logic eq 'regex') { + + if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) { + my $regex = $1; + $logger->debug('Found regex ' . $regex); + $wanted{$regex} = 1; + } else { + $logger->warn('Regex in unsupported format: ' . $line); + next; + } + + } + } + + close($file_path_fh); + $logger->info('Created ID lookup list.'); + + return (\%wanted); + +} + +sub _ExtractSeqs { + + my ($path_from, $path_to, $wanted, $match_logic) = @_; + + $logger->info('Parsing ' . $path_from . '...'); + + my $extracted_seqs = 0; + my $found = 0; + my $path_from_fh; + my $path_to_fh; + + eval { + open($path_from_fh, "<$path_from"); + }; + if ($@) { + $logger->fatal('Cannot read FASTA file: ' . $@); + exit; + } + eval { + open($path_to_fh, ">$path_to"); + }; + if ($@) { + $logger->fatal('Cannot write FASTA file: ' . $@); + exit; + } + + LINE: while (my $line = <$path_from_fh>) { + + if ($line =~ m/^>/) { + $logger->debug('Found header line: ' . $line); + $found = 0; + my @header_ids; + + # + # Check for the presence of some frequently occurring naming schemes: + # + # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). + # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 + # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; + # >jgi|Triad1|1|gw1.3.1.1 + # + if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { + + # + # One or more namespace prefixed IDs. + # + my $concatenated_namespace_prefixed_ids = $1; + $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); + + # database_namespace = $3 && $5; + # id = $4 && $6; + # description = $8; + my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); + + foreach my $prefixed_id (@namespace_prefixed_ids) { + + if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) { + + my $this_id = $2; + + $logger->debug('Found ID: ' . $this_id); + push(@header_ids, $this_id); + + } else { + + $logger->warn( + 'This should have been an optionally prefixed ID, ' + . 'but failed to match the corresponding regex: ' + . $prefixed_id); + + } + } + + } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { + + # + # One or more unprefixed IDs. + # + my $concatenated_ids = $1; + $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); + + # id = $3 && $4; + # description = $7; + my @unprefixed_ids = split(/[|;]/, $concatenated_ids); + + foreach my $unprefixed_id (@unprefixed_ids) { + + $logger->debug('Found ID: ' . $unprefixed_id); + push(@header_ids, $unprefixed_id); + + } + + } else { + + # + # Something else. + # + # The FASTA header line can basically have any format, + # so this is probably one of the less frequent occurring annotation schemes. + # Therefore we try to see if the IDs we are looking for are present anywhere + # in the header line up until the first white space or new line. + # This may be tricky as we might match other annotation from other proteins + # like a description from a 'best' BLAST hit that contains one of the IDs we + # are looking for. Hence, in such a case this sequence is *not* the one of + # the IDs we looking for, but similar to that one at best. + # + if ($line =~ m/>([^\n\r\s]+)/) { + + my $putative_id = $1; + $logger->debug('Found putative ID in header: ' . $putative_id); + push(@header_ids, $putative_id); + + } else { + $logger->warn('Cannot identify IDs in this header: ' . $line); + } + } + + + if ($ignore_accession_version) { + + for my $inx (0 .. $#header_ids) { + + if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) { + + my $this_unversioned_id = $1; + my $version_number = $2; + $logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.'); + $header_ids[$inx] = $this_unversioned_id; + + } + } + } + + foreach my $id (@header_ids) { + $logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.'); + + if ($match_logic eq 'literal') { + + if (${$wanted}{$id}) { + $logger->info('Literal bingo, preserving sequence with ID ' . $id); + $found = 1; + $extracted_seqs++; + last; + } + + } elsif ($match_logic eq 'regex') { + + foreach my $regex (keys(%{$wanted})) { + + if ($id =~ m/$regex/) { + $logger->info('Regex bingo, preserving sequence with ID ' . $id); + $found = 1; + $extracted_seqs++; + last; + } + } + } + } + } + if ($found) { + print($path_to_fh $line); + } + + } + + close($path_from_fh); + close($path_to_fh); + + return ($extracted_seqs); + +} + +sub _Usage { + + print STDERR "\n" + . "extractSeqsFromFasta.pl:\n" + . " Extracts sequences from a multi-sequence FASTA file.\n" . "\n" + . "Usage:\n" . "\n" + . " extractSeqsFromFasta.pl options\n" . "\n" + . "Available options are:\n" . "\n" + . " -i [file] Input file in FASTA format.\n" + . " -o [file] Output file in FASTA format where the extracted files will be saved.\n" + . " -f [file] Filter file containing accession numbers or IDs that shoud be extracted from the input.\n" + . " (One accession/ID per line)\n" + . " -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n" + . " matched to those in the FASTA file. Supported logic types:\n" + . " literal for exact matching.\n" + . " regex for fuzzy matching using simple regular expressions (in Perl regex syntax).\n" + . " -u Use unversioned accession numbers for matching (only with -m literal).\n" + . " If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n" + . " running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n" + . " but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n" + . " Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n" + . " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n" + . "\n"; + exit; + +}