Mercurial > repos > galaxyp > nbic_fasta
view ExtractSeqsFromFasta.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w # # This script extracts sequences from a multi-sequence FASTA file # based on a list of accession numbers / IDs # # ===================================================== # $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $ # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $ # $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $ # $LastChangedRevision: 15 $ # $LastChangedBy: pieter $ # ===================================================== # # initialise environment # use strict; use Getopt::Std; use Log::Log4perl qw(:easy); my %log_levels = ( 'ALL' => $ALL, 'TRACE' => $TRACE, 'DEBUG' => $DEBUG, 'INFO' => $INFO, 'WARN' => $WARN, 'ERROR' => $ERROR, 'FATAL' => $FATAL, 'OFF' => $OFF, ); my %match_logic_types = ( 'literal' => 1, 'regex' => 1, ); # # Get options. # my %opts; Getopt::Std::getopts('ul:i:o:f:m:', \%opts); my $input = $opts{'i'}; my $output = $opts{'o'}; my $filter = $opts{'f'}; my $log_level = $opts{'l'}; my $match_logic = $opts{'m'}; my $ignore_accession_version = $opts{'u'}; # # Configure logging. # # Provides default if user did not specify log level: $log_level = (defined($log_level) ? $log_level : 'INFO'); # Reset log level to default if user specified illegal log level. $log_level = ( defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); #Log::Log4perl->init('log4perl.properties'); Log::Log4perl->easy_init( #{ level => $log_level, # file => ">>ExtractSeqsFromFasta.log", # layout => '%F{1}-%L-%M: %m%n' }, { level => $log_level, file => "STDOUT", layout => '%d L:%L %p> %m%n' }, ); my $logger = Log::Log4perl::get_logger(); # # Check user input. # unless (defined($input) && defined($output) && defined($filter)) { _Usage(); exit; } if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) { _Usage(); exit; } if ($input eq $output) { $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); exit; } # # Check input and filter file. # unless (-e $input && -f $input && -r $input) { $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); exit; } unless (-e $filter && -f $filter && -r $filter) { $logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!); exit; } # # Check match logic. # $match_logic = (defined($match_logic) ? $match_logic : 'literal'); unless (defined($match_logic_types{$match_logic})) { $logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.'); exit; } # # Read accession numbers to search the FASTA file(s) for # my $wanted = _CreateLookupHash($filter, $match_logic); my $seqs_to_extract = scalar(keys(%{$wanted})); $logger->info('Number of sequences to search for: ' . $seqs_to_extract); # # Extract seqs # my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic); $logger->info('Extracted ' . $counter . ' sequences.'); $logger->info('Finished!'); # ## ### Internal subs. ## # sub _CreateLookupHash { my ($file_path, $match_logic) = @_; $logger->info('Parsing ' . $file_path . '...'); my %wanted; my $file_path_fh; eval { open($file_path_fh, "<$file_path"); }; if ($@) { $logger->fatal('Cannot read ID file: ' . $@); exit; } LINE: while (my $line = <$file_path_fh>) { $line =~ s/[\r\n]+//g; if ($match_logic eq 'literal') { if ($line =~ m/([a-z0-9_\-\.]+)/i) { my $id = $1; if ($ignore_accession_version) { # # Remove version from accession number if it was versioned. # if ($id =~ m/([^\.]+)\.(\d+)/) { $id = $1; } } $logger->debug('Found accession/ID ' . $id); $wanted{$id} = 1; } else { $logger->warn('Accession/ID in unsupported format: ' . $line); next; } } elsif ($match_logic eq 'regex') { if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) { my $regex = $1; $logger->debug('Found regex ' . $regex); $wanted{$regex} = 1; } else { $logger->warn('Regex in unsupported format: ' . $line); next; } } } close($file_path_fh); $logger->info('Created ID lookup list.'); return (\%wanted); } sub _ExtractSeqs { my ($path_from, $path_to, $wanted, $match_logic) = @_; $logger->info('Parsing ' . $path_from . '...'); my $extracted_seqs = 0; my $found = 0; my $path_from_fh; my $path_to_fh; eval { open($path_from_fh, "<$path_from"); }; if ($@) { $logger->fatal('Cannot read FASTA file: ' . $@); exit; } eval { open($path_to_fh, ">$path_to"); }; if ($@) { $logger->fatal('Cannot write FASTA file: ' . $@); exit; } LINE: while (my $line = <$path_from_fh>) { if ($line =~ m/^>/) { $logger->debug('Found header line: ' . $line); $found = 0; my @header_ids; # # Check for the presence of some frequently occurring naming schemes: # # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; # >jgi|Triad1|1|gw1.3.1.1 # if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { # # One or more namespace prefixed IDs. # my $concatenated_namespace_prefixed_ids = $1; $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); # database_namespace = $3 && $5; # id = $4 && $6; # description = $8; my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); foreach my $prefixed_id (@namespace_prefixed_ids) { if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) { my $this_id = $2; $logger->debug('Found ID: ' . $this_id); push(@header_ids, $this_id); } else { $logger->warn( 'This should have been an optionally prefixed ID, ' . 'but failed to match the corresponding regex: ' . $prefixed_id); } } } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { # # One or more unprefixed IDs. # my $concatenated_ids = $1; $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); # id = $3 && $4; # description = $7; my @unprefixed_ids = split(/[|;]/, $concatenated_ids); foreach my $unprefixed_id (@unprefixed_ids) { $logger->debug('Found ID: ' . $unprefixed_id); push(@header_ids, $unprefixed_id); } } else { # # Something else. # # The FASTA header line can basically have any format, # so this is probably one of the less frequent occurring annotation schemes. # Therefore we try to see if the IDs we are looking for are present anywhere # in the header line up until the first white space or new line. # This may be tricky as we might match other annotation from other proteins # like a description from a 'best' BLAST hit that contains one of the IDs we # are looking for. Hence, in such a case this sequence is *not* the one of # the IDs we looking for, but similar to that one at best. # if ($line =~ m/>([^\n\r\s]+)/) { my $putative_id = $1; $logger->debug('Found putative ID in header: ' . $putative_id); push(@header_ids, $putative_id); } else { $logger->warn('Cannot identify IDs in this header: ' . $line); } } if ($ignore_accession_version) { for my $inx (0 .. $#header_ids) { if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) { my $this_unversioned_id = $1; my $version_number = $2; $logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.'); $header_ids[$inx] = $this_unversioned_id; } } } foreach my $id (@header_ids) { $logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.'); if ($match_logic eq 'literal') { if (${$wanted}{$id}) { $logger->info('Literal bingo, preserving sequence with ID ' . $id); $found = 1; $extracted_seqs++; last; } } elsif ($match_logic eq 'regex') { foreach my $regex (keys(%{$wanted})) { if ($id =~ m/$regex/) { $logger->info('Regex bingo, preserving sequence with ID ' . $id); $found = 1; $extracted_seqs++; last; } } } } } if ($found) { print($path_to_fh $line); } } close($path_from_fh); close($path_to_fh); return ($extracted_seqs); } sub _Usage { print STDERR "\n" . "extractSeqsFromFasta.pl:\n" . " Extracts sequences from a multi-sequence FASTA file.\n" . "\n" . "Usage:\n" . "\n" . " extractSeqsFromFasta.pl options\n" . "\n" . "Available options are:\n" . "\n" . " -i [file] Input file in FASTA format.\n" . " -o [file] Output file in FASTA format where the extracted files will be saved.\n" . " -f [file] Filter file containing accession numbers or IDs that shoud be extracted from the input.\n" . " (One accession/ID per line)\n" . " -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n" . " matched to those in the FASTA file. Supported logic types:\n" . " literal for exact matching.\n" . " regex for fuzzy matching using simple regular expressions (in Perl regex syntax).\n" . " -u Use unversioned accession numbers for matching (only with -m literal).\n" . " If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n" . " running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n" . " but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n" . " Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n" . " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n" . "\n"; exit; }