annotate ExtractSeqsFromFasta.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
163892325845 Initial commit.
galaxyp
parents:
diff changeset
1 #!/usr/bin/perl -w
163892325845 Initial commit.
galaxyp
parents:
diff changeset
2 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
3 # This script extracts sequences from a multi-sequence FASTA file
163892325845 Initial commit.
galaxyp
parents:
diff changeset
4 # based on a list of accession numbers / IDs
163892325845 Initial commit.
galaxyp
parents:
diff changeset
5 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
6 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
7 # $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
9 # $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
10 # $LastChangedRevision: 15 $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
11 # $LastChangedBy: pieter $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
12 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
13
163892325845 Initial commit.
galaxyp
parents:
diff changeset
14 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
15 # initialise environment
163892325845 Initial commit.
galaxyp
parents:
diff changeset
16 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
17 use strict;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
18 use Getopt::Std;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
19 use Log::Log4perl qw(:easy);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
20
163892325845 Initial commit.
galaxyp
parents:
diff changeset
21 my %log_levels = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
22 'ALL' => $ALL,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
23 'TRACE' => $TRACE,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
24 'DEBUG' => $DEBUG,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
25 'INFO' => $INFO,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
26 'WARN' => $WARN,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
27 'ERROR' => $ERROR,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
28 'FATAL' => $FATAL,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
29 'OFF' => $OFF,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
30 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
31 my %match_logic_types = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
32 'literal' => 1,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
33 'regex' => 1,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
34 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
35
163892325845 Initial commit.
galaxyp
parents:
diff changeset
36 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
37 # Get options.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
38 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
39 my %opts;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
40 Getopt::Std::getopts('ul:i:o:f:m:', \%opts);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
41
163892325845 Initial commit.
galaxyp
parents:
diff changeset
42 my $input = $opts{'i'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
43 my $output = $opts{'o'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
44 my $filter = $opts{'f'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
45 my $log_level = $opts{'l'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
46 my $match_logic = $opts{'m'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
47 my $ignore_accession_version = $opts{'u'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
48
163892325845 Initial commit.
galaxyp
parents:
diff changeset
49 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
50 # Configure logging.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
51 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
52 # Provides default if user did not specify log level:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
53 $log_level = (defined($log_level) ? $log_level : 'INFO');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
54
163892325845 Initial commit.
galaxyp
parents:
diff changeset
55 # Reset log level to default if user specified illegal log level.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
56 $log_level = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
57 defined($log_levels{$log_level})
163892325845 Initial commit.
galaxyp
parents:
diff changeset
58 ? $log_levels{$log_level}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
59 : $log_levels{'INFO'});
163892325845 Initial commit.
galaxyp
parents:
diff changeset
60
163892325845 Initial commit.
galaxyp
parents:
diff changeset
61 #Log::Log4perl->init('log4perl.properties');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
62 Log::Log4perl->easy_init(
163892325845 Initial commit.
galaxyp
parents:
diff changeset
63
163892325845 Initial commit.
galaxyp
parents:
diff changeset
64 #{ level => $log_level,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
65 # file => ">>ExtractSeqsFromFasta.log",
163892325845 Initial commit.
galaxyp
parents:
diff changeset
66 # layout => '%F{1}-%L-%M: %m%n' },
163892325845 Initial commit.
galaxyp
parents:
diff changeset
67 {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
68 level => $log_level,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
69 file => "STDOUT",
163892325845 Initial commit.
galaxyp
parents:
diff changeset
70 layout => '%d L:%L %p> %m%n'
163892325845 Initial commit.
galaxyp
parents:
diff changeset
71 },
163892325845 Initial commit.
galaxyp
parents:
diff changeset
72 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
73 my $logger = Log::Log4perl::get_logger();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
74
163892325845 Initial commit.
galaxyp
parents:
diff changeset
75 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
76 # Check user input.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
77 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
78 unless (defined($input) && defined($output) && defined($filter)) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
79 _Usage();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
80 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
81 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
82 if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
83 _Usage();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
84 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
85 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
86 if ($input eq $output) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
87 $logger->fatal('Output file is the same as the input file. Select a different file for the output.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
88 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
89 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
90
163892325845 Initial commit.
galaxyp
parents:
diff changeset
91 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
92 # Check input and filter file.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
93 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
94 unless (-e $input && -f $input && -r $input) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
95 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
96 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
97 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
98 unless (-e $filter && -f $filter && -r $filter) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
99 $logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
100 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
101 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
102 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
103 # Check match logic.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
104 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
105 $match_logic = (defined($match_logic) ? $match_logic : 'literal');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
106 unless (defined($match_logic_types{$match_logic})) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
107 $logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
108 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
109 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
110
163892325845 Initial commit.
galaxyp
parents:
diff changeset
111 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
112 # Read accession numbers to search the FASTA file(s) for
163892325845 Initial commit.
galaxyp
parents:
diff changeset
113 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
114 my $wanted = _CreateLookupHash($filter, $match_logic);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
115 my $seqs_to_extract = scalar(keys(%{$wanted}));
163892325845 Initial commit.
galaxyp
parents:
diff changeset
116 $logger->info('Number of sequences to search for: ' . $seqs_to_extract);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
117
163892325845 Initial commit.
galaxyp
parents:
diff changeset
118 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
119 # Extract seqs
163892325845 Initial commit.
galaxyp
parents:
diff changeset
120 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
121 my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
122
163892325845 Initial commit.
galaxyp
parents:
diff changeset
123 $logger->info('Extracted ' . $counter . ' sequences.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
124 $logger->info('Finished!');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
125
163892325845 Initial commit.
galaxyp
parents:
diff changeset
126 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
127 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
128 ### Internal subs.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
129 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
130 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
131
163892325845 Initial commit.
galaxyp
parents:
diff changeset
132 sub _CreateLookupHash {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
133
163892325845 Initial commit.
galaxyp
parents:
diff changeset
134 my ($file_path, $match_logic) = @_;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
135
163892325845 Initial commit.
galaxyp
parents:
diff changeset
136 $logger->info('Parsing ' . $file_path . '...');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
137
163892325845 Initial commit.
galaxyp
parents:
diff changeset
138 my %wanted;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
139 my $file_path_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
140
163892325845 Initial commit.
galaxyp
parents:
diff changeset
141 eval {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
142 open($file_path_fh, "<$file_path");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
143 };
163892325845 Initial commit.
galaxyp
parents:
diff changeset
144 if ($@) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
145 $logger->fatal('Cannot read ID file: ' . $@);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
146 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
147 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
148
163892325845 Initial commit.
galaxyp
parents:
diff changeset
149 LINE: while (my $line = <$file_path_fh>) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
150
163892325845 Initial commit.
galaxyp
parents:
diff changeset
151 $line =~ s/[\r\n]+//g;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
152
163892325845 Initial commit.
galaxyp
parents:
diff changeset
153 if ($match_logic eq 'literal') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
154
163892325845 Initial commit.
galaxyp
parents:
diff changeset
155 if ($line =~ m/([a-z0-9_\-\.]+)/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
156
163892325845 Initial commit.
galaxyp
parents:
diff changeset
157 my $id = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
158
163892325845 Initial commit.
galaxyp
parents:
diff changeset
159 if ($ignore_accession_version) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
160
163892325845 Initial commit.
galaxyp
parents:
diff changeset
161 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
162 # Remove version from accession number if it was versioned.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
163 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
164 if ($id =~ m/([^\.]+)\.(\d+)/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
165
163892325845 Initial commit.
galaxyp
parents:
diff changeset
166 $id = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
167
163892325845 Initial commit.
galaxyp
parents:
diff changeset
168 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
169 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
170
163892325845 Initial commit.
galaxyp
parents:
diff changeset
171 $logger->debug('Found accession/ID ' . $id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
172 $wanted{$id} = 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
173
163892325845 Initial commit.
galaxyp
parents:
diff changeset
174 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
175 $logger->warn('Accession/ID in unsupported format: ' . $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
176 next;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
177 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
178
163892325845 Initial commit.
galaxyp
parents:
diff changeset
179 } elsif ($match_logic eq 'regex') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
180
163892325845 Initial commit.
galaxyp
parents:
diff changeset
181 if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
182 my $regex = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
183 $logger->debug('Found regex ' . $regex);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
184 $wanted{$regex} = 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
185 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
186 $logger->warn('Regex in unsupported format: ' . $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
187 next;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
188 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
189
163892325845 Initial commit.
galaxyp
parents:
diff changeset
190 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
191 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
192
163892325845 Initial commit.
galaxyp
parents:
diff changeset
193 close($file_path_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
194 $logger->info('Created ID lookup list.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
195
163892325845 Initial commit.
galaxyp
parents:
diff changeset
196 return (\%wanted);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
197
163892325845 Initial commit.
galaxyp
parents:
diff changeset
198 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
199
163892325845 Initial commit.
galaxyp
parents:
diff changeset
200 sub _ExtractSeqs {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
201
163892325845 Initial commit.
galaxyp
parents:
diff changeset
202 my ($path_from, $path_to, $wanted, $match_logic) = @_;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
203
163892325845 Initial commit.
galaxyp
parents:
diff changeset
204 $logger->info('Parsing ' . $path_from . '...');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
205
163892325845 Initial commit.
galaxyp
parents:
diff changeset
206 my $extracted_seqs = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
207 my $found = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
208 my $path_from_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
209 my $path_to_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
210
163892325845 Initial commit.
galaxyp
parents:
diff changeset
211 eval {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
212 open($path_from_fh, "<$path_from");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
213 };
163892325845 Initial commit.
galaxyp
parents:
diff changeset
214 if ($@) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
215 $logger->fatal('Cannot read FASTA file: ' . $@);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
216 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
217 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
218 eval {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
219 open($path_to_fh, ">$path_to");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
220 };
163892325845 Initial commit.
galaxyp
parents:
diff changeset
221 if ($@) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
222 $logger->fatal('Cannot write FASTA file: ' . $@);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
223 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
224 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
225
163892325845 Initial commit.
galaxyp
parents:
diff changeset
226 LINE: while (my $line = <$path_from_fh>) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
227
163892325845 Initial commit.
galaxyp
parents:
diff changeset
228 if ($line =~ m/^>/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
229 $logger->debug('Found header line: ' . $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
230 $found = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
231 my @header_ids;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
232
163892325845 Initial commit.
galaxyp
parents:
diff changeset
233 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
234 # Check for the presence of some frequently occurring naming schemes:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
235 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
236 # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig).
163892325845 Initial commit.
galaxyp
parents:
diff changeset
237 # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
163892325845 Initial commit.
galaxyp
parents:
diff changeset
238 # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
239 # >jgi|Triad1|1|gw1.3.1.1
163892325845 Initial commit.
galaxyp
parents:
diff changeset
240 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
241 if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
242
163892325845 Initial commit.
galaxyp
parents:
diff changeset
243 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
244 # One or more namespace prefixed IDs.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
245 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
246 my $concatenated_namespace_prefixed_ids = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
247 $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
248
163892325845 Initial commit.
galaxyp
parents:
diff changeset
249 # database_namespace = $3 && $5;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
250 # id = $4 && $6;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
251 # description = $8;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
252 my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
253
163892325845 Initial commit.
galaxyp
parents:
diff changeset
254 foreach my $prefixed_id (@namespace_prefixed_ids) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
255
163892325845 Initial commit.
galaxyp
parents:
diff changeset
256 if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
257
163892325845 Initial commit.
galaxyp
parents:
diff changeset
258 my $this_id = $2;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
259
163892325845 Initial commit.
galaxyp
parents:
diff changeset
260 $logger->debug('Found ID: ' . $this_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
261 push(@header_ids, $this_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
262
163892325845 Initial commit.
galaxyp
parents:
diff changeset
263 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
264
163892325845 Initial commit.
galaxyp
parents:
diff changeset
265 $logger->warn(
163892325845 Initial commit.
galaxyp
parents:
diff changeset
266 'This should have been an optionally prefixed ID, '
163892325845 Initial commit.
galaxyp
parents:
diff changeset
267 . 'but failed to match the corresponding regex: '
163892325845 Initial commit.
galaxyp
parents:
diff changeset
268 . $prefixed_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
269
163892325845 Initial commit.
galaxyp
parents:
diff changeset
270 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
271 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
272
163892325845 Initial commit.
galaxyp
parents:
diff changeset
273 } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
274
163892325845 Initial commit.
galaxyp
parents:
diff changeset
275 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
276 # One or more unprefixed IDs.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
277 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
278 my $concatenated_ids = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
279 $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
280
163892325845 Initial commit.
galaxyp
parents:
diff changeset
281 # id = $3 && $4;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
282 # description = $7;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
283 my @unprefixed_ids = split(/[|;]/, $concatenated_ids);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
284
163892325845 Initial commit.
galaxyp
parents:
diff changeset
285 foreach my $unprefixed_id (@unprefixed_ids) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
286
163892325845 Initial commit.
galaxyp
parents:
diff changeset
287 $logger->debug('Found ID: ' . $unprefixed_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
288 push(@header_ids, $unprefixed_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
289
163892325845 Initial commit.
galaxyp
parents:
diff changeset
290 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
291
163892325845 Initial commit.
galaxyp
parents:
diff changeset
292 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
293
163892325845 Initial commit.
galaxyp
parents:
diff changeset
294 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
295 # Something else.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
296 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
297 # The FASTA header line can basically have any format,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
298 # so this is probably one of the less frequent occurring annotation schemes.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
299 # Therefore we try to see if the IDs we are looking for are present anywhere
163892325845 Initial commit.
galaxyp
parents:
diff changeset
300 # in the header line up until the first white space or new line.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
301 # This may be tricky as we might match other annotation from other proteins
163892325845 Initial commit.
galaxyp
parents:
diff changeset
302 # like a description from a 'best' BLAST hit that contains one of the IDs we
163892325845 Initial commit.
galaxyp
parents:
diff changeset
303 # are looking for. Hence, in such a case this sequence is *not* the one of
163892325845 Initial commit.
galaxyp
parents:
diff changeset
304 # the IDs we looking for, but similar to that one at best.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
305 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
306 if ($line =~ m/>([^\n\r\s]+)/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
307
163892325845 Initial commit.
galaxyp
parents:
diff changeset
308 my $putative_id = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
309 $logger->debug('Found putative ID in header: ' . $putative_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
310 push(@header_ids, $putative_id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
311
163892325845 Initial commit.
galaxyp
parents:
diff changeset
312 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
313 $logger->warn('Cannot identify IDs in this header: ' . $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
314 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
315 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
316
163892325845 Initial commit.
galaxyp
parents:
diff changeset
317
163892325845 Initial commit.
galaxyp
parents:
diff changeset
318 if ($ignore_accession_version) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
319
163892325845 Initial commit.
galaxyp
parents:
diff changeset
320 for my $inx (0 .. $#header_ids) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
321
163892325845 Initial commit.
galaxyp
parents:
diff changeset
322 if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
323
163892325845 Initial commit.
galaxyp
parents:
diff changeset
324 my $this_unversioned_id = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
325 my $version_number = $2;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
326 $logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
327 $header_ids[$inx] = $this_unversioned_id;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
328
163892325845 Initial commit.
galaxyp
parents:
diff changeset
329 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
330 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
331 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
332
163892325845 Initial commit.
galaxyp
parents:
diff changeset
333 foreach my $id (@header_ids) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
334 $logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
335
163892325845 Initial commit.
galaxyp
parents:
diff changeset
336 if ($match_logic eq 'literal') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
337
163892325845 Initial commit.
galaxyp
parents:
diff changeset
338 if (${$wanted}{$id}) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
339 $logger->info('Literal bingo, preserving sequence with ID ' . $id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
340 $found = 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
341 $extracted_seqs++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
342 last;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
343 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
344
163892325845 Initial commit.
galaxyp
parents:
diff changeset
345 } elsif ($match_logic eq 'regex') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
346
163892325845 Initial commit.
galaxyp
parents:
diff changeset
347 foreach my $regex (keys(%{$wanted})) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
348
163892325845 Initial commit.
galaxyp
parents:
diff changeset
349 if ($id =~ m/$regex/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
350 $logger->info('Regex bingo, preserving sequence with ID ' . $id);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
351 $found = 1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
352 $extracted_seqs++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
353 last;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
354 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
355 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
356 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
357 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
358 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
359 if ($found) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
360 print($path_to_fh $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
361 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
362
163892325845 Initial commit.
galaxyp
parents:
diff changeset
363 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
364
163892325845 Initial commit.
galaxyp
parents:
diff changeset
365 close($path_from_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
366 close($path_to_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
367
163892325845 Initial commit.
galaxyp
parents:
diff changeset
368 return ($extracted_seqs);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
369
163892325845 Initial commit.
galaxyp
parents:
diff changeset
370 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
371
163892325845 Initial commit.
galaxyp
parents:
diff changeset
372 sub _Usage {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
373
163892325845 Initial commit.
galaxyp
parents:
diff changeset
374 print STDERR "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
375 . "extractSeqsFromFasta.pl:\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
376 . " Extracts sequences from a multi-sequence FASTA file.\n" . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
377 . "Usage:\n" . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
378 . " extractSeqsFromFasta.pl options\n" . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
379 . "Available options are:\n" . "\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
380 . " -i [file] Input file in FASTA format.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
381 . " -o [file] Output file in FASTA format where the extracted files will be saved.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
382 . " -f [file] Filter file containing accession numbers or IDs that shoud be extracted from the input.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
383 . " (One accession/ID per line)\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
384 . " -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
385 . " matched to those in the FASTA file. Supported logic types:\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
386 . " literal for exact matching.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
387 . " regex for fuzzy matching using simple regular expressions (in Perl regex syntax).\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
388 . " -u Use unversioned accession numbers for matching (only with -m literal).\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
389 . " If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
390 . " running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
391 . " but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
392 . " Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
393 . " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
394 . "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
395 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
396
163892325845 Initial commit.
galaxyp
parents:
diff changeset
397 }