3
|
1 /*
|
|
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
|
|
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
|
|
4
|
|
5 This program is free software: you can redistribute it and/or modify
|
|
6 it under the terms of the GNU Affero General Public License as
|
|
7 published by the Free Software Foundation, either version 3 of the
|
|
8 License, or (at your option) any later version.
|
|
9
|
|
10 This program is distributed in the hope that it will be useful,
|
|
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 GNU Affero General Public License for more details.
|
|
14
|
|
15 You should have received a copy of the GNU Affero General Public License
|
|
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
17 */
|
|
18 #include <cstddef>
|
|
19 #include <cstdlib>
|
|
20 #include <algorithm>
|
|
21 #include <ostream>
|
|
22 #include <iostream>
|
|
23 #include <string>
|
|
24 #include <vector>
|
|
25 #include <string.h>
|
|
26
|
|
27 #include "sequence_alignment.h"
|
|
28
|
|
29 #include <errno.h>
|
|
30 #include <err.h>
|
|
31
|
|
32 #include <config.h>
|
|
33
|
|
34 #include "fastx.h"
|
|
35 #include "fastx_args.h"
|
|
36
|
|
37
|
|
// Size, in bytes, of the global adapter buffer (this includes the terminating NUL).
#define MAX_ADAPTER_LEN 100
|
|
39
|
|
40 const char* usage=
|
|
41 "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
|
|
42 "\n" \
|
|
43 "version " VERSION "\n" \
|
|
44 " [-h] = This helpful help screen.\n" \
|
|
45 " [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \
|
|
46 " [-l N] = discard sequences shorter than N nucleotides. default is 5.\n" \
|
|
47 " [-d N] = Keep the adapter and N bases after it.\n" \
|
|
48 " (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \
|
|
49 " [-c] = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \
|
|
50 " [-C] = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \
|
|
51 " [-k] = Report Adapter-Only sequences.\n" \
|
|
52 " [-n] = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \
|
|
53 " [-v] = Verbose - report number of sequences.\n" \
|
|
54 " If [-o] is specified, report will be printed to STDOUT.\n" \
|
|
55 " If [-o] is not specified (and output goes to STDOUT),\n" \
|
|
56 " report will be printed to STDERR.\n" \
|
|
57 " [-z] = Compress output with GZIP.\n" \
|
|
58 " [-D] = DEBUG output.\n" \
|
|
59 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
|
|
60 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
|
|
61 "\n";
|
|
62
|
|
63 //Default adapter - Dummy sequence
|
|
64 char adapter[MAX_ADAPTER_LEN]="CCTTAAGG";
|
|
65 unsigned int min_length=5;
|
|
66 int discard_unknown_bases=1;
|
|
67 int keep_delta=0;
|
|
68 int discard_non_clipped=0;
|
|
69 int discard_clipped=0;
|
|
70 int show_adapter_only=0;
|
|
71 int debug = 0 ;
|
|
72
|
|
73
|
|
74 //Statistics for verbose report
|
|
75 unsigned int count_input=0 ;
|
|
76 unsigned int count_discarded_too_short=0; // see [-l N] option
|
|
77 unsigned int count_discarded_adapter_at_index_zero=0; //empty sequences (after clipping)
|
|
78 unsigned int count_discarded_no_adapter_found=0; // see [-c] option
|
|
79 unsigned int count_discarded_adapter_found=0; // see [-C] option
|
|
80 unsigned int count_discarded_N=0; // see [-n]
|
|
81
|
|
82 FASTX fastx;
|
|
83 HalfLocalSequenceAlignment align;
|
|
84
|
|
85 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
|
|
86 {
|
|
87 switch(optc) {
|
|
88 case 'k':
|
|
89 show_adapter_only=1;
|
|
90 break;
|
|
91
|
|
92 case 'D':
|
|
93 debug++;
|
|
94 break ;
|
|
95
|
|
96 case 'c':
|
|
97 discard_non_clipped = 1;
|
|
98 break;
|
|
99
|
|
100 case 'C':
|
|
101 discard_clipped = 1 ;
|
|
102 break ;
|
|
103 case 'd':
|
|
104 if (optarg==NULL)
|
|
105 errx(1, "[-d] parameter requires an argument value");
|
|
106 keep_delta = strtoul(optarg,NULL,10);
|
|
107 if (keep_delta<0)
|
|
108 errx(1,"Invalid number bases to keep (-d %s)", optarg);
|
|
109 break;
|
|
110 case 'a':
|
|
111 strncpy(adapter,optarg,sizeof(adapter)-1);
|
|
112 //TODO:
|
|
113 //if (!valid_sequence_string(adapter))
|
|
114 // errx(1,"Invalid adapter string (-a %s)", adapter);
|
|
115 break ;
|
|
116
|
|
117 case 'l':
|
|
118 if (optarg==NULL)
|
|
119 errx(1,"[-l] parameter requires an argument value");
|
|
120
|
|
121 min_length = strtoul(optarg, NULL, 10);
|
|
122 break;
|
|
123
|
|
124 case 'n':
|
|
125 discard_unknown_bases = 0 ;
|
|
126 break;
|
|
127
|
|
128 default:
|
|
129 errx(1,"Unknown argument (%c)", optc ) ;
|
|
130
|
|
131 }
|
|
132 return 1;
|
|
133 }
|
|
134
|
|
135 int parse_commandline(int argc, char* argv[])
|
|
136 {
|
|
137
|
|
138 fastx_parse_cmdline(argc, argv, "kDCcd:a:s:l:n", parse_program_args);
|
|
139
|
|
140 if (keep_delta>0)
|
|
141 keep_delta += strlen(adapter);
|
|
142 return 1;
|
|
143 }
|
|
144
|
|
145 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const));
|
|
146 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results )
|
|
147 {
|
|
148 #if 0
|
|
149 int mismatches = alignment_results.mismatches ;
|
|
150
|
|
151 //The adapter(=target) is expected to align from the first base.
|
|
152 //If the start is not zero (=not aligned from first base),
|
|
153 //count each skipped base as a mismatch
|
|
154 mismatches += alignment_results.target_start ;
|
|
155
|
|
156 //The adapter is expected to align up to the end
|
|
157 //of the adapter(=target), or the end of the query.
|
|
158 //If it doesn't, count the un-aligned bases as mismatches
|
|
159 int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1);
|
|
160 int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1);
|
|
161
|
|
162 int missing_from_end = std::min(missing_from_query_end, missing_from_target_end);
|
|
163
|
|
164 mismatches += missing_from_end ;
|
|
165
|
|
166
|
|
167
|
|
168 std::cout << "Missing from start = " << alignment_results.target_start
|
|
169 << " Missing from end = " << missing_from_end
|
|
170 << " mismatches = " << mismatches
|
|
171 << std::endl;
|
|
172
|
|
173 if (mismatches > max_mismatches)
|
|
174 return -1;
|
|
175
|
|
176 return alignment_results.query_start;
|
|
177 #endif
|
|
178
|
|
179 int alignment_size = alignment_results.neutral_matches +
|
|
180 alignment_results.matches +
|
|
181 alignment_results.mismatches +
|
|
182 alignment_results.gaps ;
|
|
183
|
|
184 //No alignment at all?
|
|
185 if (alignment_size==0)
|
|
186 return -1;
|
|
187
|
|
188 //Any good alignment at the end of the query
|
|
189 //(even only a single nucleotide)
|
|
190 //Example:
|
|
191 // The adapter starts with CTGTAG, The Query ends with CT - it's a match.
|
|
192 if ( alignment_results.query_end == alignment_results.query_size-1
|
|
193 &&
|
|
194 alignment_results.mismatches == 0 ) {
|
|
195 //printf("--1\n");
|
|
196 return alignment_results.query_start ;
|
|
197 }
|
|
198
|
|
199 if ( alignment_size > 5
|
|
200 &&
|
|
201 alignment_results.target_start == 0
|
|
202 &&
|
|
203 (alignment_results.matches * 100 / alignment_size ) >= 75 ) {
|
|
204 //printf("--2\n");
|
|
205 return alignment_results.query_start ;
|
|
206 }
|
|
207
|
|
208 if ( alignment_size > 11
|
|
209 &&
|
|
210 (alignment_results.matches * 100 / alignment_size ) >= 80 ) {
|
|
211 //printf("--2\n");
|
|
212 return alignment_results.query_start ;
|
|
213 }
|
|
214
|
|
215 //
|
|
216 //Be very lenient regarding alignments at the end of the query sequence
|
|
217 if ( alignment_results.query_end >= alignment_results.query_size-2
|
|
218 &&
|
|
219 alignment_size <= 5 && alignment_results.matches >= 3) {
|
|
220 //printf("--3\n");
|
|
221 return alignment_results.query_start ;
|
|
222 }
|
|
223
|
|
224 return -1;
|
|
225 }
|
|
226
|
|
227
|
|
228 int main(int argc, char* argv[])
|
|
229 {
|
|
230 int i;
|
|
231 int reads_count;
|
|
232
|
|
233 parse_commandline(argc, argv);
|
|
234
|
|
235 fastx_init_reader(&fastx, get_input_filename(),
|
|
236 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
|
|
237
|
|
238 fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());
|
|
239
|
|
240 while ( fastx_read_next_record(&fastx) ) {
|
|
241
|
|
242 reads_count = get_reads_count(&fastx);
|
|
243
|
|
244 #if 0
|
|
245 std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' );
|
|
246 std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter);
|
|
247 #else
|
|
248 std::string query = std::string(fastx.nucleotides) ;
|
|
249 std::string target= std::string(adapter);
|
|
250 #endif
|
|
251
|
|
252
|
|
253 align.align( query, target ) ;
|
|
254
|
|
255 if (debug>1)
|
|
256 align.print_matrix();
|
|
257 if (debug>0)
|
|
258 align.results().print();
|
|
259
|
|
260 count_input+= reads_count;
|
|
261
|
|
262 //Find the best match with the adapter
|
|
263 i = adapter_cutoff_index ( align.results() ) ;
|
|
264
|
|
265 if (i!=-1 && i>0) {
|
|
266 i += keep_delta;
|
|
267 //Just trim the string after this position
|
|
268 fastx.nucleotides[i] = 0 ;
|
|
269 }
|
|
270
|
|
271 if (i==0) { // empty sequence ? (in which the adapter was found at index 0)
|
|
272 count_discarded_adapter_at_index_zero += reads_count;
|
|
273
|
|
274 if (show_adapter_only)
|
|
275 fastx_write_record(&fastx);
|
|
276 continue;
|
|
277 }
|
|
278
|
|
279 if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ?
|
|
280 count_discarded_too_short += reads_count;
|
|
281 continue;
|
|
282 }
|
|
283
|
|
284 if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ?
|
|
285 count_discarded_no_adapter_found += reads_count;
|
|
286 continue ;
|
|
287 }
|
|
288
|
|
289 if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences
|
|
290 count_discarded_adapter_found += reads_count;
|
|
291 continue;
|
|
292 }
|
|
293
|
|
294 if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ?
|
|
295 count_discarded_N += reads_count;
|
|
296 continue;
|
|
297 }
|
|
298
|
|
299 if (!show_adapter_only) {
|
|
300 //none of the above condition matched, so print this sequence.
|
|
301 fastx_write_record(&fastx);
|
|
302 }
|
|
303 }
|
|
304
|
|
305 //
|
|
306 //Print verbose report
|
|
307 if ( verbose_flag() ) {
|
|
308 fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter );
|
|
309 fprintf(get_report_file(), "Min. Length: %d\n", min_length) ;
|
|
310
|
|
311 if (discard_clipped)
|
|
312 fprintf(get_report_file(), "Clipped reads - discarded.\n" ) ;
|
|
313 if (discard_non_clipped)
|
|
314 fprintf(get_report_file(), "Non-Clipped reads - discarded.\n" ) ;
|
|
315
|
|
316
|
|
317 fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ;
|
|
318 fprintf(get_report_file(), "Output: %u reads.\n",
|
|
319 count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found -
|
|
320 count_discarded_N - count_discarded_adapter_at_index_zero ) ;
|
|
321
|
|
322 fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ;
|
|
323 fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero );
|
|
324 if (discard_non_clipped)
|
|
325 fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found );
|
|
326 if (discard_clipped)
|
|
327 fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found );
|
|
328 if (discard_unknown_bases)
|
|
329 fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N );
|
|
330 }
|
|
331
|
|
332 return 0;
|
|
333 }
|