comparison fastx_toolkit-0.0.6/src/fastx_clipper/fastx_clipper.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 /*
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Affero General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Affero General Public License for more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 #include <cstddef>
19 #include <cstdlib>
20 #include <algorithm>
21 #include <ostream>
22 #include <iostream>
23 #include <string>
24 #include <vector>
25 #include <string.h>
26
27 #include "sequence_alignment.h"
28
29 #include <errno.h>
30 #include <err.h>
31
32 #include <config.h>
33
34 #include "fastx.h"
35 #include "fastx_args.h"
36
37
38 #define MAX_ADAPTER_LEN 100
39
40 const char* usage=
41 "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
42 "\n" \
43 "version " VERSION "\n" \
44 " [-h] = This helpful help screen.\n" \
45 " [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \
46 " [-l N] = discard sequences shorter than N nucleotides. default is 5.\n" \
47 " [-d N] = Keep the adapter and N bases after it.\n" \
48 " (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \
49 " [-c] = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \
50 " [-C] = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \
51 " [-k] = Report Adapter-Only sequences.\n" \
52 " [-n] = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \
53 " [-v] = Verbose - report number of sequences.\n" \
54 " If [-o] is specified, report will be printed to STDOUT.\n" \
55 " If [-o] is not specified (and output goes to STDOUT),\n" \
56 " report will be printed to STDERR.\n" \
57 " [-z] = Compress output with GZIP.\n" \
58 " [-D] = DEBUG output.\n" \
59 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
60 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
61 "\n";
62
63 //Default adapter - Dummy sequence
64 char adapter[MAX_ADAPTER_LEN]="CCTTAAGG";
65 unsigned int min_length=5;
66 int discard_unknown_bases=1;
67 int keep_delta=0;
68 int discard_non_clipped=0;
69 int discard_clipped=0;
70 int show_adapter_only=0;
71 int debug = 0 ;
72
73
74 //Statistics for verbose report
75 unsigned int count_input=0 ;
76 unsigned int count_discarded_too_short=0; // see [-l N] option
77 unsigned int count_discarded_adapter_at_index_zero=0; //empty sequences (after clipping)
78 unsigned int count_discarded_no_adapter_found=0; // see [-c] option
79 unsigned int count_discarded_adapter_found=0; // see [-C] option
80 unsigned int count_discarded_N=0; // see [-n]
81
82 FASTX fastx;
83 HalfLocalSequenceAlignment align;
84
85 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
86 {
87 switch(optc) {
88 case 'k':
89 show_adapter_only=1;
90 break;
91
92 case 'D':
93 debug++;
94 break ;
95
96 case 'c':
97 discard_non_clipped = 1;
98 break;
99
100 case 'C':
101 discard_clipped = 1 ;
102 break ;
103 case 'd':
104 if (optarg==NULL)
105 errx(1, "[-d] parameter requires an argument value");
106 keep_delta = strtoul(optarg,NULL,10);
107 if (keep_delta<0)
108 errx(1,"Invalid number bases to keep (-d %s)", optarg);
109 break;
110 case 'a':
111 strncpy(adapter,optarg,sizeof(adapter)-1);
112 //TODO:
113 //if (!valid_sequence_string(adapter))
114 // errx(1,"Invalid adapter string (-a %s)", adapter);
115 break ;
116
117 case 'l':
118 if (optarg==NULL)
119 errx(1,"[-l] parameter requires an argument value");
120
121 min_length = strtoul(optarg, NULL, 10);
122 break;
123
124 case 'n':
125 discard_unknown_bases = 0 ;
126 break;
127
128 default:
129 errx(1,"Unknown argument (%c)", optc ) ;
130
131 }
132 return 1;
133 }
134
135 int parse_commandline(int argc, char* argv[])
136 {
137
138 fastx_parse_cmdline(argc, argv, "kDCcd:a:s:l:n", parse_program_args);
139
140 if (keep_delta>0)
141 keep_delta += strlen(adapter);
142 return 1;
143 }
144
145 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const));
146 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results )
147 {
148 #if 0
149 int mismatches = alignment_results.mismatches ;
150
151 //The adapter(=target) is expected to align from the first base.
152 //If the start is not zero (=not aligned from first base),
153 //count each skipped base as a mismatch
154 mismatches += alignment_results.target_start ;
155
156 //The adapter is expected to align up to the end
157 //of the adapter(=target), or the end of the query.
158 //If it doesn't, count the un-aligned bases as mismatches
159 int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1);
160 int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1);
161
162 int missing_from_end = std::min(missing_from_query_end, missing_from_target_end);
163
164 mismatches += missing_from_end ;
165
166
167
168 std::cout << "Missing from start = " << alignment_results.target_start
169 << " Missing from end = " << missing_from_end
170 << " mismatches = " << mismatches
171 << std::endl;
172
173 if (mismatches > max_mismatches)
174 return -1;
175
176 return alignment_results.query_start;
177 #endif
178
179 int alignment_size = alignment_results.neutral_matches +
180 alignment_results.matches +
181 alignment_results.mismatches +
182 alignment_results.gaps ;
183
184 //No alignment at all?
185 if (alignment_size==0)
186 return -1;
187
188 //Any good alignment at the end of the query
189 //(even only a single nucleotide)
190 //Example:
191 // The adapter starts with CTGTAG, The Query ends with CT - it's a match.
192 if ( alignment_results.query_end == alignment_results.query_size-1
193 &&
194 alignment_results.mismatches == 0 ) {
195 //printf("--1\n");
196 return alignment_results.query_start ;
197 }
198
199 if ( alignment_size > 5
200 &&
201 alignment_results.target_start == 0
202 &&
203 (alignment_results.matches * 100 / alignment_size ) >= 75 ) {
204 //printf("--2\n");
205 return alignment_results.query_start ;
206 }
207
208 if ( alignment_size > 11
209 &&
210 (alignment_results.matches * 100 / alignment_size ) >= 80 ) {
211 //printf("--2\n");
212 return alignment_results.query_start ;
213 }
214
215 //
216 //Be very lenient regarding alignments at the end of the query sequence
217 if ( alignment_results.query_end >= alignment_results.query_size-2
218 &&
219 alignment_size <= 5 && alignment_results.matches >= 3) {
220 //printf("--3\n");
221 return alignment_results.query_start ;
222 }
223
224 return -1;
225 }
226
227
228 int main(int argc, char* argv[])
229 {
230 int i;
231 int reads_count;
232
233 parse_commandline(argc, argv);
234
235 fastx_init_reader(&fastx, get_input_filename(),
236 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
237
238 fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());
239
240 while ( fastx_read_next_record(&fastx) ) {
241
242 reads_count = get_reads_count(&fastx);
243
244 #if 0
245 std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' );
246 std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter);
247 #else
248 std::string query = std::string(fastx.nucleotides) ;
249 std::string target= std::string(adapter);
250 #endif
251
252
253 align.align( query, target ) ;
254
255 if (debug>1)
256 align.print_matrix();
257 if (debug>0)
258 align.results().print();
259
260 count_input+= reads_count;
261
262 //Find the best match with the adapter
263 i = adapter_cutoff_index ( align.results() ) ;
264
265 if (i!=-1 && i>0) {
266 i += keep_delta;
267 //Just trim the string after this position
268 fastx.nucleotides[i] = 0 ;
269 }
270
271 if (i==0) { // empty sequence ? (in which the adapter was found at index 0)
272 count_discarded_adapter_at_index_zero += reads_count;
273
274 if (show_adapter_only)
275 fastx_write_record(&fastx);
276 continue;
277 }
278
279 if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ?
280 count_discarded_too_short += reads_count;
281 continue;
282 }
283
284 if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ?
285 count_discarded_no_adapter_found += reads_count;
286 continue ;
287 }
288
289 if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences
290 count_discarded_adapter_found += reads_count;
291 continue;
292 }
293
294 if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ?
295 count_discarded_N += reads_count;
296 continue;
297 }
298
299 if (!show_adapter_only) {
300 //none of the above condition matched, so print this sequence.
301 fastx_write_record(&fastx);
302 }
303 }
304
305 //
306 //Print verbose report
307 if ( verbose_flag() ) {
308 fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter );
309 fprintf(get_report_file(), "Min. Length: %d\n", min_length) ;
310
311 if (discard_clipped)
312 fprintf(get_report_file(), "Clipped reads - discarded.\n" ) ;
313 if (discard_non_clipped)
314 fprintf(get_report_file(), "Non-Clipped reads - discarded.\n" ) ;
315
316
317 fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ;
318 fprintf(get_report_file(), "Output: %u reads.\n",
319 count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found -
320 count_discarded_N - count_discarded_adapter_at_index_zero ) ;
321
322 fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ;
323 fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero );
324 if (discard_non_clipped)
325 fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found );
326 if (discard_clipped)
327 fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found );
328 if (discard_unknown_bases)
329 fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N );
330 }
331
332 return 0;
333 }