Mercurial > repos > xilinxu > xilinxu
comparison fastx_toolkit-0.0.6/src/fastx_clipper/fastx_clipper.cpp @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:dfe9332138cf | 3:997f5136985f |
---|---|
1 /* | |
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools. | |
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu) | |
4 | |
5 This program is free software: you can redistribute it and/or modify | |
6 it under the terms of the GNU Affero General Public License as | |
7 published by the Free Software Foundation, either version 3 of the | |
8 License, or (at your option) any later version. | |
9 | |
10 This program is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU Affero General Public License for more details. | |
14 | |
15 You should have received a copy of the GNU Affero General Public License | |
16 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 */ | |
18 #include <cstddef> | |
19 #include <cstdlib> | |
20 #include <algorithm> | |
21 #include <ostream> | |
22 #include <iostream> | |
23 #include <string> | |
24 #include <vector> | |
25 #include <string.h> | |
26 | |
27 #include "sequence_alignment.h" | |
28 | |
29 #include <errno.h> | |
30 #include <err.h> | |
31 | |
32 #include <config.h> | |
33 | |
34 #include "fastx.h" | |
35 #include "fastx_args.h" | |
36 | |
37 | |
38 #define MAX_ADAPTER_LEN 100 | |
39 | |
40 const char* usage= | |
41 "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \ | |
42 "\n" \ | |
43 "version " VERSION "\n" \ | |
44 " [-h] = This helpful help screen.\n" \ | |
45 " [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \ | |
46 " [-l N] = discard sequences shorter than N nucleotides. default is 5.\n" \ | |
47 " [-d N] = Keep the adapter and N bases after it.\n" \ | |
48 " (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \ | |
49 " [-c] = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \ | |
50 " [-C] = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \ | |
51 " [-k] = Report Adapter-Only sequences.\n" \ | |
52 " [-n] = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \ | |
53 " [-v] = Verbose - report number of sequences.\n" \ | |
54 " If [-o] is specified, report will be printed to STDOUT.\n" \ | |
55 " If [-o] is not specified (and output goes to STDOUT),\n" \ | |
56 " report will be printed to STDERR.\n" \ | |
57 " [-z] = Compress output with GZIP.\n" \ | |
58 " [-D] = DEBUG output.\n" \ | |
59 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \ | |
60 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \ | |
61 "\n"; | |
62 | |
63 //Default adapter - Dummy sequence | |
64 char adapter[MAX_ADAPTER_LEN]="CCTTAAGG"; | |
65 unsigned int min_length=5; | |
66 int discard_unknown_bases=1; | |
67 int keep_delta=0; | |
68 int discard_non_clipped=0; | |
69 int discard_clipped=0; | |
70 int show_adapter_only=0; | |
71 int debug = 0 ; | |
72 | |
73 | |
74 //Statistics for verbose report | |
75 unsigned int count_input=0 ; | |
76 unsigned int count_discarded_too_short=0; // see [-l N] option | |
77 unsigned int count_discarded_adapter_at_index_zero=0; //empty sequences (after clipping) | |
78 unsigned int count_discarded_no_adapter_found=0; // see [-c] option | |
79 unsigned int count_discarded_adapter_found=0; // see [-C] option | |
80 unsigned int count_discarded_N=0; // see [-n] | |
81 | |
82 FASTX fastx; | |
83 HalfLocalSequenceAlignment align; | |
84 | |
85 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg) | |
86 { | |
87 switch(optc) { | |
88 case 'k': | |
89 show_adapter_only=1; | |
90 break; | |
91 | |
92 case 'D': | |
93 debug++; | |
94 break ; | |
95 | |
96 case 'c': | |
97 discard_non_clipped = 1; | |
98 break; | |
99 | |
100 case 'C': | |
101 discard_clipped = 1 ; | |
102 break ; | |
103 case 'd': | |
104 if (optarg==NULL) | |
105 errx(1, "[-d] parameter requires an argument value"); | |
106 keep_delta = strtoul(optarg,NULL,10); | |
107 if (keep_delta<0) | |
108 errx(1,"Invalid number bases to keep (-d %s)", optarg); | |
109 break; | |
110 case 'a': | |
111 strncpy(adapter,optarg,sizeof(adapter)-1); | |
112 //TODO: | |
113 //if (!valid_sequence_string(adapter)) | |
114 // errx(1,"Invalid adapter string (-a %s)", adapter); | |
115 break ; | |
116 | |
117 case 'l': | |
118 if (optarg==NULL) | |
119 errx(1,"[-l] parameter requires an argument value"); | |
120 | |
121 min_length = strtoul(optarg, NULL, 10); | |
122 break; | |
123 | |
124 case 'n': | |
125 discard_unknown_bases = 0 ; | |
126 break; | |
127 | |
128 default: | |
129 errx(1,"Unknown argument (%c)", optc ) ; | |
130 | |
131 } | |
132 return 1; | |
133 } | |
134 | |
135 int parse_commandline(int argc, char* argv[]) | |
136 { | |
137 | |
138 fastx_parse_cmdline(argc, argv, "kDCcd:a:s:l:n", parse_program_args); | |
139 | |
140 if (keep_delta>0) | |
141 keep_delta += strlen(adapter); | |
142 return 1; | |
143 } | |
144 | |
145 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const)); | |
146 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) | |
147 { | |
148 #if 0 | |
149 int mismatches = alignment_results.mismatches ; | |
150 | |
151 //The adapter(=target) is expected to align from the first base. | |
152 //If the start is not zero (=not aligned from first base), | |
153 //count each skipped base as a mismatch | |
154 mismatches += alignment_results.target_start ; | |
155 | |
156 //The adapter is expected to align up to the end | |
157 //of the adapter(=target), or the end of the query. | |
158 //If it doesn't, count the un-aligned bases as mismatches | |
159 int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1); | |
160 int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1); | |
161 | |
162 int missing_from_end = std::min(missing_from_query_end, missing_from_target_end); | |
163 | |
164 mismatches += missing_from_end ; | |
165 | |
166 | |
167 | |
168 std::cout << "Missing from start = " << alignment_results.target_start | |
169 << " Missing from end = " << missing_from_end | |
170 << " mismatches = " << mismatches | |
171 << std::endl; | |
172 | |
173 if (mismatches > max_mismatches) | |
174 return -1; | |
175 | |
176 return alignment_results.query_start; | |
177 #endif | |
178 | |
179 int alignment_size = alignment_results.neutral_matches + | |
180 alignment_results.matches + | |
181 alignment_results.mismatches + | |
182 alignment_results.gaps ; | |
183 | |
184 //No alignment at all? | |
185 if (alignment_size==0) | |
186 return -1; | |
187 | |
188 //Any good alignment at the end of the query | |
189 //(even only a single nucleotide) | |
190 //Example: | |
191 // The adapter starts with CTGTAG, The Query ends with CT - it's a match. | |
192 if ( alignment_results.query_end == alignment_results.query_size-1 | |
193 && | |
194 alignment_results.mismatches == 0 ) { | |
195 //printf("--1\n"); | |
196 return alignment_results.query_start ; | |
197 } | |
198 | |
199 if ( alignment_size > 5 | |
200 && | |
201 alignment_results.target_start == 0 | |
202 && | |
203 (alignment_results.matches * 100 / alignment_size ) >= 75 ) { | |
204 //printf("--2\n"); | |
205 return alignment_results.query_start ; | |
206 } | |
207 | |
208 if ( alignment_size > 11 | |
209 && | |
210 (alignment_results.matches * 100 / alignment_size ) >= 80 ) { | |
211 //printf("--2\n"); | |
212 return alignment_results.query_start ; | |
213 } | |
214 | |
215 // | |
216 //Be very lenient regarding alignments at the end of the query sequence | |
217 if ( alignment_results.query_end >= alignment_results.query_size-2 | |
218 && | |
219 alignment_size <= 5 && alignment_results.matches >= 3) { | |
220 //printf("--3\n"); | |
221 return alignment_results.query_start ; | |
222 } | |
223 | |
224 return -1; | |
225 } | |
226 | |
227 | |
228 int main(int argc, char* argv[]) | |
229 { | |
230 int i; | |
231 int reads_count; | |
232 | |
233 parse_commandline(argc, argv); | |
234 | |
235 fastx_init_reader(&fastx, get_input_filename(), | |
236 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE); | |
237 | |
238 fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag()); | |
239 | |
240 while ( fastx_read_next_record(&fastx) ) { | |
241 | |
242 reads_count = get_reads_count(&fastx); | |
243 | |
244 #if 0 | |
245 std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' ); | |
246 std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter); | |
247 #else | |
248 std::string query = std::string(fastx.nucleotides) ; | |
249 std::string target= std::string(adapter); | |
250 #endif | |
251 | |
252 | |
253 align.align( query, target ) ; | |
254 | |
255 if (debug>1) | |
256 align.print_matrix(); | |
257 if (debug>0) | |
258 align.results().print(); | |
259 | |
260 count_input+= reads_count; | |
261 | |
262 //Find the best match with the adapter | |
263 i = adapter_cutoff_index ( align.results() ) ; | |
264 | |
265 if (i!=-1 && i>0) { | |
266 i += keep_delta; | |
267 //Just trim the string after this position | |
268 fastx.nucleotides[i] = 0 ; | |
269 } | |
270 | |
271 if (i==0) { // empty sequence ? (in which the adapter was found at index 0) | |
272 count_discarded_adapter_at_index_zero += reads_count; | |
273 | |
274 if (show_adapter_only) | |
275 fastx_write_record(&fastx); | |
276 continue; | |
277 } | |
278 | |
279 if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ? | |
280 count_discarded_too_short += reads_count; | |
281 continue; | |
282 } | |
283 | |
284 if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ? | |
285 count_discarded_no_adapter_found += reads_count; | |
286 continue ; | |
287 } | |
288 | |
289 if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences | |
290 count_discarded_adapter_found += reads_count; | |
291 continue; | |
292 } | |
293 | |
294 if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ? | |
295 count_discarded_N += reads_count; | |
296 continue; | |
297 } | |
298 | |
299 if (!show_adapter_only) { | |
300 //none of the above condition matched, so print this sequence. | |
301 fastx_write_record(&fastx); | |
302 } | |
303 } | |
304 | |
305 // | |
306 //Print verbose report | |
307 if ( verbose_flag() ) { | |
308 fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter ); | |
309 fprintf(get_report_file(), "Min. Length: %d\n", min_length) ; | |
310 | |
311 if (discard_clipped) | |
312 fprintf(get_report_file(), "Clipped reads - discarded.\n" ) ; | |
313 if (discard_non_clipped) | |
314 fprintf(get_report_file(), "Non-Clipped reads - discarded.\n" ) ; | |
315 | |
316 | |
317 fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ; | |
318 fprintf(get_report_file(), "Output: %u reads.\n", | |
319 count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found - | |
320 count_discarded_N - count_discarded_adapter_at_index_zero ) ; | |
321 | |
322 fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ; | |
323 fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero ); | |
324 if (discard_non_clipped) | |
325 fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found ); | |
326 if (discard_clipped) | |
327 fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found ); | |
328 if (discard_unknown_bases) | |
329 fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N ); | |
330 } | |
331 | |
332 return 0; | |
333 } |