annotate fastx_toolkit-0.0.6/src/fastx_clipper/fastx_clipper.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
997f5136985f Uploaded
xilinxu
parents:
diff changeset
1 /*
997f5136985f Uploaded
xilinxu
parents:
diff changeset
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
4
997f5136985f Uploaded
xilinxu
parents:
diff changeset
5 This program is free software: you can redistribute it and/or modify
997f5136985f Uploaded
xilinxu
parents:
diff changeset
6 it under the terms of the GNU Affero General Public License as
997f5136985f Uploaded
xilinxu
parents:
diff changeset
7 published by the Free Software Foundation, either version 3 of the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
8 License, or (at your option) any later version.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
9
997f5136985f Uploaded
xilinxu
parents:
diff changeset
10 This program is distributed in the hope that it will be useful,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
997f5136985f Uploaded
xilinxu
parents:
diff changeset
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
13 GNU Affero General Public License for more details.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
14
997f5136985f Uploaded
xilinxu
parents:
diff changeset
15 You should have received a copy of the GNU Affero General Public License
997f5136985f Uploaded
xilinxu
parents:
diff changeset
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
17 */
997f5136985f Uploaded
xilinxu
parents:
diff changeset
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <ostream>
#include <iostream>
#include <string>
#include <vector>
#include <string.h>

#include "sequence_alignment.h"

#include <errno.h>
#include <err.h>

#include <config.h>

#include "fastx.h"
#include "fastx_args.h"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
36
997f5136985f Uploaded
xilinxu
parents:
diff changeset
37
997f5136985f Uploaded
xilinxu
parents:
diff changeset
38 #define MAX_ADAPTER_LEN 100
997f5136985f Uploaded
xilinxu
parents:
diff changeset
39
997f5136985f Uploaded
xilinxu
parents:
diff changeset
40 const char* usage=
997f5136985f Uploaded
xilinxu
parents:
diff changeset
41 "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
42 "\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
43 "version " VERSION "\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
44 " [-h] = This helpful help screen.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
45 " [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
46 " [-l N] = discard sequences shorter than N nucleotides. default is 5.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
47 " [-d N] = Keep the adapter and N bases after it.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
48 " (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
49 " [-c] = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
50 " [-C] = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
51 " [-k] = Report Adapter-Only sequences.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
52 " [-n] = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
53 " [-v] = Verbose - report number of sequences.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
54 " If [-o] is specified, report will be printed to STDOUT.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
55 " If [-o] is not specified (and output goes to STDOUT),\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
56 " report will be printed to STDERR.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
57 " [-z] = Compress output with GZIP.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
58 " [-D] = DEBUG output.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
59 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
60 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
61 "\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
62
997f5136985f Uploaded
xilinxu
parents:
diff changeset
63 //Default adapter - Dummy sequence
997f5136985f Uploaded
xilinxu
parents:
diff changeset
64 char adapter[MAX_ADAPTER_LEN]="CCTTAAGG";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
65 unsigned int min_length=5;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
66 int discard_unknown_bases=1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
67 int keep_delta=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
68 int discard_non_clipped=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
69 int discard_clipped=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
70 int show_adapter_only=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
71 int debug = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
72
997f5136985f Uploaded
xilinxu
parents:
diff changeset
73
997f5136985f Uploaded
xilinxu
parents:
diff changeset
74 //Statistics for verbose report
997f5136985f Uploaded
xilinxu
parents:
diff changeset
75 unsigned int count_input=0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
76 unsigned int count_discarded_too_short=0; // see [-l N] option
997f5136985f Uploaded
xilinxu
parents:
diff changeset
77 unsigned int count_discarded_adapter_at_index_zero=0; //empty sequences (after clipping)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
78 unsigned int count_discarded_no_adapter_found=0; // see [-c] option
997f5136985f Uploaded
xilinxu
parents:
diff changeset
79 unsigned int count_discarded_adapter_found=0; // see [-C] option
997f5136985f Uploaded
xilinxu
parents:
diff changeset
80 unsigned int count_discarded_N=0; // see [-n]
997f5136985f Uploaded
xilinxu
parents:
diff changeset
81
997f5136985f Uploaded
xilinxu
parents:
diff changeset
82 FASTX fastx;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
83 HalfLocalSequenceAlignment align;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
84
997f5136985f Uploaded
xilinxu
parents:
diff changeset
85 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
86 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
87 switch(optc) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
88 case 'k':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
89 show_adapter_only=1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
90 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
91
997f5136985f Uploaded
xilinxu
parents:
diff changeset
92 case 'D':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
93 debug++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
94 break ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
95
997f5136985f Uploaded
xilinxu
parents:
diff changeset
96 case 'c':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
97 discard_non_clipped = 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
98 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
99
997f5136985f Uploaded
xilinxu
parents:
diff changeset
100 case 'C':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
101 discard_clipped = 1 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
102 break ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
103 case 'd':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
104 if (optarg==NULL)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
105 errx(1, "[-d] parameter requires an argument value");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
106 keep_delta = strtoul(optarg,NULL,10);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
107 if (keep_delta<0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
108 errx(1,"Invalid number bases to keep (-d %s)", optarg);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
109 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
110 case 'a':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
111 strncpy(adapter,optarg,sizeof(adapter)-1);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
112 //TODO:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
113 //if (!valid_sequence_string(adapter))
997f5136985f Uploaded
xilinxu
parents:
diff changeset
114 // errx(1,"Invalid adapter string (-a %s)", adapter);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
115 break ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
116
997f5136985f Uploaded
xilinxu
parents:
diff changeset
117 case 'l':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
118 if (optarg==NULL)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
119 errx(1,"[-l] parameter requires an argument value");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
120
997f5136985f Uploaded
xilinxu
parents:
diff changeset
121 min_length = strtoul(optarg, NULL, 10);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
122 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
123
997f5136985f Uploaded
xilinxu
parents:
diff changeset
124 case 'n':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
125 discard_unknown_bases = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
126 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
127
997f5136985f Uploaded
xilinxu
parents:
diff changeset
128 default:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
129 errx(1,"Unknown argument (%c)", optc ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
130
997f5136985f Uploaded
xilinxu
parents:
diff changeset
131 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
132 return 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
133 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
134
997f5136985f Uploaded
xilinxu
parents:
diff changeset
135 int parse_commandline(int argc, char* argv[])
997f5136985f Uploaded
xilinxu
parents:
diff changeset
136 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
137
997f5136985f Uploaded
xilinxu
parents:
diff changeset
138 fastx_parse_cmdline(argc, argv, "kDCcd:a:s:l:n", parse_program_args);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
139
997f5136985f Uploaded
xilinxu
parents:
diff changeset
140 if (keep_delta>0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
141 keep_delta += strlen(adapter);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
142 return 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
143 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
144
997f5136985f Uploaded
xilinxu
parents:
diff changeset
145 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const));
997f5136985f Uploaded
xilinxu
parents:
diff changeset
146 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results )
997f5136985f Uploaded
xilinxu
parents:
diff changeset
147 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
148 #if 0
997f5136985f Uploaded
xilinxu
parents:
diff changeset
149 int mismatches = alignment_results.mismatches ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
150
997f5136985f Uploaded
xilinxu
parents:
diff changeset
151 //The adapter(=target) is expected to align from the first base.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
152 //If the start is not zero (=not aligned from first base),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
153 //count each skipped base as a mismatch
997f5136985f Uploaded
xilinxu
parents:
diff changeset
154 mismatches += alignment_results.target_start ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
155
997f5136985f Uploaded
xilinxu
parents:
diff changeset
156 //The adapter is expected to align up to the end
997f5136985f Uploaded
xilinxu
parents:
diff changeset
157 //of the adapter(=target), or the end of the query.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
158 //If it doesn't, count the un-aligned bases as mismatches
997f5136985f Uploaded
xilinxu
parents:
diff changeset
159 int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
160 int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
161
997f5136985f Uploaded
xilinxu
parents:
diff changeset
162 int missing_from_end = std::min(missing_from_query_end, missing_from_target_end);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
163
997f5136985f Uploaded
xilinxu
parents:
diff changeset
164 mismatches += missing_from_end ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
165
997f5136985f Uploaded
xilinxu
parents:
diff changeset
166
997f5136985f Uploaded
xilinxu
parents:
diff changeset
167
997f5136985f Uploaded
xilinxu
parents:
diff changeset
168 std::cout << "Missing from start = " << alignment_results.target_start
997f5136985f Uploaded
xilinxu
parents:
diff changeset
169 << " Missing from end = " << missing_from_end
997f5136985f Uploaded
xilinxu
parents:
diff changeset
170 << " mismatches = " << mismatches
997f5136985f Uploaded
xilinxu
parents:
diff changeset
171 << std::endl;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
172
997f5136985f Uploaded
xilinxu
parents:
diff changeset
173 if (mismatches > max_mismatches)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
174 return -1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
175
997f5136985f Uploaded
xilinxu
parents:
diff changeset
176 return alignment_results.query_start;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
177 #endif
997f5136985f Uploaded
xilinxu
parents:
diff changeset
178
997f5136985f Uploaded
xilinxu
parents:
diff changeset
179 int alignment_size = alignment_results.neutral_matches +
997f5136985f Uploaded
xilinxu
parents:
diff changeset
180 alignment_results.matches +
997f5136985f Uploaded
xilinxu
parents:
diff changeset
181 alignment_results.mismatches +
997f5136985f Uploaded
xilinxu
parents:
diff changeset
182 alignment_results.gaps ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
183
997f5136985f Uploaded
xilinxu
parents:
diff changeset
184 //No alignment at all?
997f5136985f Uploaded
xilinxu
parents:
diff changeset
185 if (alignment_size==0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
186 return -1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
187
997f5136985f Uploaded
xilinxu
parents:
diff changeset
188 //Any good alignment at the end of the query
997f5136985f Uploaded
xilinxu
parents:
diff changeset
189 //(even only a single nucleotide)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
190 //Example:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
191 // The adapter starts with CTGTAG, The Query ends with CT - it's a match.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
192 if ( alignment_results.query_end == alignment_results.query_size-1
997f5136985f Uploaded
xilinxu
parents:
diff changeset
193 &&
997f5136985f Uploaded
xilinxu
parents:
diff changeset
194 alignment_results.mismatches == 0 ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
195 //printf("--1\n");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
196 return alignment_results.query_start ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
197 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
198
997f5136985f Uploaded
xilinxu
parents:
diff changeset
199 if ( alignment_size > 5
997f5136985f Uploaded
xilinxu
parents:
diff changeset
200 &&
997f5136985f Uploaded
xilinxu
parents:
diff changeset
201 alignment_results.target_start == 0
997f5136985f Uploaded
xilinxu
parents:
diff changeset
202 &&
997f5136985f Uploaded
xilinxu
parents:
diff changeset
203 (alignment_results.matches * 100 / alignment_size ) >= 75 ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
204 //printf("--2\n");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
205 return alignment_results.query_start ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
206 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
207
997f5136985f Uploaded
xilinxu
parents:
diff changeset
208 if ( alignment_size > 11
997f5136985f Uploaded
xilinxu
parents:
diff changeset
209 &&
997f5136985f Uploaded
xilinxu
parents:
diff changeset
210 (alignment_results.matches * 100 / alignment_size ) >= 80 ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
211 //printf("--2\n");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
212 return alignment_results.query_start ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
213 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
214
997f5136985f Uploaded
xilinxu
parents:
diff changeset
215 //
997f5136985f Uploaded
xilinxu
parents:
diff changeset
216 //Be very lenient regarding alignments at the end of the query sequence
997f5136985f Uploaded
xilinxu
parents:
diff changeset
217 if ( alignment_results.query_end >= alignment_results.query_size-2
997f5136985f Uploaded
xilinxu
parents:
diff changeset
218 &&
997f5136985f Uploaded
xilinxu
parents:
diff changeset
219 alignment_size <= 5 && alignment_results.matches >= 3) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
220 //printf("--3\n");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
221 return alignment_results.query_start ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
222 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
223
997f5136985f Uploaded
xilinxu
parents:
diff changeset
224 return -1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
225 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
226
997f5136985f Uploaded
xilinxu
parents:
diff changeset
227
997f5136985f Uploaded
xilinxu
parents:
diff changeset
228 int main(int argc, char* argv[])
997f5136985f Uploaded
xilinxu
parents:
diff changeset
229 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
230 int i;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
231 int reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
232
997f5136985f Uploaded
xilinxu
parents:
diff changeset
233 parse_commandline(argc, argv);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
234
997f5136985f Uploaded
xilinxu
parents:
diff changeset
235 fastx_init_reader(&fastx, get_input_filename(),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
236 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
237
997f5136985f Uploaded
xilinxu
parents:
diff changeset
238 fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());
997f5136985f Uploaded
xilinxu
parents:
diff changeset
239
997f5136985f Uploaded
xilinxu
parents:
diff changeset
240 while ( fastx_read_next_record(&fastx) ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
241
997f5136985f Uploaded
xilinxu
parents:
diff changeset
242 reads_count = get_reads_count(&fastx);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
243
997f5136985f Uploaded
xilinxu
parents:
diff changeset
244 #if 0
997f5136985f Uploaded
xilinxu
parents:
diff changeset
245 std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
246 std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
247 #else
997f5136985f Uploaded
xilinxu
parents:
diff changeset
248 std::string query = std::string(fastx.nucleotides) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
249 std::string target= std::string(adapter);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
250 #endif
997f5136985f Uploaded
xilinxu
parents:
diff changeset
251
997f5136985f Uploaded
xilinxu
parents:
diff changeset
252
997f5136985f Uploaded
xilinxu
parents:
diff changeset
253 align.align( query, target ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
254
997f5136985f Uploaded
xilinxu
parents:
diff changeset
255 if (debug>1)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
256 align.print_matrix();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
257 if (debug>0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
258 align.results().print();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
259
997f5136985f Uploaded
xilinxu
parents:
diff changeset
260 count_input+= reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
261
997f5136985f Uploaded
xilinxu
parents:
diff changeset
262 //Find the best match with the adapter
997f5136985f Uploaded
xilinxu
parents:
diff changeset
263 i = adapter_cutoff_index ( align.results() ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
264
997f5136985f Uploaded
xilinxu
parents:
diff changeset
265 if (i!=-1 && i>0) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
266 i += keep_delta;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
267 //Just trim the string after this position
997f5136985f Uploaded
xilinxu
parents:
diff changeset
268 fastx.nucleotides[i] = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
269 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
270
997f5136985f Uploaded
xilinxu
parents:
diff changeset
271 if (i==0) { // empty sequence ? (in which the adapter was found at index 0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
272 count_discarded_adapter_at_index_zero += reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
273
997f5136985f Uploaded
xilinxu
parents:
diff changeset
274 if (show_adapter_only)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
275 fastx_write_record(&fastx);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
276 continue;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
277 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
278
997f5136985f Uploaded
xilinxu
parents:
diff changeset
279 if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ?
997f5136985f Uploaded
xilinxu
parents:
diff changeset
280 count_discarded_too_short += reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
281 continue;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
282 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
283
997f5136985f Uploaded
xilinxu
parents:
diff changeset
284 if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ?
997f5136985f Uploaded
xilinxu
parents:
diff changeset
285 count_discarded_no_adapter_found += reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
286 continue ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
287 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
288
997f5136985f Uploaded
xilinxu
parents:
diff changeset
289 if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences
997f5136985f Uploaded
xilinxu
parents:
diff changeset
290 count_discarded_adapter_found += reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
291 continue;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
292 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
293
997f5136985f Uploaded
xilinxu
parents:
diff changeset
294 if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ?
997f5136985f Uploaded
xilinxu
parents:
diff changeset
295 count_discarded_N += reads_count;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
296 continue;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
297 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
298
997f5136985f Uploaded
xilinxu
parents:
diff changeset
299 if (!show_adapter_only) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
300 //none of the above condition matched, so print this sequence.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
301 fastx_write_record(&fastx);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
302 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
303 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
304
997f5136985f Uploaded
xilinxu
parents:
diff changeset
305 //
997f5136985f Uploaded
xilinxu
parents:
diff changeset
306 //Print verbose report
997f5136985f Uploaded
xilinxu
parents:
diff changeset
307 if ( verbose_flag() ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
308 fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
309 fprintf(get_report_file(), "Min. Length: %d\n", min_length) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
310
997f5136985f Uploaded
xilinxu
parents:
diff changeset
311 if (discard_clipped)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
312 fprintf(get_report_file(), "Clipped reads - discarded.\n" ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
313 if (discard_non_clipped)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
314 fprintf(get_report_file(), "Non-Clipped reads - discarded.\n" ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
315
997f5136985f Uploaded
xilinxu
parents:
diff changeset
316
997f5136985f Uploaded
xilinxu
parents:
diff changeset
317 fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
318 fprintf(get_report_file(), "Output: %u reads.\n",
997f5136985f Uploaded
xilinxu
parents:
diff changeset
319 count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found -
997f5136985f Uploaded
xilinxu
parents:
diff changeset
320 count_discarded_N - count_discarded_adapter_at_index_zero ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
321
997f5136985f Uploaded
xilinxu
parents:
diff changeset
322 fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
323 fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
324 if (discard_non_clipped)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
325 fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
326 if (discard_clipped)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
327 fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
328 if (discard_unknown_bases)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
329 fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
330 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
331
997f5136985f Uploaded
xilinxu
parents:
diff changeset
332 return 0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
333 }