3
|
1 /*
|
|
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
|
|
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
|
|
4
|
|
5 This program is free software: you can redistribute it and/or modify
|
|
6 it under the terms of the GNU Affero General Public License as
|
|
7 published by the Free Software Foundation, either version 3 of the
|
|
8 License, or (at your option) any later version.
|
|
9
|
|
10 This program is distributed in the hope that it will be useful,
|
|
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 GNU Affero General Public License for more details.
|
|
14
|
|
15 You should have received a copy of the GNU Affero General Public License
|
|
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
17 */
|
|
18 #include <limits.h>
|
|
19 #include <stdio.h>
|
|
20 #include <stdlib.h>
|
|
21 #include <string.h>
|
|
22 #include <getopt.h>
|
|
23 #include <errno.h>
|
|
24 #include <err.h>
|
|
25
|
|
26 #include <config.h>
|
|
27
|
|
28 #include "fastx.h"
|
|
29 #include "fastx_args.h"
|
|
30
|
|
31 #define MAX_ADAPTER_LEN 100
|
|
32
|
|
33 const char* usage=
|
|
34 "usage: fastq_quality_filter [-h] [-v] [-q N] [-p N] [-z] [-i INFILE] [-o OUTFILE]\n" \
|
|
35 "\n" \
|
|
36 "version " VERSION "\n" \
|
|
37 " [-h] = This helpful help screen.\n" \
|
|
38 " [-q N] = Minimum quality score to keep.\n" \
|
|
39 " [-p N] = Minimum percent of bases that must have [-q] quality.\n" \
|
|
40 " [-z] = Compress output with GZIP.\n" \
|
|
41 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
|
|
42 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
|
|
43 " [-v] = Verbose - report number of sequences.\n" \
|
|
44 " If [-o] is specified, report will be printed to STDOUT.\n" \
|
|
45 " If [-o] is not specified (and output goes to STDOUT),\n" \
|
|
46 " report will be printed to STDERR.\n" \
|
|
47 "\n";
|
|
48
|
|
49 #define DO_NOT_TRIM_LAST_BASE (0)
|
|
50
|
|
51 int min_quality=0;
|
|
52 int min_percent=0;
|
|
53
|
|
54 FASTX fastx;
|
|
55
|
|
56 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
|
|
57 {
|
|
58 switch(optc) {
|
|
59 case 'q':
|
|
60 if (optarg==NULL)
|
|
61 errx(1, "[-q] parameter requires an argument value");
|
|
62 min_quality = strtoul(optarg,NULL,10);
|
|
63 break;
|
|
64
|
|
65 case 'p':
|
|
66 if (optarg==NULL)
|
|
67 errx(1, "[-l] parameter requires an argument value");
|
|
68 min_percent = strtoul(optarg,NULL,10);
|
|
69 if (min_percent<=0 || min_percent>100)
|
|
70 errx(1,"Invalid percent value (-p %s)", optarg);
|
|
71 break;
|
|
72 default:
|
|
73 errx(1, __FILE__ ":%d: Unknown argument (%c)", __LINE__, optc ) ;
|
|
74 }
|
|
75 return 1;
|
|
76 }
|
|
77
|
|
/*
 * Walk a histogram (array[i] = number of elements with value i) from the
 * lowest index upward, skipping n elements, and return the index of the
 * bucket reached.
 *
 *   array      - histogram counts.
 *   array_size - number of entries in array.
 *   n          - number of elements to skip, counted from the lowest bucket.
 *
 * Returns the index of the first non-empty bucket when n <= 0, and
 * array_size when n is greater than or equal to the total of all counts
 * (every bucket was consumed).
 *
 * An array with no non-empty bucket is treated as a programming error and
 * terminates the program through errx().
 */
int get_index_of_nth_element(int *array, int array_size, int n)
{
	int pos = 0;

	/* Find the first non-empty index. */
	while (pos < array_size && array[pos] == 0)
		pos++;

	if (pos >= array_size)
		errx(1, "bug: got empty array at %s:%d", __FILE__, __LINE__);

	/* The bounds test comes first so array[array_size] is never read. */
	while (n > 0 && pos < array_size) {
		if (array[pos] > n)
			break;
		n -= array[pos];
		pos++;
		/* Advance to the next non-empty bucket, if any. */
		while (pos < array_size && array[pos] == 0)
			pos++;
	}
	return pos;
}
|
|
109
|
|
110 int get_percentile_quality(const FASTX *fastx, int percentile)
|
|
111 {
|
|
112 size_t i;
|
|
113 int count=0;
|
|
114 int quality_values[QUALITY_VALUES_RANGE];
|
|
115
|
|
116 memset(quality_values, 0, sizeof(quality_values));
|
|
117
|
|
118 for (i=0; i< strlen(fastx->nucleotides); i++) {
|
|
119 count++;
|
|
120 quality_values[ fastx->quality[i] - MIN_QUALITY_VALUE ] ++ ;
|
|
121 }
|
|
122
|
|
123 i = get_index_of_nth_element(quality_values, QUALITY_VALUES_RANGE, (count * (100-percentile) / 100));
|
|
124
|
|
125 //printf(" n = %d, i = %d, i+MIN_QUAL_VALUE=%d\n",
|
|
126 // (count*(100-percentile)/100), i, i+MIN_QUALITY_VALUE) ;
|
|
127
|
|
128 return i + MIN_QUALITY_VALUE ;
|
|
129 }
|
|
130
|
|
131 int main(int argc, char* argv[])
|
|
132 {
|
|
133 fastx_parse_cmdline(argc, argv, "q:p:", parse_program_args);
|
|
134
|
|
135 fastx_init_reader(&fastx, get_input_filename(),
|
|
136 FASTQ_ONLY, ALLOW_N, REQUIRE_UPPERCASE);
|
|
137
|
|
138 fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());
|
|
139
|
|
140 while ( fastx_read_next_record(&fastx) ) {
|
|
141 #if 0
|
|
142 fprintf(stderr, "%s\n", fastx.nucleotides ) ;
|
|
143 for (i=0; i<strlen(fastx.nucleotides); i++) {
|
|
144 fprintf(stderr,"%d ", fastx.quality[i]);
|
|
145 }
|
|
146 fprintf(stderr,"\n");
|
|
147 #endif
|
|
148
|
|
149 int value = get_percentile_quality(&fastx, min_percent);
|
|
150
|
|
151 //fprintf(stderr, "value = %d\n\n", value ) ;
|
|
152
|
|
153
|
|
154 if (value >= min_quality) {
|
|
155 fastx_write_record(&fastx);
|
|
156 } else {
|
|
157 // fprintf(stderr, "%s\n", fastx.nucleotides ) ;
|
|
158 // fprintf(stderr, "value = %d\n", value ) ;
|
|
159 }
|
|
160 }
|
|
161
|
|
162 //
|
|
163 //Print verbose report
|
|
164 if ( verbose_flag() ) {
|
|
165 fprintf(get_report_file(), "Quality cut-off: %d\n", min_quality);
|
|
166 fprintf(get_report_file(), "Minimum percentage: %d\n", min_percent);
|
|
167
|
|
168 fprintf(get_report_file(), "Input: %zu reads.\n", num_input_reads(&fastx) ) ;
|
|
169 fprintf(get_report_file(), "Output: %zu reads.\n", num_output_reads(&fastx) ) ;
|
|
170
|
|
171 size_t discarded = num_input_reads(&fastx) - num_output_reads(&fastx) ;
|
|
172 fprintf(get_report_file(), "discarded %zu (%zu%%) low-quality reads.\n",
|
|
173 discarded, (discarded*100)/( num_input_reads(&fastx) ) ) ;
|
|
174 }
|
|
175
|
|
176 return 0;
|
|
177 }
|