Mercurial > repos > xilinxu > xilinxu
view fastx_toolkit-0.0.6/src/fastq_quality_filter/fastq_quality_filter.c @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line source
/* FASTX-toolkit - FASTA/FASTQ preprocessing tools. Copyright (C) 2009 A. Gordon (gordon@cshl.edu) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <getopt.h> #include <errno.h> #include <err.h> #include <config.h> #include "fastx.h" #include "fastx_args.h" #define MAX_ADAPTER_LEN 100 const char* usage= "usage: fastq_quality_filter [-h] [-v] [-q N] [-p N] [-z] [-i INFILE] [-o OUTFILE]\n" \ "\n" \ "version " VERSION "\n" \ " [-h] = This helpful help screen.\n" \ " [-q N] = Minimum quality score to keep.\n" \ " [-p N] = Minimum percent of bases that must have [-q] quality.\n" \ " [-z] = Compress output with GZIP.\n" \ " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \ " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \ " [-v] = Verbose - report number of sequences.\n" \ " If [-o] is specified, report will be printed to STDOUT.\n" \ " If [-o] is not specified (and output goes to STDOUT),\n" \ " report will be printed to STDERR.\n" \ "\n"; #define DO_NOT_TRIM_LAST_BASE (0) int min_quality=0; int min_percent=0; FASTX fastx; int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg) { switch(optc) { case 'q': if (optarg==NULL) errx(1, "[-q] parameter requires an argument value"); min_quality = strtoul(optarg,NULL,10); break; case 'p': if (optarg==NULL) errx(1, "[-l] parameter requires an argument value"); min_percent = strtoul(optarg,NULL,10); if (min_percent<=0 || min_percent>100) errx(1,"Invalid percent value (-p %s)", optarg); break; default: errx(1, __FILE__ ":%d: Unknown argument (%c)", __LINE__, optc ) ; } return 1; } int get_index_of_nth_element(int *array, int array_size, int n) { int pos; //Find the first nono-empty index pos = 0 ; while ( pos < array_size && array[pos]==0 ) pos++; #if 0 fprintf(stderr,"n=%d\n", n); for (i=0; i< array_size; i++) { if (array[i] != 0) fprintf(stderr, "[%d]=%d ", i + MIN_QUALITY_VALUE, array[i]) ; } fprintf(stderr,"\n"); #endif if (pos == array_size) errx(1,"bug: got empty array at %s:%d", __FILE__, __LINE__); while (n > 0) { if (array[pos] > n) break; n -= array[pos]; pos++; while (array[pos]==0 && pos < array_size) pos++; } return pos; } int get_percentile_quality(const FASTX *fastx, int percentile) { size_t i; int count=0; int quality_values[QUALITY_VALUES_RANGE]; memset(quality_values, 0, sizeof(quality_values)); for (i=0; i< strlen(fastx->nucleotides); i++) { count++; quality_values[ fastx->quality[i] - MIN_QUALITY_VALUE ] ++ ; } i = get_index_of_nth_element(quality_values, QUALITY_VALUES_RANGE, (count * (100-percentile) / 100)); //printf(" n = %d, i = %d, i+MIN_QUAL_VALUE=%d\n", // (count*(100-percentile)/100), i, i+MIN_QUALITY_VALUE) ; return i + MIN_QUALITY_VALUE ; } int main(int argc, char* argv[]) { fastx_parse_cmdline(argc, argv, "q:p:", parse_program_args); fastx_init_reader(&fastx, get_input_filename(), FASTQ_ONLY, ALLOW_N, REQUIRE_UPPERCASE); fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag()); while ( fastx_read_next_record(&fastx) ) { #if 0 fprintf(stderr, "%s\n", fastx.nucleotides ) ; for (i=0; i<strlen(fastx.nucleotides); i++) { fprintf(stderr,"%d ", fastx.quality[i]); } fprintf(stderr,"\n"); #endif int value = get_percentile_quality(&fastx, min_percent); //fprintf(stderr, "value = %d\n\n", value ) ; if (value >= min_quality) { fastx_write_record(&fastx); } else { // fprintf(stderr, "%s\n", fastx.nucleotides ) ; // fprintf(stderr, "value = %d\n", value ) ; } } // //Print verbose report if ( verbose_flag() ) { fprintf(get_report_file(), "Quality cut-off: %d\n", min_quality); fprintf(get_report_file(), "Minimum percentage: %d\n", min_percent); fprintf(get_report_file(), "Input: %zu reads.\n", num_input_reads(&fastx) ) ; fprintf(get_report_file(), "Output: %zu reads.\n", num_output_reads(&fastx) ) ; size_t discarded = num_input_reads(&fastx) - num_output_reads(&fastx) ; fprintf(get_report_file(), "discarded %zu (%zu%%) low-quality reads.\n", discarded, (discarded*100)/( num_input_reads(&fastx) ) ) ; } return 0; }