Mercurial > repos > xilinxu > xilinxu
view fastx_toolkit-0.0.6/src/fastx_artifacts_filter/fastx_artifacts_filter.c @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line source
/* FASTX-toolkit - FASTA/FASTQ preprocessing tools. Copyright (C) 2009 A. Gordon (gordon@cshl.edu) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <getopt.h> #include <errno.h> #include <err.h> #include <config.h> #include "fastx.h" #include "fastx_args.h" #define MAX_ADAPTER_LEN 100 const char* usage= "usage: fastx_artifacts_filter [-h] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \ "\n" \ "version " VERSION "\n" \ " [-h] = This helpful help screen.\n" \ " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \ " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \ " [-z] = Compress output with GZIP.\n" \ " [-v] = Verbose - report number of processed reads.\n" \ " If [-o] is specified, report will be printed to STDOUT.\n" \ " If [-o] is not specified (and output goes to STDOUT),\n" \ " report will be printed to STDERR.\n" \ "\n"; #define DO_NOT_TRIM_LAST_BASE (0) FASTX fastx; int parse_commandline(int argc, char* argv[]) { return fastx_parse_cmdline(argc, argv, "", NULL); } int artifact_sequence(const FASTX *fastx) { int n_count=0; int a_count=0; int c_count=0; int t_count=0; int g_count=0; int total_count=0; int max_allowed_different_bases = 3 ; int i=0; while (1) { if (fastx->nucleotides[i]==0) break; total_count++; switch(fastx->nucleotides[i]) { case 'A': a_count++; break; case 'C': c_count++; break; case 'G': g_count++; break; case 'T': t_count++; break; case 'N': n_count++; break; default: errx(1, __FILE__":%d: invalid nucleotide value (%c) at position %d", __LINE__, fastx->nucleotides[i], i ) ; } i++; } //Rules for artifacts if ( a_count>=(total_count-max_allowed_different_bases) || c_count>=(total_count-max_allowed_different_bases) || g_count>=(total_count-max_allowed_different_bases) || t_count>=(total_count-max_allowed_different_bases) ) return 1; return 0; } int main(int argc, char* argv[]) { parse_commandline(argc, argv); fastx_init_reader(&fastx, get_input_filename(), FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE); fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag()); while ( fastx_read_next_record(&fastx) ) { if ( artifact_sequence(&fastx) ) { } else { fastx_write_record(&fastx); } } //Print verbose report if ( verbose_flag() ) { fprintf(get_report_file(), "Input: %zu reads.\n", num_input_reads(&fastx) ) ; fprintf(get_report_file(), "Output: %zu reads.\n", num_output_reads(&fastx) ) ; size_t discarded = num_input_reads(&fastx) - num_output_reads(&fastx) ; fprintf(get_report_file(), "discarded %zu (%zu%%) artifact reads.\n", discarded, (discarded*100)/( num_input_reads(&fastx) ) ) ; } return 0; }