comparison fastx_toolkit-0.0.6/src/fastx_artifacts_filter/fastx_artifacts_filter.c @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 /*
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Affero General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Affero General Public License for more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 #include <limits.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <getopt.h>
23 #include <errno.h>
24 #include <err.h>
25
26 #include <config.h>
27
28 #include "fastx.h"
29 #include "fastx_args.h"
30
31 #define MAX_ADAPTER_LEN 100
32
33 const char* usage=
34 "usage: fastx_artifacts_filter [-h] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
35 "\n" \
36 "version " VERSION "\n" \
37 " [-h] = This helpful help screen.\n" \
38 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
39 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
40 " [-z] = Compress output with GZIP.\n" \
41 " [-v] = Verbose - report number of processed reads.\n" \
42 " If [-o] is specified, report will be printed to STDOUT.\n" \
43 " If [-o] is not specified (and output goes to STDOUT),\n" \
44 " report will be printed to STDERR.\n" \
45 "\n";
46
47 #define DO_NOT_TRIM_LAST_BASE (0)
48
49 FASTX fastx;
50
51 int parse_commandline(int argc, char* argv[])
52 {
53 return fastx_parse_cmdline(argc, argv, "", NULL);
54 }
55
56 int artifact_sequence(const FASTX *fastx)
57 {
58 int n_count=0;
59 int a_count=0;
60 int c_count=0;
61 int t_count=0;
62 int g_count=0;
63 int total_count=0;
64
65 int max_allowed_different_bases = 3 ;
66
67 int i=0;
68
69 while (1) {
70 if (fastx->nucleotides[i]==0)
71 break;
72
73 total_count++;
74 switch(fastx->nucleotides[i])
75 {
76 case 'A':
77 a_count++;
78 break;
79 case 'C':
80 c_count++;
81 break;
82 case 'G':
83 g_count++;
84 break;
85 case 'T':
86 t_count++;
87 break;
88 case 'N':
89 n_count++;
90 break;
91 default:
92 errx(1, __FILE__":%d: invalid nucleotide value (%c) at position %d",
93 __LINE__, fastx->nucleotides[i], i ) ;
94 }
95 i++;
96 }
97
98 //Rules for artifacts
99
100 if ( a_count>=(total_count-max_allowed_different_bases)
101 ||
102 c_count>=(total_count-max_allowed_different_bases)
103 ||
104 g_count>=(total_count-max_allowed_different_bases)
105 ||
106 t_count>=(total_count-max_allowed_different_bases)
107 )
108 return 1;
109
110
111 return 0;
112 }
113
114 int main(int argc, char* argv[])
115 {
116 parse_commandline(argc, argv);
117
118 fastx_init_reader(&fastx, get_input_filename(),
119 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
120
121 fastx_init_writer(&fastx, get_output_filename(),
122 OUTPUT_SAME_AS_INPUT, compress_output_flag());
123
124 while ( fastx_read_next_record(&fastx) ) {
125
126 if ( artifact_sequence(&fastx) ) {
127 } else {
128 fastx_write_record(&fastx);
129 }
130 }
131
132 //Print verbose report
133 if ( verbose_flag() ) {
134 fprintf(get_report_file(), "Input: %zu reads.\n", num_input_reads(&fastx) ) ;
135 fprintf(get_report_file(), "Output: %zu reads.\n", num_output_reads(&fastx) ) ;
136
137 size_t discarded = num_input_reads(&fastx) - num_output_reads(&fastx) ;
138 fprintf(get_report_file(), "discarded %zu (%zu%%) artifact reads.\n",
139 discarded, (discarded*100)/( num_input_reads(&fastx) ) ) ;
140 }
141
142 return 0;
143 }