annotate fastx_toolkit-0.0.6/src/fastx_artifacts_filter/fastx_artifacts_filter.c @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
997f5136985f Uploaded
xilinxu
parents:
diff changeset
1 /*
997f5136985f Uploaded
xilinxu
parents:
diff changeset
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
4
997f5136985f Uploaded
xilinxu
parents:
diff changeset
5 This program is free software: you can redistribute it and/or modify
997f5136985f Uploaded
xilinxu
parents:
diff changeset
6 it under the terms of the GNU Affero General Public License as
997f5136985f Uploaded
xilinxu
parents:
diff changeset
7 published by the Free Software Foundation, either version 3 of the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
8 License, or (at your option) any later version.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
9
997f5136985f Uploaded
xilinxu
parents:
diff changeset
10 This program is distributed in the hope that it will be useful,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
997f5136985f Uploaded
xilinxu
parents:
diff changeset
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
13 GNU Affero General Public License for more details.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
14
997f5136985f Uploaded
xilinxu
parents:
diff changeset
15 You should have received a copy of the GNU Affero General Public License
997f5136985f Uploaded
xilinxu
parents:
diff changeset
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
17 */
997f5136985f Uploaded
xilinxu
parents:
diff changeset
18 #include <limits.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
19 #include <stdio.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
20 #include <stdlib.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
21 #include <string.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
22 #include <getopt.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
23 #include <errno.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
24 #include <err.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
25
997f5136985f Uploaded
xilinxu
parents:
diff changeset
26 #include <config.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
27
997f5136985f Uploaded
xilinxu
parents:
diff changeset
28 #include "fastx.h"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
29 #include "fastx_args.h"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
30
997f5136985f Uploaded
xilinxu
parents:
diff changeset
31 #define MAX_ADAPTER_LEN 100
997f5136985f Uploaded
xilinxu
parents:
diff changeset
32
997f5136985f Uploaded
xilinxu
parents:
diff changeset
33 const char* usage=
997f5136985f Uploaded
xilinxu
parents:
diff changeset
34 "usage: fastx_artifacts_filter [-h] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
35 "\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
36 "version " VERSION "\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
37 " [-h] = This helpful help screen.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
38 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
39 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
40 " [-z] = Compress output with GZIP.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
41 " [-v] = Verbose - report number of processed reads.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
42 " If [-o] is specified, report will be printed to STDOUT.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
43 " If [-o] is not specified (and output goes to STDOUT),\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
44 " report will be printed to STDERR.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
45 "\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
46
997f5136985f Uploaded
xilinxu
parents:
diff changeset
47 #define DO_NOT_TRIM_LAST_BASE (0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
48
997f5136985f Uploaded
xilinxu
parents:
diff changeset
49 FASTX fastx;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
50
997f5136985f Uploaded
xilinxu
parents:
diff changeset
/*
 * Parse the program's command line.
 *
 * This tool defines no options of its own: it hands the shared parser an
 * empty extra-option string and no per-option callback, and passes the
 * shared parser's result straight back to the caller.
 */
int parse_commandline(int argc, char* argv[])
{
	const int parse_result = fastx_parse_cmdline(argc, argv, "", NULL);

	return parse_result;
}
997f5136985f Uploaded
xilinxu
parents:
diff changeset
55
997f5136985f Uploaded
xilinxu
parents:
diff changeset
56 int artifact_sequence(const FASTX *fastx)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
57 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
58 int n_count=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
59 int a_count=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
60 int c_count=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
61 int t_count=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
62 int g_count=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
63 int total_count=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
64
997f5136985f Uploaded
xilinxu
parents:
diff changeset
65 int max_allowed_different_bases = 3 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
66
997f5136985f Uploaded
xilinxu
parents:
diff changeset
67 int i=0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
68
997f5136985f Uploaded
xilinxu
parents:
diff changeset
69 while (1) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
70 if (fastx->nucleotides[i]==0)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
71 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
72
997f5136985f Uploaded
xilinxu
parents:
diff changeset
73 total_count++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
74 switch(fastx->nucleotides[i])
997f5136985f Uploaded
xilinxu
parents:
diff changeset
75 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
76 case 'A':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
77 a_count++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
78 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
79 case 'C':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
80 c_count++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
81 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
82 case 'G':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
83 g_count++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
84 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
85 case 'T':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
86 t_count++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
87 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
88 case 'N':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
89 n_count++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
90 break;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
91 default:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
92 errx(1, __FILE__":%d: invalid nucleotide value (%c) at position %d",
997f5136985f Uploaded
xilinxu
parents:
diff changeset
93 __LINE__, fastx->nucleotides[i], i ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
94 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
95 i++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
96 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
97
997f5136985f Uploaded
xilinxu
parents:
diff changeset
98 //Rules for artifacts
997f5136985f Uploaded
xilinxu
parents:
diff changeset
99
997f5136985f Uploaded
xilinxu
parents:
diff changeset
100 if ( a_count>=(total_count-max_allowed_different_bases)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
101 ||
997f5136985f Uploaded
xilinxu
parents:
diff changeset
102 c_count>=(total_count-max_allowed_different_bases)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
103 ||
997f5136985f Uploaded
xilinxu
parents:
diff changeset
104 g_count>=(total_count-max_allowed_different_bases)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
105 ||
997f5136985f Uploaded
xilinxu
parents:
diff changeset
106 t_count>=(total_count-max_allowed_different_bases)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
107 )
997f5136985f Uploaded
xilinxu
parents:
diff changeset
108 return 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
109
997f5136985f Uploaded
xilinxu
parents:
diff changeset
110
997f5136985f Uploaded
xilinxu
parents:
diff changeset
111 return 0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
112 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
113
997f5136985f Uploaded
xilinxu
parents:
diff changeset
114 int main(int argc, char* argv[])
997f5136985f Uploaded
xilinxu
parents:
diff changeset
115 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
116 parse_commandline(argc, argv);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
117
997f5136985f Uploaded
xilinxu
parents:
diff changeset
118 fastx_init_reader(&fastx, get_input_filename(),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
119 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
120
997f5136985f Uploaded
xilinxu
parents:
diff changeset
121 fastx_init_writer(&fastx, get_output_filename(),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
122 OUTPUT_SAME_AS_INPUT, compress_output_flag());
997f5136985f Uploaded
xilinxu
parents:
diff changeset
123
997f5136985f Uploaded
xilinxu
parents:
diff changeset
124 while ( fastx_read_next_record(&fastx) ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
125
997f5136985f Uploaded
xilinxu
parents:
diff changeset
126 if ( artifact_sequence(&fastx) ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
127 } else {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
128 fastx_write_record(&fastx);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
129 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
130 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
131
997f5136985f Uploaded
xilinxu
parents:
diff changeset
132 //Print verbose report
997f5136985f Uploaded
xilinxu
parents:
diff changeset
133 if ( verbose_flag() ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
134 fprintf(get_report_file(), "Input: %zu reads.\n", num_input_reads(&fastx) ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
135 fprintf(get_report_file(), "Output: %zu reads.\n", num_output_reads(&fastx) ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
136
997f5136985f Uploaded
xilinxu
parents:
diff changeset
137 size_t discarded = num_input_reads(&fastx) - num_output_reads(&fastx) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
138 fprintf(get_report_file(), "discarded %zu (%zu%%) artifact reads.\n",
997f5136985f Uploaded
xilinxu
parents:
diff changeset
139 discarded, (discarded*100)/( num_input_reads(&fastx) ) ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
140 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
141
997f5136985f Uploaded
xilinxu
parents:
diff changeset
142 return 0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
143 }