/*
    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
|
|
18 #ifndef __FASTX_HEADER__
|
|
19 #define __FASTX_HEADER__
|
|
20
|
|
21 #ifdef __cplusplus
|
|
22 extern "C" {
|
|
23 #endif
|
|
24
|
|
#include <limits.h>   /* may already provide PATH_MAX */
#include <stddef.h>   /* size_t */
#include <stdio.h>    /* FILE */

#ifndef PATH_MAX
#include <linux/limits.h>
#endif
|
|
28
|
|
/* Lower and upper limits used for numeric quality values. */
#define MIN_QUALITY_VALUE     (-50)
#define MAX_QUALITY_VALUE     50

/* Distance between the two limits above (upper minus lower). */
#define QUALITY_VALUES_RANGE  (MAX_QUALITY_VALUE - MIN_QUALITY_VALUE)
|
|
32
|
|
33
|
|
/*
 * Capacity (in characters) used to size the fixed record buffers declared
 * in this header. A definition supplied before this point takes precedence.
 */
#if !defined(MAX_SEQ_LINE_LENGTH)
#  define MAX_SEQ_LINE_LENGTH (25000)
#endif
|
|
37
|
|
/* Which input file formats a reader is permitted to accept. */
typedef enum {
	FASTA_ONLY     = 0,	/* accept FASTA input only */
	FASTA_OR_FASTQ = 1,	/* accept either FASTA or FASTQ input */
	FASTQ_ONLY     = 2	/* accept FASTQ input only */
} ALLOWED_INPUT_FILE_TYPES;
|
|
43
|
|
/* Whether the unknown base 'N' counts as a valid nucleotide in the input. */
typedef enum {
	DISALLOW_N = 0,	/* only A/G/C/T are valid */
	ALLOW_N    = 1	/* N is a valid nucleotide as well */
} ALLOWED_INPUT_UNKNOWN_BASES;
|
|
48
|
|
/* Letter-case policy for the input (per the enumerator names). */
typedef enum {
	REQUIRE_UPPERCASE = 0,
	ALLOW_LOWERCASE   = 1
} ALLOWED_INPUT_CASE;
|
|
53
|
|
/* Output format selector handed to fastx_init_writer(). */
typedef enum {
	OUTPUT_FASTA              = 0,	/* FASTA */
	OUTPUT_FASTQ_ASCII_QUAL   = 1,	/* FASTQ, ASCII quality scores */
	OUTPUT_FASTQ_NUMERIC_QUAL = 2,	/* FASTQ, numeric quality scores */
	OUTPUT_SAME_AS_INPUT      = 3	/* same format as the input */
} OUTPUT_FILE_TYPE;
|
|
60
|
|
61 #pragma pack(1)
|
|
62 typedef struct
|
|
63 {
|
|
64 /* Record data - common for FASTA/FASTQ */
|
|
65 char input_sequence_id_prefix[1]; //DON'T touch this - this hack will read the entire name into the variable 'name',
|
|
66 //leaving the prefix ('>' or '@') in 'input_sequence_id_name'.
|
|
67 char name[MAX_SEQ_LINE_LENGTH+1];
|
|
68 char nucleotides[MAX_SEQ_LINE_LENGTH+1];
|
|
69 /* Record data - only for FASTQ */
|
|
70 char input_name2_prefix[1]; //same hack as 'input_sequence_id_prefix'
|
|
71 char name2[MAX_SEQ_LINE_LENGTH+1];
|
|
72 int quality[MAX_SEQ_LINE_LENGTH+1]; //note: this is NOT ascii values, but numerical values
|
|
73 // numeric quality scores and ASCII quality scores
|
|
74 // are automatically converted to numbers (-15 to 40)
|
|
75
|
|
76 /* Configuration */
|
|
77 int allow_input_filetype; // 0 = Allow only FASTA
|
|
78 int allow_N; // 1 = N is valid nucleotide, 0 = only A/G/C/T are valid
|
|
79 int allow_lowercase;
|
|
80 int read_fastq; // 1 = Input is FASTQ (only if allow_input_fastq==1)
|
|
81 int read_fastq_ascii; // 1 = Input is FASTQ with ASCII quality scores (0 = with numeric quality scores)
|
|
82 int write_fastq; // 0 = Write only FASTA (regardless of input type)
|
|
83 int write_fastq_ascii; // 1 = Write ASCII quality scores, 0 = write numeric quality scores
|
|
84 int compress_output; // 1 = pass output through GZIP
|
|
85
|
|
86 int copy_input_fastq_format_to_output ; // 1 = copy 'read_fastq_ascii' to 'write_fastq_ascii'
|
|
87 // so that the output format is the same as the input
|
|
88
|
|
89
|
|
90 /* Internal data */
|
|
91 int allowed_nucleotides[256]; //quick lookup table for valid input
|
|
92 char output_sequence_id_prefix; // '>' or '@', depending on the requested output type
|
|
93
|
|
94 char input_file_name[PATH_MAX]; //in linux, PATH_MAX is defined in <linux/limits.h>
|
|
95 unsigned long long input_line_number;
|
|
96 char output_file_name[PATH_MAX]; //in linux, PATH_MAX is defined in <linux/limits.h>
|
|
97
|
|
98 size_t num_input_sequences;
|
|
99 size_t num_output_sequences;
|
|
100 size_t num_input_reads;
|
|
101 size_t num_output_reads;
|
|
102
|
|
103 FILE* input;
|
|
104 FILE* output;
|
|
105 } FASTX ;
|
|
106
|
|
107
|
|
108 void fastx_init_reader(FASTX *pFASTX, const char* filename,
|
|
109 ALLOWED_INPUT_FILE_TYPES allowed_input_filetype,
|
|
110 ALLOWED_INPUT_UNKNOWN_BASES allow_N,
|
|
111 ALLOWED_INPUT_CASE allow_lowercase);
|
|
112
|
|
113 // If the sequence identifier is collapsed (= "N-N") returns the reads_count,
|
|
114 // otherwise, returns 1
|
|
115 int get_reads_count(const FASTX *pFASTX);
|
|
116
|
|
117 void fastx_init_writer(FASTX *pFASTX,
|
|
118 const char* filename,
|
|
119 OUTPUT_FILE_TYPE output_type,
|
|
120 int compress_output);
|
|
121
|
|
122 int fastx_read_next_record(FASTX *pFASTX);
|
|
123
|
|
124 void fastx_write_record(FASTX *pFASTX);
|
|
125
|
|
126 size_t num_input_sequences(const FASTX *pFASTX);
|
|
127 size_t num_input_reads(const FASTX *pFASTX);
|
|
128 size_t num_output_sequences(const FASTX *pFASTX);
|
|
129 size_t num_output_reads(const FASTX *pFASTX);
|
|
130
|
|
131 #ifdef __cplusplus
|
|
132 }
|
|
133 #endif
|
|
134
|
|
135 #endif
|
|
136
|