comparison fastx_toolkit-0.0.6/src/libfastx/fastx.h @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 /*
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Affero General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Affero General Public License for more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 #ifndef __FASTX_HEADER__
19 #define __FASTX_HEADER__
20
21 #ifdef __cplusplus
22 extern "C" {
23 #endif
24
25 #ifndef PATH_MAX
26 #include <linux/limits.h>
27 #endif
28
/* Lower and upper limits used for quality values (presumably the numeric
   scores stored in FASTX.quality - confirm against the implementation). */
29 #define MIN_QUALITY_VALUE (-50)
30 #define MAX_QUALITY_VALUE 50
/* Distance between the two limits: 50 - (-50) = 100.
   NOTE(review): this is one less than the number of distinct integers in
   [MIN_QUALITY_VALUE, MAX_QUALITY_VALUE] (101) - confirm how users of this
   macro size their tables. */
31 #define QUALITY_VALUES_RANGE (MAX_QUALITY_VALUE-MIN_QUALITY_VALUE)
32
33
/* Capacity basis for the per-record buffers in FASTX (name, nucleotides,
   name2 and quality are each MAX_SEQ_LINE_LENGTH+1 elements long).
   The #ifndef guard keeps any definition made before this point, so the
   default of 25000 applies only when the macro is not already defined. */
34 #ifndef MAX_SEQ_LINE_LENGTH
35 #define MAX_SEQ_LINE_LENGTH (25000)
36 #endif
37
/* Input formats a reader may accept. This is the type of the
   'allowed_input_filetype' argument of fastx_init_reader(). */
38 typedef enum {
39 FASTA_ONLY=0,
40 FASTA_OR_FASTQ=1,
41 FASTQ_ONLY=2
42 } ALLOWED_INPUT_FILE_TYPES;
43
/* Whether 'N' is accepted in the input. This is the type of the 'allow_N'
   argument of fastx_init_reader(); the FASTX.allow_N member documents the
   same 1/0 meaning (1 = N is a valid nucleotide, 0 = only A/G/C/T). */
44 typedef enum {
45 DISALLOW_N=0,
46 ALLOW_N=1
47 } ALLOWED_INPUT_UNKNOWN_BASES;
48
/* Letter-case policy for the input. This is the type of the
   'allow_lowercase' argument of fastx_init_reader(). */
49 typedef enum {
50 REQUIRE_UPPERCASE=0,
51 ALLOW_LOWERCASE=1
52 } ALLOWED_INPUT_CASE;
53
/* Output formats a writer can be asked for. This is the type of the
   'output_type' argument of fastx_init_writer().
   NOTE(review): OUTPUT_SAME_AS_INPUT presumably corresponds to the
   FASTX.copy_input_fastq_format_to_output setting - confirm in the
   implementation. */
54 typedef enum {
55 OUTPUT_FASTA=0,
56 OUTPUT_FASTQ_ASCII_QUAL=1,
57 OUTPUT_FASTQ_NUMERIC_QUAL=2,
58 OUTPUT_SAME_AS_INPUT=3
59 } OUTPUT_FILE_TYPE;
60
/*
 * FASTX - per-stream state shared by the reader and writer functions declared
 * in this header: the current record, the input/output configuration, lookup
 * tables, file names, counters and the two FILE handles.
 *
 * The struct is packed to 1 byte. The packing is pushed here and popped right
 * after the struct, so it applies to FASTX only; a bare '#pragma pack(1)'
 * would stay in effect for every declaration that follows this header in the
 * including translation unit.
 *
 * NOTE(review): FILE and size_t are used below, but this header shows no
 * include of <stdio.h>/<stddef.h> - includers presumably pull those in first.
 */
61 #pragma pack(push, 1)
62 typedef struct
63 {
64 /* Record data - common for FASTA/FASTQ */
65 char input_sequence_id_prefix[1]; //DON'T touch this - this hack will read the entire name into the variable 'name',
66 //leaving the prefix ('>' or '@') in 'input_sequence_id_prefix'.
// (the one-byte prefix member is declared directly before 'name'; keep that order)
67 char name[MAX_SEQ_LINE_LENGTH+1];
68 char nucleotides[MAX_SEQ_LINE_LENGTH+1];
69 /* Record data - only for FASTQ */
70 char input_name2_prefix[1]; //same hack as 'input_sequence_id_prefix'
71 char name2[MAX_SEQ_LINE_LENGTH+1];
72 int quality[MAX_SEQ_LINE_LENGTH+1]; //note: this is NOT ascii values, but numerical values
73 // numeric quality scores and ASCII quality scores
74 // are automatically converted to numbers (-15 to 40)
// NOTE(review): the range quoted above differs from MIN_QUALITY_VALUE/MAX_QUALITY_VALUE (-50..50) - confirm which one applies
75
76 /* Configuration */
77 int allow_input_filetype; // 0 = Allow only FASTA
78 int allow_N; // 1 = N is valid nucleotide, 0 = only A/G/C/T are valid
79 int allow_lowercase;
80 int read_fastq; // 1 = Input is FASTQ (only if allow_input_fastq==1)
// NOTE(review): there is no 'allow_input_fastq' member; presumably this means "only if allow_input_filetype permits FASTQ" - confirm
81 int read_fastq_ascii; // 1 = Input is FASTQ with ASCII quality scores (0 = with numeric quality scores)
82 int write_fastq; // 0 = Write only FASTA (regardless of input type)
83 int write_fastq_ascii; // 1 = Write ASCII quality scores, 0 = write numeric quality scores
84 int compress_output; // 1 = pass output through GZIP
85
86 int copy_input_fastq_format_to_output ; // 1 = copy 'read_fastq_ascii' to 'write_fastq_ascii'
87 // so that the output format is the same as the input
88
89
90 /* Internal data */
91 int allowed_nucleotides[256]; //quick lookup table for valid input
92 char output_sequence_id_prefix; // '>' or '@', depending on the requested output type
93
94 char input_file_name[PATH_MAX]; //in linux, PATH_MAX is defined in <linux/limits.h>
95 unsigned long long input_line_number;
96 char output_file_name[PATH_MAX]; //in linux, PATH_MAX is defined in <linux/limits.h>
97
98 size_t num_input_sequences;
99 size_t num_output_sequences;
100 size_t num_input_reads;
101 size_t num_output_reads;
102
103 FILE* input;
104 FILE* output;
105 } FASTX ;
#pragma pack(pop)
106
107
/* Sets up *pFASTX for reading (declaration only; the implementation is not in
   this header).
     pFASTX                  - the FASTX state to initialize
     filename                - name of the input; presumably stored in
                               FASTX.input_file_name - confirm
     allowed_input_filetype  - which of FASTA/FASTQ are accepted
     allow_N                 - whether 'N' is accepted
     allow_lowercase         - whether lowercase input is accepted
   Returns nothing; how failures are reported is not visible here. */
108 void fastx_init_reader(FASTX *pFASTX, const char* filename,
109 ALLOWED_INPUT_FILE_TYPES allowed_input_filetype,
110 ALLOWED_INPUT_UNKNOWN_BASES allow_N,
111 ALLOWED_INPUT_CASE allow_lowercase);
112
113 // If the sequence identifier is collapsed (= "N-N") returns the reads_count,
114 // otherwise, returns 1
// pFASTX is taken as a pointer to const, i.e. the declaration promises not to modify it.
// NOTE(review): which part of an "N-N" identifier holds the reads count is not stated here - check the implementation.
115 int get_reads_count(const FASTX *pFASTX);
116
/* Sets up *pFASTX for writing (declaration only; the implementation is not in
   this header).
     pFASTX          - the FASTX state to initialize
     filename        - name of the output; presumably stored in
                       FASTX.output_file_name - confirm
     output_type     - requested output format
     compress_output - the FASTX.compress_output member documents
                       "1 = pass output through GZIP"; presumably this
                       argument sets it - confirm
   Returns nothing; how failures are reported is not visible here. */
117 void fastx_init_writer(FASTX *pFASTX,
118 const char* filename,
119 OUTPUT_FILE_TYPE output_type,
120 int compress_output);
121
/* Reads the next record into *pFASTX (presumably filling the name /
   nucleotides / quality members - confirm).
   NOTE(review): the meaning of the int return value is not documented in
   this header - check the implementation before relying on it. */
122 int fastx_read_next_record(FASTX *pFASTX);
123
/* Writes the record currently held in *pFASTX (presumably to FASTX.output, in
   the format chosen through fastx_init_writer() - confirm). Returns nothing. */
124 void fastx_write_record(FASTX *pFASTX);
125
/* Counter accessors. Each takes the FASTX state as a pointer to const and
   returns a size_t; presumably each returns the FASTX member of the same
   name - confirm in the implementation.
   NOTE(review): how "sequences" and "reads" differ is not defined in this
   header; get_reads_count() suggests a collapsed sequence can stand for
   several reads. */
126 size_t num_input_sequences(const FASTX *pFASTX);
127 size_t num_input_reads(const FASTX *pFASTX);
128 size_t num_output_sequences(const FASTX *pFASTX);
129 size_t num_output_reads(const FASTX *pFASTX);
130
131 #ifdef __cplusplus
132 }
133 #endif
134
135 #endif
136