0
|
/* The MIT License

   Copyright (c) 2008 Genome Research Ltd (GRL).

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/
|
|
25
|
|
/* Contact: Heng Li <lh3@sanger.ac.uk> */

/*
  2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
 */

/* Last Modified: 12APR2009 */
|
|
33
|
|
34 #ifndef AC_KSEQ_H
|
|
35 #define AC_KSEQ_H
|
|
36
|
|
37 #include <ctype.h>
|
|
38 #include <string.h>
|
|
39 #include <stdlib.h>
|
|
40
|
|
/*
 * Separator classes accepted by ks_getuntil() in place of a literal
 * delimiter character. Any delimiter value greater than KS_SEP_MAX is
 * treated as a single literal byte to stop at.
 */
#define KS_SEP_SPACE 0 /* any isspace() byte: ' ', \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* any isspace() byte except ' ' */
#define KS_SEP_MAX   1 /* largest separator-class value */
|
|
44
|
|
/*
 * __KS_TYPE(type_t): declare the buffered-stream type `kstream_t` for an
 * underlying handle of type `type_t`.
 *
 *   buf    - read buffer filled by the stream's read function
 *   begin  - index in buf of the next unread byte
 *   end    - index one past the last valid byte in buf
 *   is_eof - non-zero once the reader has reported the end of input
 *   f      - caller-supplied handle handed to the read function
 */
#define __KS_TYPE(type_t) \
    typedef struct __kstream_t { \
        unsigned char *buf; \
        int begin; \
        int end; \
        int is_eof; \
        type_t f; \
    } kstream_t;
|
|
51
|
|
/* True when the reader has reported end of input AND the buffer is drained. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/*
 * Reset the buffering state so the next read refills the buffer.
 * Only the stream bookkeeping is cleared; the underlying handle is not
 * touched, so the caller must reposition it separately.
 * The expression evaluates to 0.
 */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
|
|
54
|
|
/*
 * __KS_BASIC(type_t, __bufsize): define the stream constructor/destructor.
 *
 * ks_init(f)
 *   Allocates a zero-initialised kstream_t wrapping handle `f` together
 *   with a read buffer of `__bufsize` bytes.
 *   Returns the new stream, or NULL if either allocation fails (nothing is
 *   leaked in that case). The handle `f` is stored, never opened or closed.
 *
 * ks_destroy(ks)
 *   Frees the buffer and the stream object. Accepts NULL. The wrapped
 *   handle is left for the caller to close.
 */
#define __KS_BASIC(type_t, __bufsize) \
    static inline kstream_t *ks_init(type_t f) \
    { \
        kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
        if (ks == 0) return 0; \
        ks->f = f; \
        /* explicit cast keeps the header usable from C++ as well */ \
        ks->buf = (unsigned char*)malloc(__bufsize); \
        if (ks->buf == 0) { \
            free(ks); \
            return 0; \
        } \
        return ks; \
    } \
    static inline void ks_destroy(kstream_t *ks) \
    { \
        if (ks == 0) return; \
        free(ks->buf); \
        free(ks); \
    }
|
|
70
|
|
/*
 * __KS_GETC(__read, __bufsize): define ks_getc().
 *
 * ks_getc(ks)
 *   Returns the next byte of the stream as a value in 0..255, refilling the
 *   buffer through `__read(ks->f, ks->buf, __bufsize)` when it is drained.
 *   Returns -1 once the input is exhausted.
 *
 *   End of input is declared only when `__read` returns 0 (or a negative
 *   value, i.e. a read error). A short read is NOT treated as end of input:
 *   readers backed by pipes or sockets may legitimately return fewer bytes
 *   than requested while more data is still to come. A negative return is
 *   normalised to an empty buffer so that no stale byte is ever handed out.
 */
#define __KS_GETC(__read, __bufsize) \
    static inline int ks_getc(kstream_t *ks) \
    { \
        if (ks->begin >= ks->end) { \
            if (ks->is_eof) return -1; \
            ks->begin = 0; \
            ks->end = (int)__read(ks->f, ks->buf, __bufsize); \
            if (ks->end <= 0) { /* 0: end of input, <0: read error */ \
                ks->end = 0; \
                ks->is_eof = 1; \
                return -1; \
            } \
        } \
        return (int)ks->buf[ks->begin++]; \
    }
|
|
83
|
|
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/*
 * Growable string buffer.
 *   l - number of bytes currently stored (excluding the terminating NUL)
 *   m - number of bytes allocated for s
 *   s - heap buffer, or NULL while nothing has been allocated
 * The KSTRING_T guard lets other headers that declare the same type
 * coexist with this one.
 */
typedef struct __kstring_t {
    size_t l;
    size_t m;
    char *s;
} kstring_t;
#endif
|
|
91
|
|
#ifndef kroundup32
/*
 * Round the unsigned lvalue `x` up, in place, to the next power of two
 * (values that already are a power of two are left unchanged).
 * `x` is evaluated several times, so it must be a side-effect-free lvalue.
 * The bit-smearing covers 32 bits; for wider operands above 2^32 the result
 * is still >= the input but is not necessarily a power of two.
 */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
|
|
95
|
|
/*
 * __KS_GETUNTIL(__read, __bufsize): define ks_getuntil().
 *
 * ks_getuntil(ks, delimiter, str, dret)
 *   Reads bytes from `ks` into `str` (overwriting its previous contents and
 *   growing its buffer as needed) up to, but not including, the next
 *   delimiter. The delimiter itself is consumed.
 *
 *   delimiter - KS_SEP_SPACE, KS_SEP_TAB, or a literal byte value greater
 *               than KS_SEP_MAX. Any other value matches nothing, so the
 *               rest of the stream is read.
 *   str       - destination; always NUL-terminated on a non-negative return.
 *   dret      - if non-NULL, receives the delimiter byte that stopped the
 *               read, or 0 if the input ended first.
 *
 *   Returns the number of bytes stored in `str` (possibly 0 for an empty
 *   field), or -1 if the stream was already exhausted and nothing could be
 *   read, or if memory could not be allocated.
 *
 *   End of input is declared only when `__read` returns 0 or a negative
 *   value; a short read is not taken as end of input.
 */
#define __KS_GETUNTIL(__read, __bufsize) \
    static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
    { \
        int got_any = 0; /* set once at least one buffer chunk was scanned */ \
        if (dret) *dret = 0; \
        str->l = 0; \
        for (;;) { \
            int i; \
            size_t chunk, need; \
            if (ks->begin >= ks->end) { \
                if (ks->is_eof) break; \
                ks->begin = 0; \
                ks->end = (int)__read(ks->f, ks->buf, __bufsize); \
                if (ks->end <= 0) { /* 0: end of input, <0: read error */ \
                    ks->end = 0; \
                    ks->is_eof = 1; \
                    break; \
                } \
            } \
            /* locate the first delimiter in the unread part of the buffer */ \
            if (delimiter > KS_SEP_MAX) { \
                for (i = ks->begin; i < ks->end; ++i) \
                    if (ks->buf[i] == delimiter) break; \
            } else if (delimiter == KS_SEP_SPACE) { \
                for (i = ks->begin; i < ks->end; ++i) \
                    if (isspace(ks->buf[i])) break; \
            } else if (delimiter == KS_SEP_TAB) { \
                for (i = ks->begin; i < ks->end; ++i) \
                    if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
            } else { \
                i = ks->end; /* unknown separator class: nothing matches */ \
            } \
            chunk = (size_t)(i - ks->begin); \
            need = chunk + 1; /* +1 keeps room for the terminating NUL */ \
            if (str->m - str->l < need) { \
                size_t new_m = str->l + need; \
                char *grown; \
                kroundup32(new_m); \
                grown = (char*)realloc(str->s, new_m); \
                if (grown == 0) return -1; /* str->s is still valid */ \
                str->s = grown; \
                str->m = new_m; \
            } \
            got_any = 1; \
            memcpy(str->s + str->l, ks->buf + ks->begin, chunk); \
            str->l += chunk; \
            ks->begin = i + 1; /* step over the delimiter (or past end) */ \
            if (i < ks->end) { \
                if (dret) *dret = ks->buf[i]; \
                break; \
            } \
        } \
        if (!got_any) return -1; /* stream exhausted before any data */ \
        if (str->s == 0) { \
            /* allocate only when there is no buffer yet, so an existing */ \
            /* buffer is never leaked by an empty field */ \
            str->s = (char*)calloc(1, 1); \
            if (str->s == 0) return -1; \
            str->m = 1; \
        } \
        str->s[str->l] = '\0'; \
        return (int)str->l; \
    }
|
|
142
|
|
/*
 * KSTREAM_INIT(type_t, __read, __bufsize): instantiate the whole buffered
 * stream layer for one handle type.
 *
 *   type_t    - type of the underlying handle stored in kstream_t
 *   __read    - function or macro called as __read(handle, buf, __bufsize)
 *               whose result is the number of bytes placed in buf
 *   __bufsize - size in bytes of the internal read buffer
 *
 * Expands to the kstream_t type plus ks_init, ks_destroy, ks_getc and
 * ks_getuntil. Use once per translation unit.
 */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
    __KS_TYPE(type_t) \
    __KS_BASIC(type_t, __bufsize) \
    __KS_GETC(__read, __bufsize) \
    __KS_GETUNTIL(__read, __bufsize)
|
|
148
|
|
/*
 * __KSEQ_BASIC(type_t): define the kseq_t lifecycle helpers.
 *
 * kseq_init(fd)
 *   Allocates a zeroed kseq_t reading from handle `fd` through a new
 *   stream created with ks_init(). Returns NULL if any allocation fails.
 *
 * kseq_rewind(ks)
 *   Clears the parser and buffer state (last_char, is_eof, begin, end).
 *   The underlying handle is not repositioned here.
 *
 * kseq_destroy(ks)
 *   Frees the four string buffers, the stream and the kseq_t itself.
 *   Accepts NULL. The underlying handle is not closed.
 */
#define __KSEQ_BASIC(type_t) \
    static inline kseq_t *kseq_init(type_t fd) \
    { \
        kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
        if (s == 0) return 0; \
        s->f = ks_init(fd); \
        if (s->f == 0) { \
            free(s); \
            return 0; \
        } \
        return s; \
    } \
    static inline void kseq_rewind(kseq_t *ks) \
    { \
        ks->last_char = 0; \
        ks->f->is_eof = 0; \
        ks->f->begin = 0; \
        ks->f->end = 0; \
    } \
    static inline void kseq_destroy(kseq_t *ks) \
    { \
        if (ks == 0) return; \
        free(ks->name.s); \
        free(ks->comment.s); \
        free(ks->seq.s); \
        free(ks->qual.s); \
        ks_destroy(ks->f); \
        free(ks); \
    }
|
|
168
|
|
/*
 * __KSEQ_READ: define kseq_read(), which parses the next FASTA or FASTQ
 * record from seq->f into seq->name, seq->comment, seq->seq and seq->qual.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *   -3   out of memory
 *
 * On a non-negative return seq->seq.s is always a valid NUL-terminated
 * string, even for a record that carries no sequence characters.
 * seq->comment.l is 0 when the header line has no comment, and seq->qual.l
 * is 0 for FASTA records.
 */
#define __KSEQ_READ \
    static int kseq_read(kseq_t *seq) \
    { \
        int c; \
        kstream_t *ks = seq->f; \
        if (seq->last_char == 0) { /* skip ahead to the next header line */ \
            while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@') \
                ; \
            if (c == -1) return -1; /* end of file */ \
            seq->last_char = c; \
        } /* from here on the header marker has already been consumed */ \
        seq->comment.l = seq->seq.l = seq->qual.l = 0; \
        if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; \
        if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
        if (seq->seq.s == 0) { \
            /* guarantee a buffer so the terminator below is always valid, */ \
            /* even when the record has no sequence bytes at all */ \
            seq->seq.s = (char*)malloc(256); \
            if (seq->seq.s == 0) return -3; \
            seq->seq.m = 256; \
        } \
        while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
            if (!isgraph(c)) continue; /* keep printable non-space bytes only */ \
            if (seq->seq.l + 1 >= seq->seq.m) { /* need room for byte + NUL */ \
                size_t new_m = seq->seq.l + 2; \
                char *grown; \
                kroundup32(new_m); /* rounded to the next 2^k */ \
                grown = (char*)realloc(seq->seq.s, new_m); \
                if (grown == 0) return -3; \
                seq->seq.s = grown; \
                seq->seq.m = new_m; \
            } \
            seq->seq.s[seq->seq.l++] = (char)c; \
        } \
        if (c == '>' || c == '@') seq->last_char = c; /* next header marker already read */ \
        seq->seq.s[seq->seq.l] = 0; /* NUL-terminate */ \
        if (c != '+') return (int)seq->seq.l; /* FASTA */ \
        if (seq->qual.m < seq->seq.m) { /* quality needs as much room as sequence */ \
            char *grown = (char*)realloc(seq->qual.s, seq->seq.m); \
            if (grown == 0) return -3; \
            seq->qual.s = grown; \
            seq->qual.m = seq->seq.m; \
        } \
        while ((c = ks_getc(ks)) != -1 && c != '\n') \
            ; /* skip the rest of the '+' line */ \
        if (c == -1) return -2; /* input ended before any quality */ \
        /* quality may itself contain '@' or '>', so stop on length only */ \
        while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
            if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (char)c; \
        seq->qual.s[seq->qual.l] = 0; /* NUL-terminate */ \
        seq->last_char = 0; /* the next header marker has not been read yet */ \
        if (seq->seq.l != seq->qual.l) return -2; /* quality shorter than sequence */ \
        return (int)seq->seq.l; \
    }
|
|
213
|
|
/*
 * __KSEQ_TYPE(type_t): declare the record/parser type `kseq_t`.
 *
 *   name, comment, seq, qual - buffers holding the fields of the most
 *                              recently parsed record
 *   last_char - header marker ('>' or '@') already consumed for the next
 *               record, or 0 if none has been read yet
 *   f         - buffered stream the records are read from
 *
 * `type_t` is accepted for symmetry with the other templates and is not
 * used in the expansion.
 */
#define __KSEQ_TYPE(type_t) \
    typedef struct { \
        kstring_t name, comment, seq, qual; \
        int last_char; \
        kstream_t *f; \
    } kseq_t;
|
|
220
|
|
/*
 * KSEQ_INIT(type_t, __read): instantiate the complete FASTA/FASTQ reader
 * for one handle type, using a 4096-byte stream buffer.
 *
 *   type_t - type of the underlying handle
 *   __read - function or macro called as __read(handle, buf, size) whose
 *            result is the number of bytes placed in buf
 *
 * Expands to the stream layer (KSTREAM_INIT), the kseq_t type and
 * kseq_init, kseq_rewind, kseq_destroy and kseq_read.
 * Use once per translation unit.
 */
#define KSEQ_INIT(type_t, __read) \
    KSTREAM_INIT(type_t, __read, 4096) \
    __KSEQ_TYPE(type_t) \
    __KSEQ_BASIC(type_t) \
    __KSEQ_READ
|
|
226
|
|
227 #endif
|