comparison src/kseq.h @ 9:7939dd56c4b4 draft

Uploaded
author nikhil-joshi
date Sat, 14 Mar 2015 18:19:57 -0400
parents
children
comparison
equal deleted inserted replaced
8:3ef3eb63a297 9:7939dd56c4b4
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 /* Last Modified: 12APR2009 */
29
30 #ifndef AC_KSEQ_H
31 #define AC_KSEQ_H
32
33 #include <ctype.h>
34 #include <string.h>
35 #include <stdlib.h>
36
/*
 * Delimiter classes accepted by ks_getuntil() in place of a literal
 * delimiter character.  Any delimiter value greater than KS_SEP_MAX is
 * treated as a single literal character.
 */
#define KS_SEP_SPACE 0 /* any isspace() character: ' ', \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* any isspace() character except ' ' */
#define KS_SEP_MAX   1 /* largest value that names a delimiter class */
40
/*
 * Declare kstream_t, a buffered reader wrapped around a handle of type
 * `type_t`.
 *
 *   buf    - read buffer
 *   begin  - index in buf of the next unread byte
 *   end    - index one past the last valid byte in buf
 *   is_eof - non-zero once the reader has reported the end of its input
 *   f      - the caller's handle, handed to the read function on refill
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;
47
/* Non-zero once end-of-input was seen AND every buffered byte has been consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Clear the buffer bookkeeping (is_eof, begin, end); the handle itself is not touched. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
50
/*
 * Define the kstream_t constructor and destructor.
 *
 * ks_init(f)    - allocate a zeroed stream around handle `f` together with
 *                 a read buffer of `__bufsize` bytes.  Returns NULL when
 *                 either allocation fails (nothing is leaked).
 * ks_destroy(k) - free the buffer and the stream; NULL is accepted.  The
 *                 handle `f` is not closed.
 *
 * The casts on the allocator results are kept so the header also compiles
 * as C++.
 */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->buf = (char*)malloc(__bufsize); \
		if (ks->buf == NULL) { \
			free(ks); \
			return NULL; \
		} \
		ks->f = f; \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
66
/*
 * Define ks_getc(): return the next byte of the stream as a value in
 * 0..255, or -1 when no further input is available.
 *
 * `__read(f, buf, n)` must return the number of bytes stored in buf,
 * 0 at end of input, or a negative value on error.
 *
 * Notes on behaviour:
 *  - The byte is converted through unsigned char, so a 0xFF byte is
 *    returned as 255 and can no longer be mistaken for the -1 sentinel
 *    on platforms where plain char is signed.
 *  - Only a non-positive return from __read marks the stream as
 *    exhausted; a short read (fewer than __bufsize bytes) is ordinary
 *    data, as happens with pipes and terminals.
 *  - A read error is reported as -1 rather than by returning whatever
 *    byte happened to be in buf[0].
 */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end <= 0) { /* end of input or read error */ \
				ks->end = 0; \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		return (int)(unsigned char)ks->buf[ks->begin++]; \
	}
79
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/*
 * Growable character buffer.
 *   l - number of bytes currently in use
 *   m - number of bytes allocated for s
 *   s - heap buffer (NULL until first allocation)
 */
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
87
#ifndef kroundup32
/*
 * Round the lvalue x up, in place, to the next power of two (a value that
 * is already a power of two is left unchanged).  x is evaluated several
 * times, so it must be a side-effect-free lvalue.  Only shifts up to 16
 * bits are applied, so the result is only a power of two for values that
 * fit in 32 bits.
 */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
91
/*
 * Define ks_getuntil(): read bytes from `ks` into `str` up to, but not
 * including, the next delimiter, and consume that delimiter.
 *
 *   delimiter - KS_SEP_SPACE, KS_SEP_TAB, or a literal character whose
 *               value is greater than KS_SEP_MAX
 *   str       - destination; its previous content is discarded, the
 *               buffer is reused/grown and the result is NUL-terminated
 *   dret      - if non-NULL, receives the delimiter that ended the token,
 *               or 0 when the token ended at end of input
 *
 * Returns the token length (>= 0), -1 if no input was left to read, or
 * -3 if memory could not be allocated.
 *
 * Notes on behaviour:
 *  - Bytes are converted to unsigned char before being handed to
 *    isspace(), which is undefined for negative arguments.
 *  - Only a non-positive return from __read marks end of input; a short
 *    read is ordinary data.
 *  - End of input that is discovered by this very call still yields -1
 *    when no data at all was available, independent of how the input
 *    length relates to __bufsize.
 *  - An empty token reuses an existing str->s instead of replacing it
 *    with a fresh allocation (which leaked the old buffer).
 */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		int gotany = 0; \
		if (dret) *dret = 0; \
		str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { /* buffer drained: refill */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, __bufsize); \
				if (ks->end <= 0) { /* end of input or read error */ \
					ks->end = 0; \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			gotany = 1; \
			/* locate the delimiter (or the end of the buffered data) */ \
			if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if ((unsigned char)ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace((unsigned char)ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = ks->end; /* unknown delimiter class: nothing matches */ \
			/* make room for the chunk plus the terminating NUL */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				size_t cap = str->l + (size_t)(i - ks->begin) + 1; \
				char *tmp; \
				kroundup32(cap); \
				tmp = (char*)realloc(str->s, cap); \
				if (tmp == NULL) return -3; /* str->s is still valid */ \
				str->s = tmp; \
				str->m = cap; \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, (size_t)(i - ks->begin)); \
			str->l = str->l + (size_t)(i - ks->begin); \
			ks->begin = i + 1; /* step over the delimiter */ \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks->is_eof && ks->begin >= ks->end) return -1; \
		if (str->s == NULL) { /* nothing allocated yet: room for the NUL */ \
			str->s = (char*)calloc(1, 1); \
			if (str->s == NULL) return -3; \
			str->m = 1; \
		} \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	}
138
/*
 * Instantiate the whole kstream_t API for one handle type:
 *   type_t    - type of the underlying handle
 *   __read    - callable invoked as __read(handle, buf, nbytes)
 *   __bufsize - size in bytes of the internal read buffer
 * Expands to the kstream_t typedef plus ks_init, ks_destroy, ks_getc and
 * ks_getuntil.
 */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
144
/*
 * Define the kseq_t constructor, rewind helper and destructor.
 *
 * kseq_init(fd)   - allocate a zeroed record reader around handle `fd`.
 *                   Returns NULL if the reader or its stream cannot be
 *                   allocated.
 * kseq_rewind(ks) - forget the pending header character and clear the
 *                   stream's buffer bookkeeping.  The handle itself is not
 *                   repositioned here.
 * kseq_destroy(ks)- free the four string buffers, the stream and the
 *                   reader; NULL is accepted.  The handle is not closed.
 */
#define __KSEQ_BASIC(type_t) \
	static inline kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	static inline void kseq_rewind(kseq_t *ks) \
	{ \
		ks->last_char = 0; \
		ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
	} \
	static inline void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); \
		free(ks->comment.s); \
		free(ks->seq.s); \
		free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
164
/*
 * kseq_read(): parse the next FASTA ('>') or FASTQ ('@') record into
 * seq->name, seq->comment, seq->seq and seq->qual.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *   -3   out of memory (buffers already held by seq remain valid)
 *
 * Notes on behaviour:
 *  - Growing a buffer never overwrites the only pointer to it: realloc
 *    goes through a temporary and failure is reported as -3.
 *  - Bytes are converted to unsigned char before being handed to
 *    isgraph(), which is undefined for negative arguments.
 *  - The comment buffer is emptied for every record, so a record without
 *    a comment does not expose the previous record's comment text.
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		char *tmp; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* skip forward to the next header marker */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the header marker has been consumed */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (seq->comment.s) seq->comment.s[0] = '\0'; /* drop stale comment text */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		/* sequence lines run until the next record marker or the '+' line */ \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (isgraph((unsigned char)c)) { /* printable non-space character */ \
				if (seq->seq.l + 1 >= seq->seq.m) { /* grow to the next 2^k */ \
					size_t cap = seq->seq.l + 2; \
					kroundup32(cap); \
					tmp = (char*)realloc(seq->seq.s, cap); \
					if (tmp == NULL) return -3; \
					seq->seq.s = tmp; \
					seq->seq.m = cap; \
				} \
				seq->seq.s[seq->seq.l++] = (char)c; \
			} \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* next header marker already consumed */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* room for the terminating NUL */ \
			size_t cap = seq->seq.l + 2; \
			kroundup32(cap); \
			tmp = (char*)realloc(seq->seq.s, cap); \
			if (tmp == NULL) return -3; \
			seq->seq.s = tmp; \
			seq->seq.m = cap; \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return (int)seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* quality needs as much room as the sequence */ \
			tmp = (char*)realloc(seq->qual.s, seq->seq.m); \
			if (tmp == NULL) return -3; \
			seq->qual.s = tmp; \
			seq->qual.m = seq->seq.m; \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		/* read quality by length, so '@' or '>' inside it are plain data */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return (int)seq->seq.l; \
	}
214
/*
 * Declare kseq_t, the record reader state.
 *   name, comment, seq, qual - buffers filled by kseq_read()
 *   last_char                - header marker ('>' or '@') that was already
 *                              consumed while reading the previous record,
 *                              or 0 if none is pending
 *   f                        - underlying buffered stream
 * The `type_t` argument is accepted for symmetry with the other macros and
 * is not used in the expansion.
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
221
/*
 * Instantiate the complete FASTA/FASTQ reader for one handle type:
 *   type_t - type of the underlying handle
 *   __read - callable invoked as __read(handle, buf, nbytes)
 * Expands to the kstream_t layer with a 4096-byte buffer, the kseq_t
 * typedef, kseq_init/kseq_rewind/kseq_destroy and kseq_read.  Use it once
 * per translation unit.
 */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 4096) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ
227
228 #endif