| 9 | 1 /* The MIT License | 
|  | 2 | 
|  | 3    Copyright (c) 2008 Genome Research Ltd (GRL). | 
|  | 4 | 
|  | 5    Permission is hereby granted, free of charge, to any person obtaining | 
|  | 6    a copy of this software and associated documentation files (the | 
|  | 7    "Software"), to deal in the Software without restriction, including | 
|  | 8    without limitation the rights to use, copy, modify, merge, publish, | 
|  | 9    distribute, sublicense, and/or sell copies of the Software, and to | 
|  | 10    permit persons to whom the Software is furnished to do so, subject to | 
|  | 11    the following conditions: | 
|  | 12 | 
|  | 13    The above copyright notice and this permission notice shall be | 
|  | 14    included in all copies or substantial portions of the Software. | 
|  | 15 | 
|  | 16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | 
|  | 17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | 
|  | 18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | 
|  | 19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | 
|  | 20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | 
|  | 21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | 
|  | 22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | 
|  | 23    SOFTWARE. | 
|  | 24 */ | 
|  | 25 | 
|  | 26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | 
|  | 27 | 
|  | 28 /* Last Modified: 12APR2009 */ | 
|  | 29 | 
|  | 30 #ifndef AC_KSEQ_H | 
|  | 31 #define AC_KSEQ_H | 
|  | 32 | 
|  | 33 #include <ctype.h> | 
|  | 34 #include <string.h> | 
|  | 35 #include <stdlib.h> | 
|  | 36 | 
/* Delimiter classes accepted by ks_getuntil(). A delimiter argument larger
   than KS_SEP_MAX is taken as a literal character to stop at. */
#define KS_SEP_SPACE 0 /* stop at any isspace() character, ' ' included */
#define KS_SEP_TAB   1 /* stop at any isspace() character except ' ' */
#define KS_SEP_MAX   1 /* largest delimiter-class code */
|  | 40 | 
/* __KS_TYPE(type_t): declare kstream_t, a buffered reader wrapped around a
   handle of type `type_t`.
     buf     read buffer
     begin   index of the next unread byte in buf
     end     number of valid bytes currently held in buf
     is_eof  non-zero once a read returned fewer bytes than were requested
     f       the underlying handle handed to the read function */
#define __KS_TYPE(type_t) \
  typedef struct __kstream_t { \
    char *buf; \
    int begin; \
    int end; \
    int is_eof; \
    type_t f; \
  } kstream_t;
|  | 47 | 
/* ks_eof(ks): true when the end-of-input flag is set and the buffer has been
   fully consumed. */
#define ks_eof(ks) \
  ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* ks_rewind(ks): clear the buffer indices and the end-of-input flag; the
   expression evaluates to 0. Only these three fields are touched. */
#define ks_rewind(ks) \
  ((ks)->is_eof = ((ks)->begin = ((ks)->end = 0)))
|  | 50 | 
/* __KS_BASIC(type_t, __bufsize): define the constructor and destructor of
   kstream_t.

   ks_init(f)     allocate a zero-initialised stream around handle `f` together
                  with a read buffer of __bufsize bytes. Returns NULL when
                  either allocation fails (nothing is leaked in that case).
   ks_destroy(ks) free the buffer and the stream object; a NULL `ks` is
                  accepted. The handle `f` itself is not closed here. */
#define __KS_BASIC(type_t, __bufsize) \
  static inline kstream_t *ks_init(type_t f) \
  { \
    kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
    if (ks == NULL) return NULL; \
    ks->f = f; \
    ks->buf = (char*)malloc(__bufsize); \
    if (ks->buf == NULL) { /* do not hand out a stream without a buffer */ \
      free(ks); \
      return NULL; \
    } \
    return ks; \
  } \
  static inline void ks_destroy(kstream_t *ks) \
  { \
    if (ks) { \
      free(ks->buf); \
      free(ks); \
    } \
  }
|  | 66 | 
/* __KS_GETC(__read, __bufsize): define ks_getc().

   ks_getc(ks) returns the next byte of the stream as a value in 0..255, or -1
   when no more data is available. The buffer is refilled through
   __read(ks->f, ks->buf, __bufsize) whenever it has been fully consumed.

   The byte is converted through unsigned char so that bytes >= 0x80 are not
   sign-extended on platforms where plain char is signed; in particular 0xFF
   is no longer returned as -1. A zero or negative result from __read ends
   the stream instead of returning stale buffer contents. */
#define __KS_GETC(__read, __bufsize) \
  static inline int ks_getc(kstream_t *ks) \
  { \
    if (ks->is_eof && ks->begin >= ks->end) return -1; \
    if (ks->begin >= ks->end) { \
      ks->begin = 0; \
      ks->end = __read(ks->f, ks->buf, __bufsize); \
      if (ks->end < __bufsize) ks->is_eof = 1; /* short read: last chunk */ \
      if (ks->end <= 0) { /* nothing read, or the reader reported an error */ \
        ks->end = 0; \
        return -1; \
      } \
    } \
    return (int)(unsigned char)ks->buf[ks->begin++]; \
  }
|  | 79 | 
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable character buffer. */
typedef struct __kstring_t {
  size_t l; /* number of bytes in use, not counting the terminating NUL */
  size_t m; /* number of bytes allocated for s */
  char *s;  /* the buffer; NULL until something has been stored */
} kstring_t;
#endif
|  | 87 | 
#ifndef kroundup32
/* kroundup32(x): round the integer lvalue x up, in place, to a power of two;
   a value that already is a power of two is left unchanged. The bit smearing
   uses shifts up to 16, i.e. it is designed for 32-bit values. Note that x is
   evaluated several times. */
#define kroundup32(x) \
  (--(x), \
   (x) |= (x) >> 1, \
   (x) |= (x) >> 2, \
   (x) |= (x) >> 4, \
   (x) |= (x) >> 8, \
   (x) |= (x) >> 16, \
   ++(x))
#endif
|  | 91 | 
/* __KS_GETUNTIL(__read, __bufsize): define ks_getuntil().

   ks_getuntil(ks, delimiter, str, dret) reads bytes from `ks` into `str` up
   to, but not including, the next delimiter, and consumes that delimiter.

     delimiter  KS_SEP_SPACE, KS_SEP_TAB, or a literal character code greater
                than KS_SEP_MAX
     str        output; its previous content is discarded (str->l is reset),
                the buffer is reused and grown as needed
     dret       if not NULL, receives the delimiter that ended the field, or 0
                when the input ended before a delimiter was seen

   Returns the length of the field (str->s is NUL-terminated whenever the
   result is >= 0), -1 if the stream was already exhausted on entry, or -2 if
   memory could not be allocated (str keeps its old, still valid buffer).

   An existing str->s is never replaced without being released, characters
   are passed to isspace() as unsigned char, and literal delimiters >= 0x80
   also match on platforms where plain char is signed. */
#define __KS_GETUNTIL(__read, __bufsize) \
  static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
  { \
    if (dret) *dret = 0; \
    str->l = 0; \
    if (ks->begin >= ks->end && ks->is_eof) return -1; \
    for (;;) { \
      int i; \
      size_t n; \
      if (ks->begin >= ks->end) { /* buffer drained: refill or stop */ \
        if (ks->is_eof) break; \
        ks->begin = 0; \
        ks->end = __read(ks->f, ks->buf, __bufsize); \
        if (ks->end < __bufsize) ks->is_eof = 1; \
        if (ks->end <= 0) { /* nothing read, or the reader reported an error */ \
          ks->end = 0; \
          break; \
        } \
      } \
      /* find the first delimiter in the unread part of the buffer */ \
      if (delimiter > KS_SEP_MAX) { \
        for (i = ks->begin; i < ks->end; ++i) \
          if ((unsigned char)ks->buf[i] == delimiter) break; \
      } else if (delimiter == KS_SEP_SPACE) { \
        for (i = ks->begin; i < ks->end; ++i) \
          if (isspace((unsigned char)ks->buf[i])) break; \
      } else if (delimiter == KS_SEP_TAB) { \
        for (i = ks->begin; i < ks->end; ++i) \
          if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
      } else i = 0; /* not reached for the delimiter values documented above */ \
      n = (size_t)(i - ks->begin); \
      if (str->m - str->l < n + 1) { /* room for this chunk plus the final NUL */ \
        size_t cap = str->l + n + 1; \
        char *grown; \
        kroundup32(cap); \
        grown = (char*)realloc(str->s, cap); \
        if (grown == NULL) return -2; \
        str->s = grown; \
        str->m = cap; \
      } \
      memcpy(str->s + str->l, ks->buf + ks->begin, n); \
      str->l += n; \
      ks->begin = i + 1; \
      if (i < ks->end) { /* stopped on a delimiter, not at the end of the buffer */ \
        if (dret) *dret = ks->buf[i]; \
        break; \
      } \
    } \
    if (str->s == NULL || str->m == 0) { /* nothing was ever stored: make "" */ \
      char *empty = (char*)realloc(str->s, 1); \
      if (empty == NULL) return -2; \
      str->s = empty; \
      str->m = 1; \
    } \
    str->s[str->l] = '\0'; \
    return (int)str->l; \
  }
|  | 138 | 
/* KSTREAM_INIT(type_t, __read, __bufsize): instantiate the buffered stream
   layer for one handle type, in this order: the kstream_t type, ks_init() and
   ks_destroy(), ks_getc(), and ks_getuntil().
     type_t     type of the underlying handle
     __read     called as __read(handle, buffer, __bufsize)
     __bufsize  size of the read buffer in bytes */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
  __KS_TYPE(type_t) \
  __KS_BASIC(type_t, __bufsize) \
  __KS_GETC(__read, __bufsize) \
  __KS_GETUNTIL(__read, __bufsize)
|  | 144 | 
/* __KSEQ_BASIC(type_t): define the life-cycle helpers of kseq_t.

   kseq_init(fd)    allocate a zero-initialised reader on top of a new stream
                    for handle `fd`. Returns NULL if memory allocation fails.
   kseq_rewind(ks)  reset the parser state and the stream buffer indices. Only
                    in-memory state is cleared; the handle is not repositioned
                    here.
   kseq_destroy(ks) free the four string buffers, the stream and the reader;
                    a NULL `ks` is accepted. The handle is not closed here. */
#define __KSEQ_BASIC(type_t) \
  static inline kseq_t *kseq_init(type_t fd) \
  { \
    kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
    if (s == NULL) return NULL; \
    s->f = ks_init(fd); \
    if (s->f == NULL) { /* never return a reader without a stream */ \
      free(s); \
      return NULL; \
    } \
    return s; \
  } \
  static inline void kseq_rewind(kseq_t *ks) \
  { \
    ks->last_char = 0; \
    ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
  } \
  static inline void kseq_destroy(kseq_t *ks) \
  { \
    if (!ks) return; \
    free(ks->name.s); \
    free(ks->comment.s); \
    free(ks->seq.s); \
    free(ks->qual.s); \
    ks_destroy(ks->f); \
    free(ks); \
  }
|  | 164 | 
/* __KSEQ_READ: define kseq_read(), which parses the next FASTA or FASTQ
   record from seq->f into seq->name, seq->comment, seq->seq and seq->qual.

   A record starts at a '>' or '@' marker. The first whitespace-delimited word
   of the header line becomes the name and the rest of the line the comment.
   Sequence characters (isgraph) are collected until the next '>', '+' or '@'.
   After '+', the remainder of that line is skipped and quality characters
   (codes 33..127) are read until as many as the sequence length have been
   collected.

   Fields the current record does not have are left empty (length 0 and, if a
   buffer exists, an empty string) rather than keeping text from the previous
   record.

   Return value:
     >=0  length of the sequence (normal)
     -1   end-of-file
     -2   truncated quality string
     -3   out of memory; the record being read is incomplete
*/
#define __KSEQ_READ \
  static int kseq_read(kseq_t *seq) \
  { \
    int c; \
    char *tmp; \
    kstream_t *ks = seq->f; \
    if (seq->last_char == 0) { /* then jump to the next header line */ \
      while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
      if (c == -1) return -1; /* end of file */ \
      seq->last_char = c; \
    } /* the first header char has been read */ \
    seq->comment.l = seq->seq.l = seq->qual.l = 0; \
    /* drop the previous record's optional fields */ \
    if (seq->comment.s && seq->comment.m) seq->comment.s[0] = '\0'; \
    if (seq->qual.s && seq->qual.m) seq->qual.s[0] = '\0'; \
    if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; \
    if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
    while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
      /* ctype functions need an unsigned char value */ \
      if (isgraph((unsigned char)c)) { /* printable non-space character */ \
        if (seq->seq.l + 1 >= seq->seq.m) { /* grow to the next 2^k */ \
          size_t cap = seq->seq.l + 2; \
          kroundup32(cap); \
          tmp = (char*)realloc(seq->seq.s, cap); \
          if (tmp == NULL) return -3; \
          seq->seq.s = tmp; \
          seq->seq.m = cap; \
        } \
        seq->seq.s[seq->seq.l++] = (char)c; \
      } \
    } \
    if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
    if (seq->seq.l + 1 >= seq->seq.m) { /* make room for the terminating NUL */ \
      size_t cap = seq->seq.l + 2; \
      kroundup32(cap); \
      tmp = (char*)realloc(seq->seq.s, cap); \
      if (tmp == NULL) return -3; \
      seq->seq.s = tmp; \
      seq->seq.m = cap; \
    } \
    seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
    if (c != '+') return (int)seq->seq.l; /* FASTA */ \
    if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
      tmp = (char*)realloc(seq->qual.s, seq->seq.m); \
      if (tmp == NULL) return -3; \
      seq->qual.s = tmp; \
      seq->qual.m = seq->seq.m; \
    } \
    seq->qual.s[0] = '\0'; /* stay a valid string if we bail out below */ \
    while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
    if (c == -1) return -2; /* we should not stop here */ \
    while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
      if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (char)c; \
    seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
    seq->last_char = 0; /* we have not come to the next header line */ \
    if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
    return (int)seq->seq.l; \
  }
|  | 214 | 
/* __KSEQ_TYPE(type_t): declare kseq_t, the state of one FASTA/FASTQ reader.
     name, comment, seq, qual  the fields of the most recently read record
     last_char                 header marker ('>' or '@') already consumed for
                               the next record, or 0 if none has been
     f                         the buffered input stream
   The type_t argument is not used by the expansion. */
#define __KSEQ_TYPE(type_t) \
  typedef struct { \
    kstring_t name; \
    kstring_t comment; \
    kstring_t seq; \
    kstring_t qual; \
    int last_char; \
    kstream_t *f; \
  } kseq_t;
|  | 221 | 
/* Size in bytes of the stream buffer used by KSEQ_INIT. Define KSEQ_BUFSIZE
   as a positive int constant before including this header to override the
   default. */
#ifndef KSEQ_BUFSIZE
#define KSEQ_BUFSIZE 4096
#endif

/* KSEQ_INIT(type_t, __read): instantiate the complete FASTA/FASTQ reader for
   one handle type: the buffered stream layer, the kseq_t type,
   kseq_init()/kseq_rewind()/kseq_destroy(), and kseq_read().
     type_t  type of the underlying handle
     __read  called as __read(handle, buffer, size) */
#define KSEQ_INIT(type_t, __read) \
  KSTREAM_INIT(type_t, __read, KSEQ_BUFSIZE) \
  __KSEQ_TYPE(type_t) \
  __KSEQ_BASIC(type_t) \
  __KSEQ_READ
|  | 227 | 
|  | 228 #endif |