| 
9
 | 
/* The MIT License

   Copyright (c) 2008 Genome Research Ltd (GRL).

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/
 | 
| 
 | 
    25 
 | 
| 
 | 
/* Contact: Heng Li <lh3@sanger.ac.uk> */

/* Last Modified: 12APR2009 */
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 #ifndef AC_KSEQ_H
 | 
| 
 | 
    31 #define AC_KSEQ_H
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 #include <ctype.h>
 | 
| 
 | 
    34 #include <string.h>
 | 
| 
 | 
    35 #include <stdlib.h>
 | 
| 
 | 
    36 
 | 
| 
 | 
    /* Symbolic separator codes accepted as the `delimiter` argument of
     * ks_getuntil(). Any delimiter value greater than KS_SEP_MAX is compared
     * against the input as a literal character instead. */
    #define KS_SEP_SPACE 0 /* stop at any isspace() char: ' ', \t, \n, \v, \f, \r */
    #define KS_SEP_TAB   1 /* stop at any isspace() char except ' ' */
    #define KS_SEP_MAX   1 /* largest symbolic separator code */
 | 
| 
 | 
    40 
 | 
| 
 | 
    /* Declares kstream_t, a buffered reader wrapped around a handle of type
     * `type_t`.
     *   buf    - read buffer (allocated by ks_init)
     *   begin  - index of the next unread byte in buf
     *   end    - number of valid bytes currently held in buf
     *   is_eof - non-zero once the underlying reader returned a short read
     *   f      - the wrapped file/stream handle
     */
    #define __KS_TYPE(type_t)                       \
      typedef struct __kstream_t {                  \
        char *buf;                                  \
        int begin, end, is_eof;                     \
        type_t f;                                   \
      } kstream_t;
 | 
| 
 | 
    47 
 | 
| 
 | 
    /* ks_eof(ks): true when the reader has reported end-of-input AND every
     * buffered byte has been consumed. */
    #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
    /* ks_rewind(ks): reset the buffer bookkeeping (is_eof, begin, end) to 0.
     * Only these three fields are touched; the wrapped handle is not
     * repositioned by this macro. */
    #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 | 
| 
 | 
    50 
 | 
| 
 | 
    /* Defines the kstream_t constructor and destructor.
     *
     * ks_init(f):    allocate a zeroed kstream_t wrapping handle `f`, with a
     *                read buffer of `__bufsize` bytes. Returns NULL if either
     *                allocation fails (nothing is leaked in that case).
     * ks_destroy(ks): free the buffer and the stream object; NULL is accepted.
     *                The wrapped handle `f` is not closed here.
     */
    #define __KS_BASIC(type_t, __bufsize)                                   \
      static inline kstream_t *ks_init(type_t f)                            \
      {                                                                     \
        kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));           \
        if (ks == NULL) return NULL;                                        \
        ks->buf = (char*)malloc(__bufsize);                                 \
        if (ks->buf == NULL) { /* do not hand back a half-built stream */   \
          free(ks);                                                         \
          return NULL;                                                      \
        }                                                                   \
        ks->f = f;                                                          \
        return ks;                                                          \
      }                                                                     \
      static inline void ks_destroy(kstream_t *ks)                          \
      {                                                                     \
        if (ks) {                                                           \
          free(ks->buf);                                                    \
          free(ks);                                                         \
        }                                                                   \
      }
 | 
| 
 | 
    66 
 | 
| 
 | 
    /* Defines ks_getc(ks): return the next byte of the stream as a value in
     * 0..255, or -1 when no more input is available.
     *
     * `__read(f, buf, n)` is called to refill the buffer when it runs dry; a
     * result smaller than `__bufsize` marks the stream as finished. A zero or
     * negative result (end of input or read error) yields -1.
     *
     * The byte is converted through unsigned char so that input byte 0xFF is
     * never confused with the -1 end-of-input value on platforms where plain
     * char is signed.
     */
    #define __KS_GETC(__read, __bufsize)                                    \
      static inline int ks_getc(kstream_t *ks)                              \
      {                                                                     \
        if (ks->begin >= ks->end) {                                         \
          if (ks->is_eof) return -1;                                        \
          ks->begin = 0;                                                    \
          ks->end = __read(ks->f, ks->buf, __bufsize);                      \
          if (ks->end < __bufsize) ks->is_eof = 1;                          \
          if (ks->end <= 0) { /* nothing read: EOF or reader error */       \
            ks->end = 0;                                                    \
            return -1;                                                      \
          }                                                                 \
        }                                                                   \
        return (int)(unsigned char)ks->buf[ks->begin++];                    \
      }
 | 
| 
 | 
    79 
 | 
| 
 | 
    /* Growable string used for parsed fields. Guarded by KSTRING_T so that
     * another header providing the same type can be included alongside.
     *   l - current length in bytes (terminator not counted)
     *   m - allocated capacity of s in bytes
     *   s - heap buffer, or NULL when nothing has been allocated yet
     */
    #ifndef KSTRING_T
    #define KSTRING_T kstring_t
    typedef struct __kstring_t {
    	size_t l, m;
    	char *s;
    } kstring_t;
    #endif
 | 
| 
 | 
    87 
 | 
| 
 | 
    /* kroundup32(x): round the unsigned lvalue `x` up, in place, to the next
     * power of two (values that already are a power of two are unchanged).
     * `x` is evaluated several times, so it must be free of side effects.
     * The final shift is 32 for operands wider than 4 bytes and 0 otherwise,
     * so 64-bit operands such as size_t capacities are also rounded to a true
     * power of two without shifting a 32-bit operand by its full width. */
    #ifndef kroundup32
    #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>(sizeof(x)>4?32:0), ++(x))
    #endif
 | 
| 
 | 
    91 
 | 
| 
 | 
    /* Defines ks_getuntil(ks, delimiter, str, dret): read one token from the
     * stream into `str`, stopping at the first delimiter.
     *
     *   delimiter - KS_SEP_SPACE, KS_SEP_TAB, or (when > KS_SEP_MAX) a literal
     *               character to stop at
     *   str       - output string; its length is reset to 0 and its buffer is
     *               grown with realloc() as needed and always NUL-terminated
     *   dret      - if non-NULL, receives the delimiter character that ended
     *               the token, or 0 when the token was ended by end of input
     *
     * Returns the token length, or -1 if the stream was already exhausted on
     * entry. The delimiter itself is consumed but not stored.
     *
     * Bytes are converted to unsigned char before being passed to isspace(),
     * which is undefined for negative arguments other than EOF. An existing
     * str->s buffer is kept when the token is empty; a 1-byte buffer is only
     * allocated when str->s is still NULL.
     * NOTE: realloc()/calloc() results are not checked for failure.
     */
    #define __KS_GETUNTIL(__read, __bufsize)                                \
      static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
      {                                                                     \
        if (dret) *dret = 0;                                                \
        str->l = 0;                                                         \
        if (ks->begin >= ks->end && ks->is_eof) return -1;                  \
        for (;;) {                                                          \
          int i;                                                            \
          if (ks->begin >= ks->end) { /* buffer exhausted: refill */        \
            if (ks->is_eof) break;                                          \
            ks->begin = 0;                                                  \
            ks->end = __read(ks->f, ks->buf, __bufsize);                    \
            if (ks->end < __bufsize) ks->is_eof = 1;                        \
            if (ks->end <= 0) { /* nothing read: EOF or reader error */     \
              ks->end = 0;                                                  \
              break;                                                        \
            }                                                               \
          }                                                                 \
          /* scan for the delimiter; i == ks->end means "not in buffer" */  \
          if (delimiter > KS_SEP_MAX) {                                     \
            for (i = ks->begin; i < ks->end; ++i)                           \
              if (ks->buf[i] == delimiter) break;                           \
          } else if (delimiter == KS_SEP_SPACE) {                           \
            for (i = ks->begin; i < ks->end; ++i)                           \
              if (isspace((unsigned char)ks->buf[i])) break;                \
          } else if (delimiter == KS_SEP_TAB) {                             \
            for (i = ks->begin; i < ks->end; ++i)                           \
              if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
          } else i = 0; /* never come to here! */                           \
          /* make room for the new bytes plus the terminating NUL */        \
          if (str->m - str->l < (size_t)(i - ks->begin + 1)) {              \
            str->m = str->l + (i - ks->begin) + 1;                          \
            kroundup32(str->m);                                             \
            str->s = (char*)realloc(str->s, str->m);                        \
          }                                                                 \
          memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);      \
          str->l = str->l + (i - ks->begin);                                \
          ks->begin = i + 1; /* skip over the delimiter */                  \
          if (i < ks->end) { /* stopped on a delimiter, not at buffer end */ \
            if (dret) *dret = ks->buf[i];                                   \
            break;                                                          \
          }                                                                 \
        }                                                                   \
        if (str->s == NULL) { /* nothing allocated yet: give it a NUL slot */ \
          str->m = 1;                                                       \
          str->s = (char*)calloc(1, 1);                                     \
        }                                                                   \
        str->s[str->l] = '\0';                                              \
        return (int)str->l;                                                 \
      }
 | 
| 
 | 
   138 
 | 
| 
 | 
   /* KSTREAM_INIT(type_t, __read, __bufsize): instantiate the buffered stream
    * layer for one handle type. Expands, in order, to the kstream_t type and
    * the ks_init/ks_destroy, ks_getc and ks_getuntil definitions.
    *   type_t    - type of the wrapped handle
    *   __read    - callable used as __read(handle, buffer, nbytes)
    *   __bufsize - size in bytes of the internal read buffer
    */
   #define KSTREAM_INIT(type_t, __read, __bufsize)                         \
     __KS_TYPE(type_t)                                                     \
     __KS_BASIC(type_t, __bufsize)                                         \
     __KS_GETC(__read, __bufsize)                                          \
     __KS_GETUNTIL(__read, __bufsize)
 | 
| 
 | 
   144 
 | 
| 
 | 
   /* Defines the kseq_t lifecycle helpers.
    *
    * kseq_init(fd):    allocate a zeroed record reader over handle `fd`.
    *                   Returns NULL if the record or its stream cannot be
    *                   allocated.
    * kseq_rewind(ks):  forget the pending header character and reset the
    *                   stream's buffer bookkeeping (is_eof, begin, end) to 0.
    *                   The wrapped handle itself is not touched here.
    * kseq_destroy(ks): free the four field buffers, the stream and the record;
    *                   NULL is accepted. The wrapped handle is not closed.
    */
   #define __KSEQ_BASIC(type_t)                                            \
     static inline kseq_t *kseq_init(type_t fd)                            \
     {                                                                     \
       kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                     \
       if (s == NULL) return NULL;                                         \
       s->f = ks_init(fd);                                                 \
       if (s->f == NULL) { /* no stream: do not return a broken reader */  \
         free(s);                                                          \
         return NULL;                                                      \
       }                                                                   \
       return s;                                                           \
     }                                                                     \
     static inline void kseq_rewind(kseq_t *ks)                            \
     {                                                                     \
       ks->last_char = 0;                                                  \
       ks->f->is_eof = ks->f->begin = ks->f->end = 0;                      \
     }                                                                     \
     static inline void kseq_destroy(kseq_t *ks)                           \
     {                                                                     \
       if (!ks) return;                                                    \
       free(ks->name.s);                                                   \
       free(ks->comment.s);                                                \
       free(ks->seq.s);                                                    \
       free(ks->qual.s);                                                   \
       ks_destroy(ks->f);                                                  \
       free(ks);                                                           \
     }
 | 
| 
 | 
   164 
 | 
| 
 | 
   /* Defines kseq_read(seq): parse the next FASTA/FASTQ record from seq->f.
    *
    * On success seq->name, seq->comment, seq->seq and (FASTQ only) seq->qual
    * hold the record; a field that is absent has length 0. The buffers are
    * reused between calls.
    *
    * Return value:
    *   >=0  length of the sequence (normal)
    *   -1   end-of-file
    *   -2   truncated quality string
    *
    * seq->last_char remembers a '>' or '@' that was consumed while reading the
    * previous record's sequence, so the next call starts directly at the name.
    * Characters are converted to unsigned char before isgraph(), which is
    * undefined for negative arguments other than EOF.
    * NOTE: realloc() results are not checked for failure.
    */
   #define __KSEQ_READ                                                     \
     static int kseq_read(kseq_t *seq)                                     \
     {                                                                     \
       int c;                                                              \
       kstream_t *ks = seq->f;                                             \
       if (seq->last_char == 0) { /* then jump to the next header line */  \
         while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');          \
         if (c == -1) return -1; /* end of file */                         \
         seq->last_char = c;                                               \
       } /* the first header char has been read */                         \
       seq->comment.l = seq->seq.l = seq->qual.l = 0;                      \
       if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1;   \
       if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);             \
       /* sequence: every printable non-space char up to the next marker */ \
       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
         if (isgraph((unsigned char)c)) {                                  \
           if (seq->seq.l + 1 >= seq->seq.m) { /* grow the buffer */       \
             seq->seq.m = seq->seq.l + 2;                                  \
             kroundup32(seq->seq.m); /* rounded to next closest 2^k */     \
             seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m);          \
           }                                                               \
           seq->seq.s[seq->seq.l++] = (char)c;                             \
         }                                                                 \
       }                                                                   \
       if (c == '>' || c == '@') seq->last_char = c; /* next header char already read */ \
       if (seq->seq.l + 1 >= seq->seq.m) { /* room for the terminator below */ \
         seq->seq.m = seq->seq.l + 2;                                      \
         kroundup32(seq->seq.m); /* rounded to the next closest 2^k */     \
         seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m);              \
       }                                                                   \
       seq->seq.s[seq->seq.l] = 0; /* null terminated string */            \
       if (c != '+') return (int)seq->seq.l; /* FASTA */                   \
       if (seq->qual.m < seq->seq.m) { /* allocate enough memory */        \
         seq->qual.m = seq->seq.m;                                         \
         seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);           \
       }                                                                   \
       while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
       if (c == -1) return -2; /* we should not stop here */               \
       /* quality: read until as many chars as the sequence were stored */ \
       while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)         \
         if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (char)c;    \
       seq->qual.s[seq->qual.l] = 0; /* null terminated string */          \
       seq->last_char = 0; /* we have not come to the next header line */  \
       if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
       return (int)seq->seq.l;                                             \
     }
 | 
| 
 | 
   214 
 | 
| 
 | 
   /* Declares kseq_t, the state of one FASTA/FASTQ reader.
    *   name, comment, seq, qual - fields of the most recently parsed record
    *   last_char - header character ('>' or '@') already consumed for the
    *               next record, or 0 if none is pending
    *   f         - the underlying buffered stream
    * The `type_t` argument is not used by this macro's expansion.
    */
   #define __KSEQ_TYPE(type_t)                                             \
     typedef struct {                                                      \
       kstring_t name, comment, seq, qual;                                 \
       int last_char;                                                      \
       kstream_t *f;                                                       \
     } kseq_t;
 | 
| 
 | 
   221 
 | 
| 
 | 
   /* KSEQ_INIT(type_t, __read): instantiate the complete FASTA/FASTQ reader
    * for one handle type. Expands to the stream layer with a 4096-byte read
    * buffer, followed by the kseq_t type, kseq_init/kseq_rewind/kseq_destroy
    * and kseq_read.
    *   type_t - type of the wrapped handle
    *   __read - callable used as __read(handle, buffer, nbytes)
    */
   #define KSEQ_INIT(type_t, __read)                                       \
     KSTREAM_INIT(type_t, __read, 4096)                                    \
     __KSEQ_TYPE(type_t)                                                   \
     __KSEQ_BASIC(type_t)                                                  \
     __KSEQ_READ
 | 
| 
 | 
   227 
 | 
| 
 | 
   228 #endif
 |