comparison ezBAMQC/src/htslib/cram/sam_header.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #ifdef HAVE_CONFIG_H
32 #include "io_lib_config.h"
33 #endif
34
35 #include <string.h>
36 #include <assert.h>
37
38 #include "cram/sam_header.h"
39 #include "cram/string_alloc.h"
40
41 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
42 int j;
43
44 for (j = 0; j < len && line[j] != '\n'; j++)
45 ;
46 fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
47 }
48
49 void sam_hdr_dump(SAM_hdr *hdr) {
50 khint_t k;
51 int i;
52
53 printf("===DUMP===\n");
54 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
55 SAM_hdr_type *t1, *t2;
56 char c[2];
57
58 if (!kh_exist(hdr->h, k))
59 continue;
60
61 t1 = t2 = kh_val(hdr->h, k);
62 c[0] = kh_key(hdr->h, k)>>8;
63 c[1] = kh_key(hdr->h, k)&0xff;
64 printf("Type %.2s, count %d\n", c, t1->prev->order+1);
65
66 do {
67 SAM_hdr_tag *tag;
68 printf(">>>%d ", t1->order);
69 for (tag = t1->tag; tag; tag=tag->next) {
70 printf("\"%.2s\":\"%.*s\"\t",
71 tag->str, tag->len-3, tag->str+3);
72 }
73 putchar('\n');
74 t1 = t1->next;
75 } while (t1 != t2);
76 }
77
78 /* Dump out PG chains */
79 printf("\n@PG chains:\n");
80 for (i = 0; i < hdr->npg_end; i++) {
81 int j;
82 printf(" %d:", i);
83 for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
84 printf("%s%d(%.*s)",
85 j == hdr->pg_end[i] ? " " : "->",
86 j, hdr->pg[j].name_len, hdr->pg[j].name);
87 }
88 printf("\n");
89 }
90
91 puts("===END DUMP===");
92 }
93
94 /* Updates the hash tables in the SAM_hdr structure.
95 *
96 * Returns 0 on success;
97 * -1 on failure
98 */
99 static int sam_hdr_update_hashes(SAM_hdr *sh,
100 int type,
101 SAM_hdr_type *h_type) {
102 /* Add to reference hash? */
103 if ((type>>8) == 'S' && (type&0xff) == 'Q') {
104 SAM_hdr_tag *tag;
105 int nref = sh->nref;
106
107 sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
108 if (!sh->ref)
109 return -1;
110
111 tag = h_type->tag;
112 sh->ref[nref].name = NULL;
113 sh->ref[nref].len = 0;
114 sh->ref[nref].ty = h_type;
115 sh->ref[nref].tag = tag;
116
117 while (tag) {
118 if (tag->str[0] == 'S' && tag->str[1] == 'N') {
119 if (!(sh->ref[nref].name = malloc(tag->len)))
120 return -1;
121 strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
122 sh->ref[nref].name[tag->len-3] = 0;
123 } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
124 sh->ref[nref].len = atoi(tag->str+3);
125 }
126 tag = tag->next;
127 }
128
129 if (sh->ref[nref].name) {
130 khint_t k;
131 int r;
132 k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
133 if (-1 == r) return -1;
134 kh_val(sh->ref_hash, k) = nref;
135 }
136
137 sh->nref++;
138 }
139
140 /* Add to read-group hash? */
141 if ((type>>8) == 'R' && (type&0xff) == 'G') {
142 SAM_hdr_tag *tag;
143 int nrg = sh->nrg;
144
145 sh->rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
146 if (!sh->rg)
147 return -1;
148
149 tag = h_type->tag;
150 sh->rg[nrg].name = NULL;
151 sh->rg[nrg].name_len = 0;
152 sh->rg[nrg].ty = h_type;
153 sh->rg[nrg].tag = tag;
154 sh->rg[nrg].id = nrg;
155
156 while (tag) {
157 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
158 if (!(sh->rg[nrg].name = malloc(tag->len)))
159 return -1;
160 strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
161 sh->rg[nrg].name[tag->len-3] = 0;
162 sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
163 }
164 tag = tag->next;
165 }
166
167 if (sh->rg[nrg].name) {
168 khint_t k;
169 int r;
170 k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
171 if (-1 == r) return -1;
172 kh_val(sh->rg_hash, k) = nrg;
173 }
174
175 sh->nrg++;
176 }
177
178 /* Add to program hash? */
179 if ((type>>8) == 'P' && (type&0xff) == 'G') {
180 SAM_hdr_tag *tag;
181 int npg = sh->npg;
182
183 sh->pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
184 if (!sh->pg)
185 return -1;
186
187 tag = h_type->tag;
188 sh->pg[npg].name = NULL;
189 sh->pg[npg].name_len = 0;
190 sh->pg[npg].ty = h_type;
191 sh->pg[npg].tag = tag;
192 sh->pg[npg].id = npg;
193 sh->pg[npg].prev_id = -1;
194
195 while (tag) {
196 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
197 if (!(sh->pg[npg].name = malloc(tag->len)))
198 return -1;
199 strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
200 sh->pg[npg].name[tag->len-3] = 0;
201 sh->pg[npg].name_len = strlen(sh->pg[npg].name);
202 } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
203 // Resolve later if needed
204 khint_t k;
205 char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
206 k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
207 tag->str[tag->len] = tmp;
208
209 if (k != kh_end(sh->pg_hash)) {
210 int p_id = kh_val(sh->pg_hash, k);
211 sh->pg[npg].prev_id = sh->pg[p_id].id;
212
213 /* Unmark previous entry as a PG termination */
214 if (sh->npg_end > 0 &&
215 sh->pg_end[sh->npg_end-1] == p_id) {
216 sh->npg_end--;
217 } else {
218 int i;
219 for (i = 0; i < sh->npg_end; i++) {
220 if (sh->pg_end[i] == p_id) {
221 memmove(&sh->pg_end[i], &sh->pg_end[i+1],
222 (sh->npg_end-i-1)*sizeof(*sh->pg_end));
223 sh->npg_end--;
224 }
225 }
226 }
227 } else {
228 sh->pg[npg].prev_id = -1;
229 }
230 }
231 tag = tag->next;
232 }
233
234 if (sh->pg[npg].name) {
235 khint_t k;
236 int r;
237 k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
238 if (-1 == r) return -1;
239 kh_val(sh->pg_hash, k) = npg;
240 }
241
242 /* Add to npg_end[] array. Remove later if we find a PP line */
243 if (sh->npg_end >= sh->npg_end_alloc) {
244 sh->npg_end_alloc = sh->npg_end_alloc
245 ? sh->npg_end_alloc*2
246 : 4;
247 sh->pg_end = realloc(sh->pg_end,
248 sh->npg_end_alloc * sizeof(int));
249 if (!sh->pg_end)
250 return -1;
251 }
252 sh->pg_end[sh->npg_end++] = npg;
253
254 sh->npg++;
255 }
256
257 return 0;
258 }
259
260 /*
261 * Appends a formatted line to an existing SAM header.
262 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
263 * optional new-line. If it contains more than 1 line then multiple lines
264 * will be added in order.
265 *
266 * Len is the length of the text data, or 0 if unknown (in which case
267 * it should be null terminated).
268 *
269 * Returns 0 on success
270 * -1 on failure
271 */
272 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
273 int i, lno = 1, text_offset;
274 char *hdr;
275
276 if (!len)
277 len = strlen(lines);
278
279 text_offset = ks_len(&sh->text);
280 if (EOF == kputsn(lines, len, &sh->text))
281 return -1;
282 hdr = ks_str(&sh->text) + text_offset;
283
284 for (i = 0; i < len; i++) {
285 khint32_t type;
286 khint_t k;
287
288 int l_start = i, new;
289 SAM_hdr_type *h_type;
290 SAM_hdr_tag *h_tag, *last;
291
292 if (hdr[i] != '@') {
293 int j;
294 for (j = i; j < len && hdr[j] != '\n'; j++)
295 ;
296 sam_hdr_error("Header line does not start with '@'",
297 &hdr[l_start], len - l_start, lno);
298 return -1;
299 }
300
301 type = (hdr[i+1]<<8) | hdr[i+2];
302 if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
303 hdr[i+2] < 'A' || hdr[i+2] > 'z') {
304 sam_hdr_error("Header line does not have a two character key",
305 &hdr[l_start], len - l_start, lno);
306 return -1;
307 }
308
309 i += 3;
310 if (hdr[i] == '\n')
311 continue;
312
313 // Add the header line type
314 if (!(h_type = pool_alloc(sh->type_pool)))
315 return -1;
316 if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
317 return -1;
318
319 // Form the ring, either with self or other lines of this type
320 if (!new) {
321 SAM_hdr_type *t = kh_val(sh->h, k), *p;
322 p = t->prev;
323
324 assert(p->next = t);
325 p->next = h_type;
326 h_type->prev = p;
327
328 t->prev = h_type;
329 h_type->next = t;
330 h_type->order = p->order+1;
331 } else {
332 kh_val(sh->h, k) = h_type;
333 h_type->prev = h_type->next = h_type;
334 h_type->order = 0;
335 }
336
337 // Parse the tags on this line
338 last = NULL;
339 if ((type>>8) == 'C' && (type&0xff) == 'O') {
340 int j;
341 if (hdr[i] != '\t') {
342 sam_hdr_error("Missing tab",
343 &hdr[l_start], len - l_start, lno);
344 return -1;
345 }
346
347 for (j = ++i; j < len && hdr[j] != '\n'; j++)
348 ;
349
350 if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
351 return -1;
352 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
353 h_tag->len = j-i;
354 h_tag->next = NULL;
355 if (!h_tag->str)
356 return -1;
357
358 i = j;
359
360 } else {
361 do {
362 int j;
363 if (hdr[i] != '\t') {
364 sam_hdr_error("Missing tab",
365 &hdr[l_start], len - l_start, lno);
366 return -1;
367 }
368
369 for (j = ++i; j < len && hdr[j] != '\n' && hdr[j] != '\t'; j++)
370 ;
371
372 if (!(h_tag = pool_alloc(sh->tag_pool)))
373 return -1;
374 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
375 h_tag->len = j-i;
376 h_tag->next = NULL;
377 if (!h_tag->str)
378 return -1;
379
380 if (h_tag->len < 3 || h_tag->str[2] != ':') {
381 sam_hdr_error("Malformed key:value pair",
382 &hdr[l_start], len - l_start, lno);
383 return -1;
384 }
385
386 if (last)
387 last->next = h_tag;
388 else
389 h_type->tag = h_tag;
390
391 last = h_tag;
392 i = j;
393 } while (i < len && hdr[i] != '\n');
394 }
395
396 /* Update RG/SQ hashes */
397 if (-1 == sam_hdr_update_hashes(sh, type, h_type))
398 return -1;
399 }
400
401 return 0;
402 }
403
404 /*
405 * Adds a single line to a SAM header.
406 * Specify type and one or more key,value pairs, ending with the NULL key.
407 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
408 *
409 * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
410 * -1 on failure
411 */
412 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
413 va_list args;
414 va_start(args, type);
415 return sam_hdr_vadd(sh, type, args, NULL);
416 }
417
418 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
419 va_list args;
420 SAM_hdr_type *h_type;
421 SAM_hdr_tag *h_tag, *last;
422 int new;
423 khint32_t type_i = (type[0]<<8) | type[1], k;
424
425 #if defined(HAVE_VA_COPY)
426 va_list ap_local;
427 #endif
428
429 if (EOF == kputc_('@', &sh->text))
430 return -1;
431 if (EOF == kputsn(type, 2, &sh->text))
432 return -1;
433
434 if (!(h_type = pool_alloc(sh->type_pool)))
435 return -1;
436 if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
437 return -1;
438 kh_val(sh->h, k) = h_type;
439
440 // Form the ring, either with self or other lines of this type
441 if (!new) {
442 SAM_hdr_type *t = kh_val(sh->h, k), *p;
443 p = t->prev;
444
445 assert(p->next = t);
446 p->next = h_type;
447 h_type->prev = p;
448
449 t->prev = h_type;
450 h_type->next = t;
451 h_type->order = p->order + 1;
452 } else {
453 h_type->prev = h_type->next = h_type;
454 h_type->order = 0;
455 }
456
457 last = NULL;
458
459 // Any ... varargs
460 va_start(args, ap);
461 for (;;) {
462 char *k, *v;
463 int idx;
464
465 if (!(k = (char *)va_arg(args, char *)))
466 break;
467 v = va_arg(args, char *);
468
469 if (EOF == kputc_('\t', &sh->text))
470 return -1;
471
472 if (!(h_tag = pool_alloc(sh->tag_pool)))
473 return -1;
474 idx = ks_len(&sh->text);
475
476 if (EOF == kputs(k, &sh->text))
477 return -1;
478 if (EOF == kputc_(':', &sh->text))
479 return -1;
480 if (EOF == kputs(v, &sh->text))
481 return -1;
482
483 h_tag->len = ks_len(&sh->text) - idx;
484 h_tag->str = string_ndup(sh->str_pool,
485 ks_str(&sh->text) + idx,
486 h_tag->len);
487 h_tag->next = NULL;
488 if (!h_tag->str)
489 return -1;
490
491 if (last)
492 last->next = h_tag;
493 else
494 h_type->tag = h_tag;
495
496 last = h_tag;
497 }
498 va_end(args);
499
500 #if defined(HAVE_VA_COPY)
501 va_copy(ap_local, ap);
502 # define ap ap_local
503 #endif
504
505 // Plus the specified va_list params
506 for (;;) {
507 char *k, *v;
508 int idx;
509
510 if (!(k = (char *)va_arg(ap, char *)))
511 break;
512 v = va_arg(ap, char *);
513
514 if (EOF == kputc_('\t', &sh->text))
515 return -1;
516
517 if (!(h_tag = pool_alloc(sh->tag_pool)))
518 return -1;
519 idx = ks_len(&sh->text);
520
521 if (EOF == kputs(k, &sh->text))
522 return -1;
523 if (EOF == kputc_(':', &sh->text))
524 return -1;
525 if (EOF == kputs(v, &sh->text))
526 return -1;
527
528 h_tag->len = ks_len(&sh->text) - idx;
529 h_tag->str = string_ndup(sh->str_pool,
530 ks_str(&sh->text) + idx,
531 h_tag->len);
532 h_tag->next = NULL;
533 if (!h_tag->str)
534 return -1;
535
536 if (last)
537 last->next = h_tag;
538 else
539 h_type->tag = h_tag;
540
541 last = h_tag;
542 }
543 va_end(ap);
544
545 if (EOF == kputc('\n', &sh->text))
546 return -1;
547
548 int itype = (type[0]<<8) | type[1];
549 if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
550 return -1;
551
552 return h_type->order;
553 }
554
555 /*
556 * Returns the first header item matching 'type'. If ID is non-NULL it checks
557 * for the tag ID: and compares against the specified ID.
558 *
559 * Returns NULL if no type/ID is found
560 */
561 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
562 char *ID_key, char *ID_value) {
563 SAM_hdr_type *t1, *t2;
564 int itype = (type[0]<<8)|(type[1]);
565 khint_t k;
566
567 /* Special case for types we have prebuilt hashes on */
568 if (ID_key) {
569 if (type[0] == 'S' && type[1] == 'Q' &&
570 ID_key[0] == 'S' && ID_key[1] == 'N') {
571 k = kh_get(m_s2i, hdr->ref_hash, ID_value);
572 return k != kh_end(hdr->ref_hash)
573 ? hdr->ref[kh_val(hdr->ref_hash, k)].ty
574 : NULL;
575 }
576
577 if (type[0] == 'R' && type[1] == 'G' &&
578 ID_key[0] == 'I' && ID_key[1] == 'D') {
579 k = kh_get(m_s2i, hdr->rg_hash, ID_value);
580 return k != kh_end(hdr->rg_hash)
581 ? hdr->rg[kh_val(hdr->rg_hash, k)].ty
582 : NULL;
583 }
584
585 if (type[0] == 'P' && type[1] == 'G' &&
586 ID_key[0] == 'I' && ID_key[1] == 'D') {
587 k = kh_get(m_s2i, hdr->pg_hash, ID_value);
588 return k != kh_end(hdr->pg_hash)
589 ? hdr->pg[kh_val(hdr->pg_hash, k)].ty
590 : NULL;
591 }
592 }
593
594 k = kh_get(sam_hdr, hdr->h, itype);
595 if (k == kh_end(hdr->h))
596 return NULL;
597
598 if (!ID_key)
599 return kh_val(hdr->h, k);
600
601 t1 = t2 = kh_val(hdr->h, k);
602 do {
603 SAM_hdr_tag *tag;
604 for (tag = t1->tag; tag; tag = tag->next) {
605 if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
606 char *cp1 = tag->str+3;
607 char *cp2 = ID_value;
608 while (*cp1 && *cp1 == *cp2)
609 cp1++, cp2++;
610 if (*cp2 || *cp1)
611 continue;
612 return t1;
613 }
614 }
615 t1 = t1->next;
616 } while (t1 != t2);
617
618 return NULL;
619 }
620
621 /*
622 * As per SAM_hdr_type, but returns a complete line of formatted text
623 * for a specific head type/ID combination. If ID is NULL then it returns
624 * the first line of the specified type.
625 *
626 * The returned string is malloced and should be freed by the calling
627 * function with free().
628 *
629 * Returns NULL if no type/ID is found.
630 */
631 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
632 char *ID_key, char *ID_value) {
633 SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
634 kstring_t ks = KS_INITIALIZER;
635 SAM_hdr_tag *tag;
636 int r = 0;
637
638 if (!ty)
639 return NULL;
640
641 // Paste together the line from the hashed copy
642 r |= (kputc_('@', &ks) == EOF);
643 r |= (kputs(type, &ks) == EOF);
644 for (tag = ty->tag; tag; tag = tag->next) {
645 r |= (kputc_('\t', &ks) == EOF);
646 r |= (kputsn(tag->str, tag->len, &ks) == EOF);
647 }
648
649 if (r) {
650 KS_FREE(&ks);
651 return NULL;
652 }
653
654 return ks_str(&ks);
655 }
656
657
658 /*
659 * Looks for a specific key in a single sam header line.
660 * If prev is non-NULL it also fills this out with the previous tag, to
661 * permit use in key removal. *prev is set to NULL when the tag is the first
662 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
663 * tag in the existing list.
664 *
665 * Returns the tag pointer on success
666 * NULL on failure
667 */
668 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
669 SAM_hdr_type *type,
670 char *key,
671 SAM_hdr_tag **prev) {
672 SAM_hdr_tag *tag, *p = NULL;
673
674 for (tag = type->tag; tag; p = tag, tag = tag->next) {
675 if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
676 if (prev)
677 *prev = p;
678 return tag;
679 }
680 }
681
682 if (prev)
683 *prev = p;
684
685 return NULL;
686 }
687
688
689 /*
690 * Adds or updates tag key,value pairs in a header line.
691 * Eg for adding M5 tags to @SQ lines or updating sort order for the
692 * @HD line (although use the sam_hdr_sort_order() function for
693 * HD manipulation, which is a wrapper around this funuction).
694 *
695 * Specify multiple key,value pairs ending in NULL.
696 *
697 * Returns 0 on success
698 * -1 on failure
699 */
700 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
701 va_list ap;
702
703 va_start(ap, type);
704
705 for (;;) {
706 char *k, *v;
707 int idx;
708 SAM_hdr_tag *tag, *prev;
709
710 if (!(k = (char *)va_arg(ap, char *)))
711 break;
712 v = va_arg(ap, char *);
713
714 tag = sam_hdr_find_key(hdr, type, k, &prev);
715 if (!tag) {
716 if (!(tag = pool_alloc(hdr->tag_pool)))
717 return -1;
718 if (prev)
719 prev->next = tag;
720 else
721 type->tag = tag;
722
723 tag->next = NULL;
724 }
725
726 idx = ks_len(&hdr->text);
727 if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
728 return -1;
729 tag->len = ks_len(&hdr->text) - idx;
730 tag->str = string_ndup(hdr->str_pool,
731 ks_str(&hdr->text) + idx,
732 tag->len);
733 if (!tag->str)
734 return -1;
735 }
736
737 va_end(ap);
738
739 return 0;
740 }
741
742 #define K(a) (((a)[0]<<8)|((a)[1]))
743
744 /*
745 * Reconstructs the kstring from the header hash table.
746 * Returns 0 on success
747 * -1 on failure
748 */
749 int sam_hdr_rebuild(SAM_hdr *hdr) {
750 /* Order: HD then others */
751 kstring_t ks = KS_INITIALIZER;
752 khint_t k;
753
754
755 k = kh_get(sam_hdr, hdr->h, K("HD"));
756 if (k != kh_end(hdr->h)) {
757 SAM_hdr_type *ty = kh_val(hdr->h, k);
758 SAM_hdr_tag *tag;
759 if (EOF == kputs("@HD", &ks))
760 return -1;
761 for (tag = ty->tag; tag; tag = tag->next) {
762 if (EOF == kputc_('\t', &ks))
763 return -1;
764 if (EOF == kputsn_(tag->str, tag->len, &ks))
765 return -1;
766 }
767 if (EOF == kputc('\n', &ks))
768 return -1;
769 }
770
771 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
772 SAM_hdr_type *t1, *t2;
773
774 if (!kh_exist(hdr->h, k))
775 continue;
776
777 if (kh_key(hdr->h, k) == K("HD"))
778 continue;
779
780 t1 = t2 = kh_val(hdr->h, k);
781 do {
782 SAM_hdr_tag *tag;
783 char c[2];
784
785 if (EOF == kputc_('@', &ks))
786 return -1;
787 c[0] = kh_key(hdr->h, k)>>8;
788 c[1] = kh_key(hdr->h, k)&0xff;
789 if (EOF == kputsn_(c, 2, &ks))
790 return -1;
791 for (tag = t1->tag; tag; tag=tag->next) {
792 if (EOF == kputc_('\t', &ks))
793 return -1;
794 if (EOF == kputsn_(tag->str, tag->len, &ks))
795 return -1;
796 }
797 if (EOF == kputc('\n', &ks))
798 return -1;
799 t1 = t1->next;
800 } while (t1 != t2);
801 }
802
803 if (ks_str(&hdr->text))
804 KS_FREE(&hdr->text);
805
806 hdr->text = ks;
807
808 return 0;
809 }
810
811
812 /*
813 * Creates an empty SAM header, ready to be populated.
814 *
815 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
816 * NULL on failure
817 */
818 SAM_hdr *sam_hdr_new() {
819 SAM_hdr *sh = calloc(1, sizeof(*sh));
820
821 if (!sh)
822 return NULL;
823
824 sh->h = kh_init(sam_hdr);
825 if (!sh->h)
826 goto err;
827
828 sh->ID_cnt = 1;
829 sh->ref_count = 1;
830
831 sh->nref = 0;
832 sh->ref = NULL;
833 if (!(sh->ref_hash = kh_init(m_s2i)))
834 goto err;
835
836 sh->nrg = 0;
837 sh->rg = NULL;
838 if (!(sh->rg_hash = kh_init(m_s2i)))
839 goto err;
840
841 sh->npg = 0;
842 sh->pg = NULL;
843 sh->npg_end = sh->npg_end_alloc = 0;
844 sh->pg_end = NULL;
845 if (!(sh->pg_hash = kh_init(m_s2i)))
846 goto err;
847
848 KS_INIT(&sh->text);
849
850 if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
851 goto err;
852
853 if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
854 goto err;
855
856 if (!(sh->str_pool = string_pool_create(8192)))
857 goto err;
858
859 return sh;
860
861 err:
862 if (sh->h)
863 kh_destroy(sam_hdr, sh->h);
864
865 if (sh->tag_pool)
866 pool_destroy(sh->tag_pool);
867
868 if (sh->type_pool)
869 pool_destroy(sh->type_pool);
870
871 if (sh->str_pool)
872 string_pool_destroy(sh->str_pool);
873
874 free(sh);
875
876 return NULL;
877 }
878
879
880 /*
881 * Tokenises a SAM header into a hash table.
882 * Also extracts a few bits on specific data types, such as @RG lines.
883 *
884 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
885 * NULL on failure
886 */
887 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
888 /* Make an empty SAM_hdr */
889 SAM_hdr *sh;
890
891 sh = sam_hdr_new();
892 if (NULL == sh) return NULL;
893
894 if (NULL == hdr) return sh; // empty header is permitted
895
896 /* Parse the header, line by line */
897 if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
898 sam_hdr_free(sh);
899 return NULL;
900 }
901
902 //sam_hdr_dump(sh);
903 //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
904 //sam_hdr_rebuild(sh);
905 //printf(">>%s<<", ks_str(sh->text));
906
907 //parse_references(sh);
908 //parse_read_groups(sh);
909
910 sam_hdr_link_pg(sh);
911 //sam_hdr_dump(sh);
912
913 return sh;
914 }
915
916 /*
917 * Produces a duplicate copy of hdr and returns it.
918 * Returns NULL on failure
919 */
920 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
921 if (-1 == sam_hdr_rebuild(hdr))
922 return NULL;
923
924 return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
925 }
926
927 /*! Increments a reference count on hdr.
928 *
929 * This permits multiple files to share the same header, all calling
930 * sam_hdr_free when done, without causing errors for other open files.
931 */
932 void sam_hdr_incr_ref(SAM_hdr *hdr) {
933 hdr->ref_count++;
934 }
935
936 /*! Increments a reference count on hdr.
937 *
938 * This permits multiple files to share the same header, all calling
939 * sam_hdr_free when done, without causing errors for other open files.
940 *
941 * If the reference count hits zero then the header is automatically
942 * freed. This makes it a synonym for sam_hdr_free().
943 */
944 void sam_hdr_decr_ref(SAM_hdr *hdr) {
945 sam_hdr_free(hdr);
946 }
947
948 /*! Deallocates all storage used by a SAM_hdr struct.
949 *
950 * This also decrements the header reference count. If after decrementing
951 * it is still non-zero then the header is assumed to be in use by another
952 * caller and the free is not done.
953 *
954 * This is a synonym for sam_hdr_dec_ref().
955 */
956 void sam_hdr_free(SAM_hdr *hdr) {
957 if (!hdr)
958 return;
959
960 if (--hdr->ref_count > 0)
961 return;
962
963 if (ks_str(&hdr->text))
964 KS_FREE(&hdr->text);
965
966 if (hdr->h)
967 kh_destroy(sam_hdr, hdr->h);
968
969 if (hdr->ref_hash)
970 kh_destroy(m_s2i, hdr->ref_hash);
971
972 if (hdr->ref) {
973 int i;
974 for (i = 0; i < hdr->nref; i++)
975 if (hdr->ref[i].name)
976 free(hdr->ref[i].name);
977 free(hdr->ref);
978 }
979
980 if (hdr->rg_hash)
981 kh_destroy(m_s2i, hdr->rg_hash);
982
983 if (hdr->rg) {
984 int i;
985 for (i = 0; i < hdr->nrg; i++)
986 if (hdr->rg[i].name)
987 free(hdr->rg[i].name);
988 free(hdr->rg);
989 }
990
991 if (hdr->pg_hash)
992 kh_destroy(m_s2i, hdr->pg_hash);
993
994 if (hdr->pg) {
995 int i;
996 for (i = 0; i < hdr->npg; i++)
997 if (hdr->pg[i].name)
998 free(hdr->pg[i].name);
999 free(hdr->pg);
1000 }
1001
1002 if (hdr->pg_end)
1003 free(hdr->pg_end);
1004
1005 if (hdr->type_pool)
1006 pool_destroy(hdr->type_pool);
1007
1008 if (hdr->tag_pool)
1009 pool_destroy(hdr->tag_pool);
1010
1011 if (hdr->str_pool)
1012 string_pool_destroy(hdr->str_pool);
1013
1014 free(hdr);
1015 }
1016
1017 int sam_hdr_length(SAM_hdr *hdr) {
1018 return ks_len(&hdr->text);
1019 }
1020
1021 char *sam_hdr_str(SAM_hdr *hdr) {
1022 return ks_str(&hdr->text);
1023 }
1024
1025 /*
1026 * Looks up a reference sequence by name and returns the numerical ID.
1027 * Returns -1 if unknown reference.
1028 */
1029 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
1030 khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
1031 return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
1032 }
1033
1034 /*
1035 * Looks up a read-group by name and returns a pointer to the start of the
1036 * associated tag list.
1037 *
1038 * Returns NULL on failure
1039 */
1040 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
1041 khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
1042 return k == kh_end(hdr->rg_hash)
1043 ? NULL
1044 : &hdr->rg[kh_val(hdr->rg_hash, k)];
1045 }
1046
1047
1048 /*
1049 * Fixes any PP links in @PG headers.
1050 * If the entries are in order then this doesn't need doing, but incase
1051 * our header is out of order this goes through the sh->pg[] array
1052 * setting the prev_id field.
1053 *
1054 * Note we can have multiple complete chains. This code should identify the
1055 * tails of these chains as these are the entries we have to link to in
1056 * subsequent PP records.
1057 *
1058 * Returns 0 on sucess
1059 * -1 on failure (indicating broken PG/PP records)
1060 */
1061 int sam_hdr_link_pg(SAM_hdr *hdr) {
1062 int i, j, ret = 0;
1063
1064 hdr->npg_end_alloc = hdr->npg;
1065 hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1066 if (!hdr->pg_end)
1067 return -1;
1068
1069 for (i = 0; i < hdr->npg; i++)
1070 hdr->pg_end[i] = i;
1071
1072 for (i = 0; i < hdr->npg; i++) {
1073 khint_t k;
1074 SAM_hdr_tag *tag;
1075 char tmp;
1076
1077 for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1078 if (tag->str[0] == 'P' && tag->str[1] == 'P')
1079 break;
1080 }
1081 if (!tag) {
1082 /* Chain start points */
1083 continue;
1084 }
1085
1086 tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
1087 k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
1088 tag->str[tag->len] = tmp;
1089
1090 if (k == kh_end(hdr->pg_hash)) {
1091 ret = -1;
1092 continue;
1093 }
1094
1095 hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
1096 hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
1097 }
1098
1099 for (i = j = 0; i < hdr->npg; i++) {
1100 if (hdr->pg_end[i] != -1)
1101 hdr->pg_end[j++] = hdr->pg_end[i];
1102 }
1103 hdr->npg_end = j;
1104
1105 return ret;
1106 }
1107
1108 /*
1109 * Returns a unique ID from a base name.
1110 *
1111 * The value returned is valid until the next call to
1112 * this function.
1113 */
1114 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1115 khint_t k = kh_get(m_s2i, sh->pg_hash, name);
1116 if (k == kh_end(sh->pg_hash))
1117 return name;
1118
1119 do {
1120 sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1121 k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
1122 } while (k == kh_end(sh->pg_hash));
1123
1124 return sh->ID_buf;
1125 }
1126
1127 /*
1128 * Add an @PG line.
1129 *
1130 * If we wish complete control over this use sam_hdr_add() directly. This
1131 * function uses that, but attempts to do a lot of tedious house work for
1132 * you too.
1133 *
1134 * - It will generate a suitable ID if the supplied one clashes.
1135 * - It will generate multiple @PG records if we have multiple PG chains.
1136 *
1137 * Call it as per sam_hdr_add() with a series of key,value pairs ending
1138 * in NULL.
1139 *
1140 * Returns 0 on success
1141 * -1 on failure
1142 */
1143 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1144 va_list args;
1145 va_start(args, name);
1146
1147 if (sh->npg_end) {
1148 /* Copy ends array to avoid us looping while modifying it */
1149 int *end = malloc(sh->npg_end * sizeof(int));
1150 int i, nends = sh->npg_end;
1151
1152 if (!end)
1153 return -1;
1154
1155 memcpy(end, sh->pg_end, nends * sizeof(*end));
1156
1157 for (i = 0; i < nends; i++) {
1158 if (-1 == sam_hdr_vadd(sh, "PG", args,
1159 "ID", sam_hdr_PG_ID(sh, name),
1160 "PN", name,
1161 "PP", sh->pg[end[i]].name,
1162 NULL)) {
1163 free(end);
1164 return -1;
1165 }
1166 }
1167
1168 free(end);
1169 } else {
1170 if (-1 == sam_hdr_vadd(sh, "PG", args,
1171 "ID", sam_hdr_PG_ID(sh, name),
1172 "PN", name,
1173 NULL))
1174 return -1;
1175 }
1176
1177 //sam_hdr_dump(sh);
1178
1179 return 0;
1180 }
1181
1182 /*
1183 * A function to help with construction of CL tags in @PG records.
1184 * Takes an argc, argv pair and returns a single space-separated string.
1185 * This string should be deallocated by the calling function.
1186 *
1187 * Returns malloced char * on success
1188 * NULL on failure
1189 */
1190 char *stringify_argv(int argc, char *argv[]) {
1191 char *str, *cp;
1192 size_t nbytes = 1;
1193 int i, j;
1194
1195 /* Allocate */
1196 for (i = 0; i < argc; i++) {
1197 nbytes += strlen(argv[i]) + 1;
1198 }
1199 if (!(str = malloc(nbytes)))
1200 return NULL;
1201
1202 /* Copy */
1203 cp = str;
1204 for (i = 0; i < argc; i++) {
1205 j = 0;
1206 while (argv[i][j]) {
1207 if (argv[i][j] == '\t')
1208 *cp++ = ' ';
1209 else
1210 *cp++ = argv[i][j];
1211 j++;
1212 }
1213 *cp++ = ' ';
1214 }
1215 *cp++ = 0;
1216
1217 return str;
1218 }