0
|
1 #include "sam_header.h"
|
|
2 #include <stdio.h>
|
|
3 #include <string.h>
|
|
4 #include <ctype.h>
|
|
5 #include <stdlib.h>
|
|
6 #include <stdarg.h>
|
|
7
|
|
8 #include "khash.h"
|
|
9 KHASH_MAP_INIT_STR(str, const char *)
|
|
10
|
|
11 struct _HeaderList
|
|
12 {
|
|
13 struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
|
|
14 struct _HeaderList *next;
|
|
15 void *data;
|
|
16 };
|
|
17 typedef struct _HeaderList list_t;
|
|
18 typedef list_t HeaderDict;
|
|
19
|
|
20 typedef struct
|
|
21 {
|
|
22 char key[2];
|
|
23 char *value;
|
|
24 }
|
|
25 HeaderTag;
|
|
26
|
|
27 typedef struct
|
|
28 {
|
|
29 char type[2];
|
|
30 list_t *tags;
|
|
31 }
|
|
32 HeaderLine;
|
|
33
|
|
34 const char *o_hd_tags[] = {"SO","GO",NULL};
|
|
35 const char *r_hd_tags[] = {"VN",NULL};
|
|
36
|
|
37 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
|
|
38 const char *r_sq_tags[] = {"SN","LN",NULL};
|
|
39 const char *u_sq_tags[] = {"SN",NULL};
|
|
40
|
|
41 const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
|
|
42 const char *r_rg_tags[] = {"ID",NULL};
|
|
43 const char *u_rg_tags[] = {"ID",NULL};
|
|
44
|
|
45 const char *o_pg_tags[] = {"VN","CL",NULL};
|
|
46 const char *r_pg_tags[] = {"ID",NULL};
|
|
47
|
|
48 const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
|
|
49 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
|
|
50 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
|
|
51 const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
|
|
52
|
|
53
|
|
54 static void debug(const char *format, ...)
|
|
55 {
|
|
56 va_list ap;
|
|
57 va_start(ap, format);
|
|
58 vfprintf(stderr, format, ap);
|
|
59 va_end(ap);
|
|
60 }
|
|
61
|
|
62 #if 0
|
|
63 // Replaced by list_append_to_end
|
|
64 static list_t *list_prepend(list_t *root, void *data)
|
|
65 {
|
|
66 list_t *l = malloc(sizeof(list_t));
|
|
67 l->next = root;
|
|
68 l->data = data;
|
|
69 return l;
|
|
70 }
|
|
71 #endif
|
|
72
|
|
73 // Relies on the root->last being correct. Do not use with the other list_*
|
|
74 // routines unless they are fixed to modify root->last as well.
|
|
75 static list_t *list_append_to_end(list_t *root, void *data)
|
|
76 {
|
|
77 list_t *l = malloc(sizeof(list_t));
|
|
78 l->last = l;
|
|
79 l->next = NULL;
|
|
80 l->data = data;
|
|
81
|
|
82 if ( !root )
|
|
83 return l;
|
|
84
|
|
85 root->last->next = l;
|
|
86 root->last = l;
|
|
87 return root;
|
|
88 }
|
|
89
|
|
90 static list_t *list_append(list_t *root, void *data)
|
|
91 {
|
|
92 list_t *l = root;
|
|
93 while (l && l->next)
|
|
94 l = l->next;
|
|
95 if ( l )
|
|
96 {
|
|
97 l->next = malloc(sizeof(list_t));
|
|
98 l = l->next;
|
|
99 }
|
|
100 else
|
|
101 {
|
|
102 l = malloc(sizeof(list_t));
|
|
103 root = l;
|
|
104 }
|
|
105 l->data = data;
|
|
106 l->next = NULL;
|
|
107 return root;
|
|
108 }
|
|
109
|
|
110 static void list_free(list_t *root)
|
|
111 {
|
|
112 list_t *l = root;
|
|
113 while (root)
|
|
114 {
|
|
115 l = root;
|
|
116 root = root->next;
|
|
117 free(l);
|
|
118 }
|
|
119 }
|
|
120
|
|
121
|
|
122
|
|
123 // Look for a tag "XY" in a predefined const char *[] array.
|
|
124 static int tag_exists(const char *tag, const char **tags)
|
|
125 {
|
|
126 int itag=0;
|
|
127 if ( !tags ) return -1;
|
|
128 while ( tags[itag] )
|
|
129 {
|
|
130 if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
|
|
131 itag++;
|
|
132 }
|
|
133 return -1;
|
|
134 }
|
|
135
|
|
136
|
|
137
|
|
138 // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
|
|
139 // or NULL if everything has been read. The lineptr should be freed by the caller. The
|
|
140 // newline character is stripped.
|
|
141 static const char *nextline(char **lineptr, size_t *n, const char *text)
|
|
142 {
|
|
143 int len;
|
|
144 const char *to = text;
|
|
145
|
|
146 if ( !*to ) return NULL;
|
|
147
|
|
148 while ( *to && *to!='\n' && *to!='\r' ) to++;
|
|
149 len = to - text + 1;
|
|
150
|
|
151 if ( *to )
|
|
152 {
|
|
153 // Advance the pointer for the next call
|
|
154 if ( *to=='\n' ) to++;
|
|
155 else if ( *to=='\r' && *(to+1)=='\n' ) to+=2;
|
|
156 }
|
|
157 if ( !len )
|
|
158 return to;
|
|
159
|
|
160 if ( !*lineptr )
|
|
161 {
|
|
162 *lineptr = malloc(len);
|
|
163 *n = len;
|
|
164 }
|
|
165 else if ( *n<len )
|
|
166 {
|
|
167 *lineptr = realloc(*lineptr, len);
|
|
168 *n = len;
|
|
169 }
|
|
170 if ( !*lineptr ) {
|
|
171 debug("[nextline] Insufficient memory!\n");
|
|
172 return 0;
|
|
173 }
|
|
174
|
|
175 memcpy(*lineptr,text,len);
|
|
176 (*lineptr)[len-1] = 0;
|
|
177
|
|
178 return to;
|
|
179 }
|
|
180
|
|
181 // name points to "XY", value_from points to the first character of the value string and
|
|
182 // value_to points to the last character of the value string.
|
|
183 static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
|
|
184 {
|
|
185 HeaderTag *tag = malloc(sizeof(HeaderTag));
|
|
186 int len = value_to-value_from+1;
|
|
187
|
|
188 tag->key[0] = name[0];
|
|
189 tag->key[1] = name[1];
|
|
190 tag->value = malloc(len+1);
|
|
191 memcpy(tag->value,value_from,len+1);
|
|
192 tag->value[len] = 0;
|
|
193 return tag;
|
|
194 }
|
|
195
|
|
196 static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
|
|
197 {
|
|
198 list_t *tags = hline->tags;
|
|
199 while (tags)
|
|
200 {
|
|
201 HeaderTag *tag = tags->data;
|
|
202 if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
|
|
203 tags = tags->next;
|
|
204 }
|
|
205 return NULL;
|
|
206 }
|
|
207
|
|
208
|
|
209 // Return codes:
|
|
210 // 0 .. different types or unique tags differ or conflicting tags, cannot be merged
|
|
211 // 1 .. all tags identical -> no need to merge, drop one
|
|
212 // 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
|
|
213 // 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
|
|
214 static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
|
|
215 {
|
|
216 HeaderTag *t1, *t2;
|
|
217
|
|
218 if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
|
|
219 return 0;
|
|
220
|
|
221 int itype = tag_exists(hline1->type,types);
|
|
222 if ( itype==-1 ) {
|
|
223 debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
|
|
224 return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
|
|
225 }
|
|
226
|
|
227 if ( unique_tags[itype] )
|
|
228 {
|
|
229 t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
|
|
230 t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
|
|
231 if ( !t1 || !t2 ) // this should never happen, the unique tags are required
|
|
232 return 2;
|
|
233
|
|
234 if ( strcmp(t1->value,t2->value) )
|
|
235 return 0; // the unique tags differ, cannot be merged
|
|
236 }
|
|
237 if ( !required_tags[itype] && !optional_tags[itype] )
|
|
238 {
|
|
239 t1 = hline1->tags->data;
|
|
240 t2 = hline2->tags->data;
|
|
241 if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
|
|
242 return 0;
|
|
243 }
|
|
244
|
|
245 int missing=0, itag=0;
|
|
246 while ( required_tags[itype] && required_tags[itype][itag] )
|
|
247 {
|
|
248 t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
|
|
249 t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
|
|
250 if ( !t1 && !t2 )
|
|
251 return 2; // this should never happen
|
|
252 else if ( !t1 || !t2 )
|
|
253 missing = 1; // there is some tag missing in one of the hlines
|
|
254 else if ( strcmp(t1->value,t2->value) )
|
|
255 {
|
|
256 if ( unique_tags[itype] )
|
|
257 return 2; // the lines have a matching unique tag but have a conflicting tag
|
|
258
|
|
259 return 0; // the lines contain conflicting tags, cannot be merged
|
|
260 }
|
|
261 itag++;
|
|
262 }
|
|
263 itag = 0;
|
|
264 while ( optional_tags[itype] && optional_tags[itype][itag] )
|
|
265 {
|
|
266 t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
|
|
267 t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
|
|
268 if ( !t1 && !t2 )
|
|
269 {
|
|
270 itag++;
|
|
271 continue;
|
|
272 }
|
|
273 if ( !t1 || !t2 )
|
|
274 missing = 1; // there is some tag missing in one of the hlines
|
|
275 else if ( strcmp(t1->value,t2->value) )
|
|
276 {
|
|
277 if ( unique_tags[itype] )
|
|
278 return 2; // the lines have a matching unique tag but have a conflicting tag
|
|
279
|
|
280 return 0; // the lines contain conflicting tags, cannot be merged
|
|
281 }
|
|
282 itag++;
|
|
283 }
|
|
284 if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
|
|
285 return 1;
|
|
286 }
|
|
287
|
|
288
|
|
289 static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
|
|
290 {
|
|
291 list_t *tags;
|
|
292 HeaderLine *out = malloc(sizeof(HeaderLine));
|
|
293 out->type[0] = hline->type[0];
|
|
294 out->type[1] = hline->type[1];
|
|
295 out->tags = NULL;
|
|
296
|
|
297 tags = hline->tags;
|
|
298 while (tags)
|
|
299 {
|
|
300 HeaderTag *old = tags->data;
|
|
301
|
|
302 HeaderTag *new = malloc(sizeof(HeaderTag));
|
|
303 new->key[0] = old->key[0];
|
|
304 new->key[1] = old->key[1];
|
|
305 new->value = strdup(old->value);
|
|
306 out->tags = list_append(out->tags, new);
|
|
307
|
|
308 tags = tags->next;
|
|
309 }
|
|
310 return out;
|
|
311 }
|
|
312
|
|
313 static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
|
|
314 {
|
|
315 list_t *tmpl_tags;
|
|
316
|
|
317 if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
|
|
318 return 0;
|
|
319
|
|
320 tmpl_tags = tmpl_hline->tags;
|
|
321 while (tmpl_tags)
|
|
322 {
|
|
323 HeaderTag *tmpl_tag = tmpl_tags->data;
|
|
324 HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key);
|
|
325 if ( !out_tag )
|
|
326 {
|
|
327 HeaderTag *tag = malloc(sizeof(HeaderTag));
|
|
328 tag->key[0] = tmpl_tag->key[0];
|
|
329 tag->key[1] = tmpl_tag->key[1];
|
|
330 tag->value = strdup(tmpl_tag->value);
|
|
331 out_hline->tags = list_append(out_hline->tags,tag);
|
|
332 }
|
|
333 tmpl_tags = tmpl_tags->next;
|
|
334 }
|
|
335 return 1;
|
|
336 }
|
|
337
|
|
338
|
|
339 static HeaderLine *sam_header_line_parse(const char *headerLine)
|
|
340 {
|
|
341 HeaderLine *hline;
|
|
342 HeaderTag *tag;
|
|
343 const char *from, *to;
|
|
344 from = headerLine;
|
|
345
|
|
346 if ( *from != '@' ) {
|
|
347 debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
|
|
348 return 0;
|
|
349 }
|
|
350 to = ++from;
|
|
351
|
|
352 while (*to && *to!='\t') to++;
|
|
353 if ( to-from != 2 ) {
|
|
354 debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
|
|
355 return 0;
|
|
356 }
|
|
357
|
|
358 hline = malloc(sizeof(HeaderLine));
|
|
359 hline->type[0] = from[0];
|
|
360 hline->type[1] = from[1];
|
|
361 hline->tags = NULL;
|
|
362
|
|
363 int itype = tag_exists(hline->type, types);
|
|
364
|
|
365 from = to;
|
|
366 while (*to && *to=='\t') to++;
|
|
367 if ( to-from != 1 ) {
|
|
368 debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
|
|
369 free(hline);
|
|
370 return 0;
|
|
371 }
|
|
372 from = to;
|
|
373 while (*from)
|
|
374 {
|
|
375 while (*to && *to!='\t') to++;
|
|
376
|
|
377 if ( !required_tags[itype] && !optional_tags[itype] )
|
|
378 {
|
|
379 // CO is a special case, it can contain anything, including tabs
|
|
380 if ( *to ) { to++; continue; }
|
|
381 tag = new_tag(" ",from,to-1);
|
|
382 }
|
|
383 else
|
|
384 tag = new_tag(from,from+3,to-1);
|
|
385
|
|
386 if ( header_line_has_tag(hline,tag->key) )
|
|
387 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
|
|
388 hline->tags = list_append(hline->tags, tag);
|
|
389
|
|
390 from = to;
|
|
391 while (*to && *to=='\t') to++;
|
|
392 if ( *to && to-from != 1 ) {
|
|
393 debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
|
|
394 return 0;
|
|
395 }
|
|
396
|
|
397 from = to;
|
|
398 }
|
|
399 return hline;
|
|
400 }
|
|
401
|
|
402
|
|
403 // Must be of an existing type, all tags must be recognised and all required tags must be present
|
|
404 static int sam_header_line_validate(HeaderLine *hline)
|
|
405 {
|
|
406 list_t *tags;
|
|
407 HeaderTag *tag;
|
|
408 int itype, itag;
|
|
409
|
|
410 // Is the type correct?
|
|
411 itype = tag_exists(hline->type, types);
|
|
412 if ( itype==-1 )
|
|
413 {
|
|
414 debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
|
|
415 return 0;
|
|
416 }
|
|
417
|
|
418 // Has all required tags?
|
|
419 itag = 0;
|
|
420 while ( required_tags[itype] && required_tags[itype][itag] )
|
|
421 {
|
|
422 if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
|
|
423 {
|
|
424 debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
|
|
425 hline->type[0],hline->type[1]);
|
|
426 return 0;
|
|
427 }
|
|
428 itag++;
|
|
429 }
|
|
430
|
|
431 // Are all tags recognised?
|
|
432 tags = hline->tags;
|
|
433 while ( tags )
|
|
434 {
|
|
435 tag = tags->data;
|
|
436 if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
|
|
437 {
|
|
438 // Lower case tags are user-defined values.
|
|
439 if( !(islower(tag->key[0]) || islower(tag->key[1])) )
|
|
440 {
|
|
441 // Neither is lower case, but tag was not recognized.
|
|
442 debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
|
|
443 // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes
|
|
444 }
|
|
445 // else - allow user defined tag
|
|
446 }
|
|
447 tags = tags->next;
|
|
448 }
|
|
449
|
|
450 return 1;
|
|
451 }
|
|
452
|
|
453
|
|
454 static void print_header_line(FILE *fp, HeaderLine *hline)
|
|
455 {
|
|
456 list_t *tags = hline->tags;
|
|
457 HeaderTag *tag;
|
|
458
|
|
459 fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
|
|
460 while (tags)
|
|
461 {
|
|
462 tag = tags->data;
|
|
463
|
|
464 fprintf(fp, "\t");
|
|
465 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
|
|
466 fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
|
|
467 fprintf(fp, "%s", tag->value);
|
|
468
|
|
469 tags = tags->next;
|
|
470 }
|
|
471 fprintf(fp,"\n");
|
|
472 }
|
|
473
|
|
474
|
|
475 static void sam_header_line_free(HeaderLine *hline)
|
|
476 {
|
|
477 list_t *tags = hline->tags;
|
|
478 while (tags)
|
|
479 {
|
|
480 HeaderTag *tag = tags->data;
|
|
481 free(tag->value);
|
|
482 free(tag);
|
|
483 tags = tags->next;
|
|
484 }
|
|
485 list_free(hline->tags);
|
|
486 free(hline);
|
|
487 }
|
|
488
|
|
489 void sam_header_free(void *_header)
|
|
490 {
|
|
491 HeaderDict *header = (HeaderDict*)_header;
|
|
492 list_t *hlines = header;
|
|
493 while (hlines)
|
|
494 {
|
|
495 sam_header_line_free(hlines->data);
|
|
496 hlines = hlines->next;
|
|
497 }
|
|
498 list_free(header);
|
|
499 }
|
|
500
|
|
501 HeaderDict *sam_header_clone(const HeaderDict *dict)
|
|
502 {
|
|
503 HeaderDict *out = NULL;
|
|
504 while (dict)
|
|
505 {
|
|
506 HeaderLine *hline = dict->data;
|
|
507 out = list_append(out, sam_header_line_clone(hline));
|
|
508 dict = dict->next;
|
|
509 }
|
|
510 return out;
|
|
511 }
|
|
512
|
|
513 // Returns a newly allocated string
|
|
514 char *sam_header_write(const void *_header)
|
|
515 {
|
|
516 const HeaderDict *header = (const HeaderDict*)_header;
|
|
517 char *out = NULL;
|
|
518 int len=0, nout=0;
|
|
519 const list_t *hlines;
|
|
520
|
|
521 // Calculate the length of the string to allocate
|
|
522 hlines = header;
|
|
523 while (hlines)
|
|
524 {
|
|
525 len += 4; // @XY and \n
|
|
526
|
|
527 HeaderLine *hline = hlines->data;
|
|
528 list_t *tags = hline->tags;
|
|
529 while (tags)
|
|
530 {
|
|
531 HeaderTag *tag = tags->data;
|
|
532 len += strlen(tag->value) + 1; // \t
|
|
533 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
|
|
534 len += strlen(tag->value) + 3; // XY:
|
|
535 tags = tags->next;
|
|
536 }
|
|
537 hlines = hlines->next;
|
|
538 }
|
|
539
|
|
540 nout = 0;
|
|
541 out = malloc(len+1);
|
|
542 hlines = header;
|
|
543 while (hlines)
|
|
544 {
|
|
545 HeaderLine *hline = hlines->data;
|
|
546
|
|
547 nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
|
|
548
|
|
549 list_t *tags = hline->tags;
|
|
550 while (tags)
|
|
551 {
|
|
552 HeaderTag *tag = tags->data;
|
|
553 nout += sprintf(out+nout,"\t");
|
|
554 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
|
|
555 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
|
|
556 nout += sprintf(out+nout,"%s", tag->value);
|
|
557 tags = tags->next;
|
|
558 }
|
|
559 hlines = hlines->next;
|
|
560 nout += sprintf(out+nout,"\n");
|
|
561 }
|
|
562 out[len] = 0;
|
|
563 return out;
|
|
564 }
|
|
565
|
|
566 void *sam_header_parse2(const char *headerText)
|
|
567 {
|
|
568 list_t *hlines = NULL;
|
|
569 HeaderLine *hline;
|
|
570 const char *text;
|
|
571 char *buf=NULL;
|
|
572 size_t nbuf = 0;
|
|
573 int tovalidate = 0;
|
|
574
|
|
575 if ( !headerText )
|
|
576 return 0;
|
|
577
|
|
578 text = headerText;
|
|
579 while ( (text=nextline(&buf, &nbuf, text)) )
|
|
580 {
|
|
581 hline = sam_header_line_parse(buf);
|
|
582 if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
|
|
583 // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
|
|
584 hlines = list_append_to_end(hlines, hline);
|
|
585 else
|
|
586 {
|
|
587 if (hline) sam_header_line_free(hline);
|
|
588 sam_header_free(hlines);
|
|
589 if ( buf ) free(buf);
|
|
590 return NULL;
|
|
591 }
|
|
592 }
|
|
593 if ( buf ) free(buf);
|
|
594
|
|
595 return hlines;
|
|
596 }
|
|
597
|
|
598 void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
|
|
599 {
|
|
600 const HeaderDict *dict = (const HeaderDict*)_dict;
|
|
601 const list_t *l = dict;
|
|
602 khash_t(str) *tbl = kh_init(str);
|
|
603 khiter_t k;
|
|
604 int ret;
|
|
605
|
|
606 if (_dict == 0) return tbl; // return an empty (not null) hash table
|
|
607 while (l)
|
|
608 {
|
|
609 HeaderLine *hline = l->data;
|
|
610 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
|
|
611 {
|
|
612 l = l->next;
|
|
613 continue;
|
|
614 }
|
|
615
|
|
616 HeaderTag *key, *value;
|
|
617 key = header_line_has_tag(hline,key_tag);
|
|
618 value = header_line_has_tag(hline,value_tag);
|
|
619 if ( !key || !value )
|
|
620 {
|
|
621 l = l->next;
|
|
622 continue;
|
|
623 }
|
|
624
|
|
625 k = kh_get(str, tbl, key->value);
|
|
626 if ( k != kh_end(tbl) )
|
|
627 debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
|
|
628 k = kh_put(str, tbl, key->value, &ret);
|
|
629 kh_value(tbl, k) = value->value;
|
|
630
|
|
631 l = l->next;
|
|
632 }
|
|
633 return tbl;
|
|
634 }
|
|
635
|
|
636 char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
|
|
637 {
|
|
638 const HeaderDict *dict = (const HeaderDict*)_dict;
|
|
639 const list_t *l = dict;
|
|
640 int max, n;
|
|
641 char **ret;
|
|
642
|
|
643 ret = 0; *_n = max = n = 0;
|
|
644 while (l)
|
|
645 {
|
|
646 HeaderLine *hline = l->data;
|
|
647 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
|
|
648 {
|
|
649 l = l->next;
|
|
650 continue;
|
|
651 }
|
|
652
|
|
653 HeaderTag *key;
|
|
654 key = header_line_has_tag(hline,key_tag);
|
|
655 if ( !key )
|
|
656 {
|
|
657 l = l->next;
|
|
658 continue;
|
|
659 }
|
|
660
|
|
661 if (n == max) {
|
|
662 max = max? max<<1 : 4;
|
|
663 ret = realloc(ret, max * sizeof(void*));
|
|
664 }
|
|
665 ret[n++] = key->value;
|
|
666
|
|
667 l = l->next;
|
|
668 }
|
|
669 *_n = n;
|
|
670 return ret;
|
|
671 }
|
|
672
|
|
673 void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
|
|
674 {
|
|
675 list_t *l = iter;
|
|
676 if ( !l ) return NULL;
|
|
677
|
|
678 while (l)
|
|
679 {
|
|
680 HeaderLine *hline = l->data;
|
|
681 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
|
|
682 {
|
|
683 l = l->next;
|
|
684 continue;
|
|
685 }
|
|
686
|
|
687 HeaderTag *key, *value;
|
|
688 key = header_line_has_tag(hline,key_tag);
|
|
689 value = header_line_has_tag(hline,value_tag);
|
|
690 if ( !key && !value )
|
|
691 {
|
|
692 l = l->next;
|
|
693 continue;
|
|
694 }
|
|
695
|
|
696 *_key = key->value;
|
|
697 *_value = value->value;
|
|
698 return l->next;
|
|
699 }
|
|
700 return l;
|
|
701 }
|
|
702
|
|
703 const char *sam_tbl_get(void *h, const char *key)
|
|
704 {
|
|
705 khash_t(str) *tbl = (khash_t(str)*)h;
|
|
706 khint_t k;
|
|
707 k = kh_get(str, tbl, key);
|
|
708 return k == kh_end(tbl)? 0 : kh_val(tbl, k);
|
|
709 }
|
|
710
|
|
711 int sam_tbl_size(void *h)
|
|
712 {
|
|
713 khash_t(str) *tbl = (khash_t(str)*)h;
|
|
714 return h? kh_size(tbl) : 0;
|
|
715 }
|
|
716
|
|
717 void sam_tbl_destroy(void *h)
|
|
718 {
|
|
719 khash_t(str) *tbl = (khash_t(str)*)h;
|
|
720 kh_destroy(str, tbl);
|
|
721 }
|
|
722
|
|
723 void *sam_header_merge(int n, const void **_dicts)
|
|
724 {
|
|
725 const HeaderDict **dicts = (const HeaderDict**)_dicts;
|
|
726 HeaderDict *out_dict;
|
|
727 int idict, status;
|
|
728
|
|
729 if ( n<2 ) return NULL;
|
|
730
|
|
731 out_dict = sam_header_clone(dicts[0]);
|
|
732
|
|
733 for (idict=1; idict<n; idict++)
|
|
734 {
|
|
735 const list_t *tmpl_hlines = dicts[idict];
|
|
736
|
|
737 while ( tmpl_hlines )
|
|
738 {
|
|
739 list_t *out_hlines = out_dict;
|
|
740 int inserted = 0;
|
|
741 while ( out_hlines )
|
|
742 {
|
|
743 status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
|
|
744 if ( status==0 )
|
|
745 {
|
|
746 out_hlines = out_hlines->next;
|
|
747 continue;
|
|
748 }
|
|
749
|
|
750 if ( status==2 )
|
|
751 {
|
|
752 print_header_line(stderr,tmpl_hlines->data);
|
|
753 print_header_line(stderr,out_hlines->data);
|
|
754 debug("Conflicting lines, cannot merge the headers.\n");
|
|
755 return 0;
|
|
756 }
|
|
757 if ( status==3 )
|
|
758 sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
|
|
759
|
|
760 inserted = 1;
|
|
761 break;
|
|
762 }
|
|
763 if ( !inserted )
|
|
764 out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
|
|
765
|
|
766 tmpl_hlines = tmpl_hlines->next;
|
|
767 }
|
|
768 }
|
|
769
|
|
770 return out_dict;
|
|
771 }
|
|
772
|
|
773 char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n)
|
|
774 {
|
|
775 int nout = 0;
|
|
776 char **out = NULL;
|
|
777
|
|
778 *n = 0;
|
|
779 list_t *l = (list_t *)dict;
|
|
780 if ( !l ) return NULL;
|
|
781
|
|
782 int i, ntags = 0;
|
|
783 while ( tags[ntags] ) ntags++;
|
|
784
|
|
785 while (l)
|
|
786 {
|
|
787 HeaderLine *hline = l->data;
|
|
788 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
|
|
789 {
|
|
790 l = l->next;
|
|
791 continue;
|
|
792 }
|
|
793 out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags);
|
|
794 for (i=0; i<ntags; i++)
|
|
795 {
|
|
796 HeaderTag *key = header_line_has_tag(hline, tags[i]);
|
|
797 if ( !key )
|
|
798 {
|
|
799 out[nout*ntags+i] = NULL;
|
|
800 continue;
|
|
801 }
|
|
802 out[nout*ntags+i] = key->value;
|
|
803 }
|
|
804 nout++;
|
|
805 l = l->next;
|
|
806 }
|
|
807 *n = nout;
|
|
808 return out;
|
|
809 }
|
|
810
|