/*
Copyright (c) 2013-2014 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.

   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
      Institute nor the names of its contributors may be used to endorse or promote
      products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*! \file
 * SAM header parsing.
 *
 * These functions can be shared between SAM, BAM and CRAM file
 * formats as all three internally use the same string encoding for
 * header fields.
 */

/*
 * TODO.
 *
 * - Sort order (parse to struct, enum type, updating funcs).
 * - Removal of lines.
 * - Updating of lines.
 */

#ifndef _SAM_HDR_H_
#define _SAM_HDR_H_

#ifdef __cplusplus
extern "C" {
#endif

#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif

#include <stdarg.h>
#include <stdint.h>
#include <stdlib.h>

#include "cram/string_alloc.h"
#include "cram/pooled_alloc.h"

#include "htslib/khash.h"
#include "htslib/kstring.h"

// Static initialiser for an empty kstring_t (zero length, zero capacity,
// no buffer).  For structure assignment, eg: kstring_t s = KS_INITIALIZER;
#define KS_INITIALIZER { 0, 0, NULL }

// Resets an existing kstring_t (passed by pointer) to the empty state
// without freeing anything.  For initialisation elsewhere, eg:
// KS_INIT(&x->str);  Note that 'ks' is evaluated three times.
#define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL)

// Frees the string buffer (ks->s) only; the kstring_t that 'ks' points to
// is not freed and its fields are left untouched, so call KS_INIT() before
// reusing it.  free(NULL) is a no-op, so no NULL guard is needed and 'ks'
// is evaluated exactly once.
#define KS_FREE(ks) do { free((ks)->s); } while (0)

/*
 * Proposed new SAM header parsing

1 @SQ ID:foo LN:100
2 @SQ ID:bar LN:200
3 @SQ ID:ram LN:300 UR:xyz
4 @RG ID:r ...
5 @RG ID:s ...

Hash table for 2-char @keys without dup entries.
If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.

HASH("SQ")--+
            |
    (3) <-> 1 <-> 2 <-> 3 <-> (1)

HASH("RG")--+
            |
    (5) <-> 4 <-> 5 <-> (4)

Items stored in the hash values also form their own linked lists:
Ie SQ->ID(foo)->LN(100)
   SQ->ID(bar)->LN(200)
   SQ->ID(ram)->LN(300)->UR(xyz)
   RG->ID(r)
*/

/*! A single key:value pair on a header line
 *
 * These form a linked list and hold strings.  The strings are
 * allocated from a string_alloc_t pool referenced in the master
 * SAM_hdr structure.  Do not attempt to free, malloc or manipulate
 * these strings directly.
 */
typedef struct SAM_hdr_tag_s {
    struct SAM_hdr_tag_s *next; //!< Next tag on the same header line
    char *str;                  //!< Tag text; owned by the string pool
    int   len;                  //!< Presumably the length of str - confirm
} SAM_hdr_tag;

115 /*! The parsed version of the SAM header string.
|
|
116 *
|
|
117 * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type
|
|
118 * struct via the main hash table h in the SAM_hdr struct.
|
|
119 *
|
|
120 * These in turn consist of circular bi-directional linked lists (ie
|
|
121 * rings) to hold the multiple instances of the same header type
|
|
122 * code. For example if we have 5 \@SQ lines the primary hash table
|
|
123 * will key on \@SQ pointing to the first SAM_hdr_type and that in turn
|
|
124 * will be part of a ring of 5 elements.
|
|
125 *
|
|
126 * For each SAM_hdr_type structure we also point to a SAM_hdr_tag
|
|
127 * structure which holds the tokenised attributes; the tab separated
|
|
128 * key:value pairs per line.
|
|
129 */
|
|
130 typedef struct SAM_hdr_item_s {
|
|
131 struct SAM_hdr_item_s *next; // cirular
|
|
132 struct SAM_hdr_item_s *prev;
|
|
133 SAM_hdr_tag *tag; // first tag
|
|
134 int order; // 0 upwards
|
|
135 } SAM_hdr_type;
|
|
136
|
|
137 /*! Parsed \@SQ lines */
|
|
138 typedef struct {
|
|
139 char *name;
|
|
140 uint32_t len;
|
|
141 SAM_hdr_type *ty;
|
|
142 SAM_hdr_tag *tag;
|
|
143 } SAM_SQ;
|
|
144
|
|
145 /*! Parsed \@RG lines */
|
|
146 typedef struct {
|
|
147 char *name;
|
|
148 SAM_hdr_type *ty;
|
|
149 SAM_hdr_tag *tag;
|
|
150 int name_len;
|
|
151 int id; // numerical ID
|
|
152 } SAM_RG;
|
|
153
|
|
154 /*! Parsed \@PG lines */
|
|
155 typedef struct {
|
|
156 char *name;
|
|
157 SAM_hdr_type *ty;
|
|
158 SAM_hdr_tag *tag;
|
|
159 int name_len;
|
|
160 int id; // numerical ID
|
|
161 int prev_id; // -1 if none
|
|
162 } SAM_PG;
|
|
163
|
|
164 KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*)
|
|
165 KHASH_MAP_INIT_STR(m_s2i, int)
|
|
166
|
|
167 /*! Primary structure for header manipulation
|
|
168 *
|
|
169 * The initial header text is held in the text kstring_t, but is also
|
|
170 * parsed out into SQ, RG and PG arrays. These have a hash table
|
|
171 * associated with each to allow lookup by ID or SN fields instead of
|
|
172 * their numeric array indices. Additionally PG has an array to hold
|
|
173 * the linked list start points (the last in a PP chain).
|
|
174 *
|
|
175 * Use the appropriate sam_hdr_* functions to edit the header, and
|
|
176 * call sam_hdr_rebuild() any time the textual form needs to be
|
|
177 * updated again.
|
|
178 */
|
|
179 typedef struct {
|
|
180 kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag
|
|
181 khash_t(sam_hdr) *h;
|
|
182 string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings
|
|
183 pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs
|
|
184 pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs
|
|
185
|
|
186 // @SQ lines / references
|
|
187 int nref; //!< Number of \@SQ lines
|
|
188 SAM_SQ *ref; //!< Array of parsed \@SQ lines
|
|
189 khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to sq[] index
|
|
190
|
|
191 // @RG lines / read-groups
|
|
192 int nrg; //!< Number of \@RG lines
|
|
193 SAM_RG *rg; //!< Array of parsed \@RG lines
|
|
194 khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
|
|
195
|
|
196 // @PG lines / programs
|
|
197 int npg; //!< Number of \@PG lines
|
|
198 int npg_end; //!< Number of terminating \@PG lines
|
|
199 int npg_end_alloc; //!< Size of pg_end field
|
|
200 SAM_PG *pg; //!< Array of parsed \@PG lines
|
|
201 khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
|
|
202 int *pg_end; //!< \@PG chain termination IDs
|
|
203
|
|
204 // @cond internal
|
|
205 char ID_buf[1024]; // temporary buffer
|
|
206 int ID_cnt;
|
|
207 int ref_count; // number of uses of this SAM_hdr
|
|
208 // @endcond
|
|
209 } SAM_hdr;
|
|
210
|
|
211 /*! Creates an empty SAM header, ready to be populated.
|
|
212 *
|
|
213 * @return
|
|
214 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
|
|
215 * NULL on failure
|
|
216 */
|
|
217 SAM_hdr *sam_hdr_new(void);
|
|
218
|
|
219 /*! Tokenises a SAM header into a hash table.
|
|
220 *
|
|
221 * Also extracts a few bits on specific data types, such as @RG lines.
|
|
222 *
|
|
223 * @return
|
|
224 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
|
|
225 * NULL on failure
|
|
226 */
|
|
227 SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
|
|
228
|
|
229
|
|
230 /*! Produces a duplicate copy of hdr and returns it.
|
|
231 * @return
|
|
232 * Returns NULL on failure
|
|
233 */
|
|
234 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr);
|
|
235
|
|
236
|
|
237 /*! Increments a reference count on hdr.
|
|
238 *
|
|
239 * This permits multiple files to share the same header, all calling
|
|
240 * sam_hdr_free when done, without causing errors for other open files.
|
|
241 */
|
|
242 void sam_hdr_incr_ref(SAM_hdr *hdr);
|
|
243
|
|
244
|
|
245 /*! Increments a reference count on hdr.
|
|
246 *
|
|
247 * This permits multiple files to share the same header, all calling
|
|
248 * sam_hdr_free when done, without causing errors for other open files.
|
|
249 *
|
|
250 * If the reference count hits zero then the header is automatically
|
|
251 * freed. This makes it a synonym for sam_hdr_free().
|
|
252 */
|
|
253 void sam_hdr_decr_ref(SAM_hdr *hdr);
|
|
254
|
|
255
|
|
256 /*! Deallocates all storage used by a SAM_hdr struct.
|
|
257 *
|
|
258 * This also decrements the header reference count. If after decrementing
|
|
259 * it is still non-zero then the header is assumed to be in use by another
|
|
260 * caller and the free is not done.
|
|
261 *
|
|
262 * This is a synonym for sam_hdr_dec_ref().
|
|
263 */
|
|
264 void sam_hdr_free(SAM_hdr *hdr);
|
|
265
|
|
266 /*! Returns the current length of the SAM_hdr in text form.
|
|
267 *
|
|
268 * Call sam_hdr_rebuild() first if editing has taken place.
|
|
269 */
|
|
270 int sam_hdr_length(SAM_hdr *hdr);
|
|
271
|
|
272 /*! Returns the string form of the SAM_hdr.
|
|
273 *
|
|
274 * Call sam_hdr_rebuild() first if editing has taken place.
|
|
275 */
|
|
276 char *sam_hdr_str(SAM_hdr *hdr);
|
|
277
|
|
278 /*! Appends a formatted line to an existing SAM header.
|
|
279 *
|
|
280 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
|
|
281 * optional new-line. If it contains more than 1 line then multiple lines
|
|
282 * will be added in order.
|
|
283 *
|
|
284 * Len is the length of the text data, or 0 if unknown (in which case
|
|
285 * it should be null terminated).
|
|
286 *
|
|
287 * @return
|
|
288 * Returns 0 on success;
|
|
289 * -1 on failure
|
|
290 */
|
|
291 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len);
|
|
292
|
|
293 /*! Adds a single line to a SAM header.
|
|
294 *
|
|
295 * Specify type and one or more key,value pairs, ending with the NULL key.
|
|
296 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
|
|
297 *
|
|
298 * @return
|
|
299 * Returns 0 on success;
|
|
300 * -1 on failure
|
|
301 */
|
|
302 int sam_hdr_add(SAM_hdr *sh, const char *type, ...);
|
|
303
|
|
304 /*! Adds a single line to a SAM header.
|
|
305 *
|
|
306 * This is much like sam_hdr_add() but with the additional va_list
|
|
307 * argument. This is followed by specifying type and one or more
|
|
308 * key,value pairs, ending with the NULL key.
|
|
309 *
|
|
310 * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
|
|
311 *
|
|
312 * The purpose of the additional va_list parameter is to permit other
|
|
313 * varargs functions to call this while including their own additional
|
|
314 * parameters; an example is in sam_hdr_add_PG().
|
|
315 *
|
|
316 * @return
|
|
317 * Returns 0 on success;
|
|
318 * -1 on failure
|
|
319 */
|
|
320 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...);
|
|
321
|
|
322 /*!
|
|
323 * @return
|
|
324 * Returns the first header item matching 'type'. If ID is non-NULL it checks
|
|
325 * for the tag ID: and compares against the specified ID.
|
|
326 *
|
|
327 * Returns NULL if no type/ID is found
|
|
328 */
|
|
329 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
|
|
330 char *ID_key, char *ID_value);
|
|
331
|
|
332 /*!
|
|
333 *
|
|
334 * As per SAM_hdr_type, but returns a complete line of formatted text
|
|
335 * for a specific head type/ID combination. If ID is NULL then it returns
|
|
336 * the first line of the specified type.
|
|
337 *
|
|
338 * The returned string is malloced and should be freed by the calling
|
|
339 * function with free().
|
|
340 *
|
|
341 * @return
|
|
342 * Returns NULL if no type/ID is found.
|
|
343 */
|
|
344 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
|
|
345 char *ID_key, char *ID_value);
|
|
346
|
|
347 /*! Looks for a specific key in a single sam header line.
|
|
348 *
|
|
349 * If prev is non-NULL it also fills this out with the previous tag, to
|
|
350 * permit use in key removal. *prev is set to NULL when the tag is the first
|
|
351 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
|
|
352 * tag in the existing list.
|
|
353 *
|
|
354 * @return
|
|
355 * Returns the tag pointer on success;
|
|
356 * NULL on failure
|
|
357 */
|
|
358 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
|
|
359 SAM_hdr_type *type,
|
|
360 char *key,
|
|
361 SAM_hdr_tag **prev);
|
|
362
|
|
363 /*! Adds or updates tag key,value pairs in a header line.
|
|
364 *
|
|
365 * Eg for adding M5 tags to @SQ lines or updating sort order for the
|
|
366 * @HD line (although use the sam_hdr_sort_order() function for
|
|
367 * HD manipulation, which is a wrapper around this funuction).
|
|
368 *
|
|
369 * Specify multiple key,value pairs ending in NULL.
|
|
370 *
|
|
371 * @return
|
|
372 * Returns 0 on success;
|
|
373 * -1 on failure
|
|
374 */
|
|
375 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...);
|
|
376
|
|
377 /*! Reconstructs the kstring from the header hash table.
|
|
378 * @return
|
|
379 * Returns 0 on success;
|
|
380 * -1 on failure
|
|
381 */
|
|
382 int sam_hdr_rebuild(SAM_hdr *hdr);
|
|
383
|
|
384 /*! Looks up a reference sequence by name and returns the numerical ID.
|
|
385 * @return
|
|
386 * Returns -1 if unknown reference.
|
|
387 */
|
|
388 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref);
|
|
389
|
|
390 /*! Looks up a read-group by name and returns a pointer to the start of the
|
|
391 * associated tag list.
|
|
392 *
|
|
393 * @return
|
|
394 * Returns NULL on failure
|
|
395 */
|
|
396 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg);
|
|
397
|
|
398 /*! Fixes any PP links in @PG headers.
|
|
399 *
|
|
400 * If the entries are in order then this doesn't need doing, but incase
|
|
401 * our header is out of order this goes through the sh->pg[] array
|
|
402 * setting the prev_id field.
|
|
403 *
|
|
404 * @return
|
|
405 * Returns 0 on sucess;
|
|
406 * -1 on failure (indicating broken PG/PP records)
|
|
407 */
|
|
408 int sam_hdr_link_pg(SAM_hdr *hdr);
|
|
409
|
|
410
|
|
411 /*! Add an @PG line.
|
|
412 *
|
|
413 * If we wish complete control over this use sam_hdr_add() directly. This
|
|
414 * function uses that, but attempts to do a lot of tedious house work for
|
|
415 * you too.
|
|
416 *
|
|
417 * - It will generate a suitable ID if the supplied one clashes.
|
|
418 * - It will generate multiple @PG records if we have multiple PG chains.
|
|
419 *
|
|
420 * Call it as per sam_hdr_add() with a series of key,value pairs ending
|
|
421 * in NULL.
|
|
422 *
|
|
423 * @return
|
|
424 * Returns 0 on success;
|
|
425 * -1 on failure
|
|
426 */
|
|
427 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...);
|
|
428
|
|
/*! Joins a command line into a single string.
 *
 * A function to help with construction of CL tags in \@PG records.
 * Takes an argc, argv pair and returns a single space-separated string.
 * This string should be deallocated by the calling function.
 *
 * @return
 * Returns malloced char * on success;
 *         NULL on failure
 */
char *stringify_argv(int argc, char *argv[]);

440 #ifdef __cplusplus
|
|
441 }
|
|
442 #endif
|
|
443
|
|
444 #endif /* _SAM_HDR_H_ */
|