comparison ezBAMQC/src/htslib/cram/sam_header.h @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*! \file
32 * SAM header parsing.
33 *
34 * These functions can be shared between SAM, BAM and CRAM file
35 * formats as all three internally use the same string encoding for
36 * header fields.
37 */
38
39 /*
40 * TODO.
41 *
42 * - Sort order (parse to struct, enum type, updating funcs)
43 * - Removal of lines.
44 * - Updating of lines
45 */
46
47 #ifndef _SAM_HDR_H_
48 #define _SAM_HDR_H_
49
50 #ifdef __cplusplus
51 extern "C" {
52 #endif
53
54 #ifdef HAVE_CONFIG_H
55 #include "io_lib_config.h"
56 #endif
57
58 #include <stdarg.h>
59
60 #include "cram/string_alloc.h"
61 #include "cram/pooled_alloc.h"
62
63 #include "htslib/khash.h"
64 #include "htslib/kstring.h"
65
66 // For initialising a kstring_t at the point of declaration. Eg kstring_t s = KS_INITIALIZER;
67 #define KS_INITIALIZER {0,0,0}
68
69 // For initialisation elsewhere; ks is used as a pointer (sets l and m to 0, s to NULL). Eg KS_INIT(&hdr->text);
70 #define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL)
71
72 // Frees the string buffer (ks)->s only, never the kstring_t itself; ks is used as a pointer. Does not reset s, l or m afterwards.
73 #define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0)
74
75 /*
76 * Proposed new SAM header parsing
77
78 1 @SQ ID:foo LN:100
79 2 @SQ ID:bar LN:200
80 3 @SQ ID:ram LN:300 UR:xyz
81 4 @RG ID:r ...
82 5 @RG ID:s ...
83
84 Hash table for 2-char @keys without dup entries.
85 If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
86
87 HASH("SQ")--\
88 |
89 (3) <-> 1 <-> 2 <-> 3 <-> (1)
90
91 HASH("RG")--\
92 |
93 (5) <-> 4 <-> 5 <-> (4)
94
95 Items stored in the hash values also form their own linked lists:
96 Ie SQ->ID(foo)->LN(100)
97 SQ->ID(bar)->LN(200)
98 SQ->ID(ram)->LN(300)->UR(xyz)
99 RG->ID(r)
100 */
101
102 /*! A single key:value pair on a header line
103 *
104 * These form a linked list and hold strings. The strings are
105 * allocated from a string_alloc_t pool referenced in the master
106 * SAM_hdr structure. Do not attempt to free, malloc or manipulate
107 * these strings directly.
108 */
109 typedef struct SAM_hdr_tag_s {
110 struct SAM_hdr_tag_s *next; // next tag in the linked list
111 char *str; // pool-allocated string; never free or realloc directly
112 int len; // presumably the length of str -- TODO confirm
113 } SAM_hdr_tag;
114
115 /*! The parsed version of the SAM header string.
116 *
117 * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type
118 * struct via the main hash table h in the SAM_hdr struct.
119 *
120 * These in turn consist of circular bi-directional linked lists (ie
121 * rings) to hold the multiple instances of the same header type
122 * code. For example if we have 5 \@SQ lines the primary hash table
123 * will key on \@SQ pointing to the first SAM_hdr_type and that in turn
124 * will be part of a ring of 5 elements.
125 *
126 * For each SAM_hdr_type structure we also point to a SAM_hdr_tag
127 * structure which holds the tokenised attributes; the tab separated
128 * key:value pairs per line.
129 */
130 typedef struct SAM_hdr_item_s {
131 struct SAM_hdr_item_s *next; // circular: next item in the ring of same-type lines
132 struct SAM_hdr_item_s *prev; // previous item in the same ring
133 SAM_hdr_tag *tag; // first tag of this line's key:value list
134 int order; // 0 upwards
135 } SAM_hdr_type;
136
137 /*! One parsed \@SQ line; SAM_hdr.ref is an array of these */
138 typedef struct {
139 char *name; // reference name (presumably the SN: value -- TODO confirm)
140 uint32_t len; // presumably the LN: value -- TODO confirm
141 SAM_hdr_type *ty; // NOTE(review): presumably the header line this entry was parsed from -- confirm
142 SAM_hdr_tag *tag; // NOTE(review): presumably a tag within that line -- confirm which one
143 } SAM_SQ;
144
145 /*! One parsed \@RG line; SAM_hdr.rg is an array of these */
146 typedef struct {
147 char *name; // read-group name (presumably the ID: value -- TODO confirm)
148 SAM_hdr_type *ty; // NOTE(review): presumably the header line this entry was parsed from -- confirm
149 SAM_hdr_tag *tag; // NOTE(review): presumably a tag within that line -- confirm which one
150 int name_len; // presumably the length of name -- TODO confirm
151 int id; // numerical ID
152 } SAM_RG;
153
154 /*! One parsed \@PG line; SAM_hdr.pg is an array of these */
155 typedef struct {
156 char *name; // program name (presumably the ID: value -- TODO confirm)
157 SAM_hdr_type *ty; // NOTE(review): presumably the header line this entry was parsed from -- confirm
158 SAM_hdr_tag *tag; // NOTE(review): presumably a tag within that line -- confirm which one
159 int name_len; // presumably the length of name -- TODO confirm
160 int id; // numerical ID
161 int prev_id; // numerical ID of the previous entry in the PP chain (set by sam_hdr_link_pg()); -1 if none
162 } SAM_PG;
163
164 KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*) // khash map "sam_hdr": integer key -> SAM_hdr_type*
165 KHASH_MAP_INIT_STR(m_s2i, int) // khash map "m_s2i": string key -> int (used for name -> array index lookups)
166
167 /*! Primary structure for header manipulation
168 *
169 * The initial header text is held in the text kstring_t, but is also
170 * parsed out into SQ, RG and PG arrays. These have a hash table
171 * associated with each to allow lookup by ID or SN fields instead of
172 * their numeric array indices. Additionally PG has an array to hold
173 * the linked list start points (the last in a PP chain).
174 *
175 * Use the appropriate sam_hdr_* functions to edit the header, and
176 * call sam_hdr_rebuild() any time the textual form needs to be
177 * updated again.
178 */
179 typedef struct {
180 kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag
181 khash_t(sam_hdr) *h; //!< Main hash: header type -> its SAM_hdr_type ring
182 string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings
183 pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs
184 pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs
185
186 // @SQ lines / references
187 int nref; //!< Number of \@SQ lines
188 SAM_SQ *ref; //!< Array of parsed \@SQ lines
189 khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index
190
191 // @RG lines / read-groups
192 int nrg; //!< Number of \@RG lines
193 SAM_RG *rg; //!< Array of parsed \@RG lines
194 khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
195
196 // @PG lines / programs
197 int npg; //!< Number of \@PG lines
198 int npg_end; //!< Number of terminating \@PG lines
199 int npg_end_alloc; //!< Size of pg_end field
200 SAM_PG *pg; //!< Array of parsed \@PG lines
201 khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
202 int *pg_end; //!< \@PG chain termination IDs
203
204 // @cond internal
205 char ID_buf[1024]; // temporary buffer
206 int ID_cnt; // NOTE(review): presumably a counter used with ID_buf when generating IDs -- confirm
207 int ref_count; // number of uses of this SAM_hdr; see sam_hdr_incr_ref() / sam_hdr_decr_ref()
208 // @endcond
209 } SAM_hdr;
210
211 /*! Creates an empty SAM header, ready to be populated.
212 *
213 * @return
214 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
215 * NULL on failure
216 */
217 SAM_hdr *sam_hdr_new(void);
218
219 /*! Tokenises a SAM header into a hash table.
220 *
221 * Also extracts a few bits on specific data types, such as @RG lines.
222 *
223 * @return
224 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
225 * NULL on failure
226 */
227 SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
228
229
230 /*! Produces a duplicate copy of hdr and returns it.
231 * @return
232 * Returns NULL on failure
233 */
234 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr);
235
236
237 /*! Increments a reference count on hdr.
238 *
239 * This permits multiple files to share the same header, all calling
240 * sam_hdr_free when done, without causing errors for other open files.
241 */
242 void sam_hdr_incr_ref(SAM_hdr *hdr);
243
244
245 /*! Decrements a reference count on hdr.
246 *
247 * This permits multiple files to share the same header, all calling
248 * sam_hdr_free when done, without causing errors for other open files.
249 *
250 * If the reference count hits zero then the header is automatically
251 * freed. This makes it a synonym for sam_hdr_free().
252 */
253 void sam_hdr_decr_ref(SAM_hdr *hdr);
254
255
256 /*! Deallocates all storage used by a SAM_hdr struct.
257 *
258 * This also decrements the header reference count. If after decrementing
259 * it is still non-zero then the header is assumed to be in use by another
260 * caller and the free is not done.
261 *
262 * This is a synonym for sam_hdr_decr_ref().
263 */
264 void sam_hdr_free(SAM_hdr *hdr);
265
266 /*! Returns the current length of the SAM_hdr in text form.
267 *
268 * Call sam_hdr_rebuild() first if editing has taken place.
269 */
270 int sam_hdr_length(SAM_hdr *hdr);
271
272 /*! Returns the string form of the SAM_hdr.
273 *
274 * Call sam_hdr_rebuild() first if editing has taken place.
275 */
276 char *sam_hdr_str(SAM_hdr *hdr);
277
278 /*! Appends a formatted line to an existing SAM header.
279 *
280 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
281 * optional new-line. If it contains more than 1 line then multiple lines
282 * will be added in order.
283 *
284 * Len is the length of the text data, or 0 if unknown (in which case
285 * it should be null terminated).
286 *
287 * @return
288 * Returns 0 on success;
289 * -1 on failure
290 */
291 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len);
292
293 /*! Adds a single line to a SAM header.
294 *
295 * Specify type and one or more key,value pairs, ending with the NULL key.
296 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
297 *
298 * @return
299 * Returns 0 on success;
300 * -1 on failure
301 */
302 int sam_hdr_add(SAM_hdr *sh, const char *type, ...);
303
304 /*! Adds a single line to a SAM header.
305 *
306 * This is much like sam_hdr_add() but with the additional va_list
307 * argument. This is followed by specifying type and one or more
308 * key,value pairs, ending with the NULL key.
309 *
310 * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
311 *
312 * The purpose of the additional va_list parameter is to permit other
313 * varargs functions to call this while including their own additional
314 * parameters; an example is in sam_hdr_add_PG().
315 *
316 * @return
317 * Returns 0 on success;
318 * -1 on failure
319 */
320 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...);
321
322 /*!
323 * @return
324 * Returns the first header item matching 'type'. If ID is non-NULL it checks
325 * for the tag ID: and compares against the specified ID.
326 *
327 * Returns NULL if no type/ID is found
328 */
329 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
330 char *ID_key, char *ID_value);
331
332 /*!
333 *
334 * As per sam_hdr_find(), but returns a complete line of formatted text
335 * for a specific header type/ID combination. If ID is NULL then it returns
336 * the first line of the specified type.
337 *
338 * The returned string is malloced and should be freed by the calling
339 * function with free().
340 *
341 * @return
342 * Returns NULL if no type/ID is found.
343 */
344 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
345 char *ID_key, char *ID_value);
346
347 /*! Looks for a specific key in a single sam header line.
348 *
349 * If prev is non-NULL it also fills this out with the previous tag, to
350 * permit use in key removal. *prev is set to NULL when the tag is the first
351 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
352 * tag in the existing list.
353 *
354 * @return
355 * Returns the tag pointer on success;
356 * NULL on failure
357 */
358 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
359 SAM_hdr_type *type,
360 char *key,
361 SAM_hdr_tag **prev);
362
363 /*! Adds or updates tag key,value pairs in a header line.
364 *
365 * Eg for adding M5 tags to @SQ lines or updating sort order for the
366 * @HD line (although use the sam_hdr_sort_order() function for
367 * HD manipulation, which is a wrapper around this function).
368 *
369 * Specify multiple key,value pairs ending in NULL.
370 *
371 * @return
372 * Returns 0 on success;
373 * -1 on failure
374 */
375 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...);
376
377 /*! Reconstructs the kstring from the header hash table.
378 * @return
379 * Returns 0 on success;
380 * -1 on failure
381 */
382 int sam_hdr_rebuild(SAM_hdr *hdr);
383
384 /*! Looks up a reference sequence by name and returns the numerical ID.
385 * @return
386 * Returns -1 if unknown reference.
387 */
388 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref);
389
390 /*! Looks up a read-group by name and returns a pointer to the start of the
391 * associated tag list.
392 *
393 * @return
394 * Returns NULL on failure
395 */
396 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg);
397
398 /*! Fixes any PP links in @PG headers.
399 *
400 * If the entries are in order then this doesn't need doing, but in case
401 * our header is out of order this goes through the sh->pg[] array
402 * setting the prev_id field.
403 *
404 * @return
405 * Returns 0 on success;
406 * -1 on failure (indicating broken PG/PP records)
407 */
408 int sam_hdr_link_pg(SAM_hdr *hdr);
409
410
411 /*! Add an @PG line.
412 *
413 * If we wish complete control over this use sam_hdr_add() directly. This
414 * function uses that, but attempts to do a lot of tedious house work for
415 * you too.
416 *
417 * - It will generate a suitable ID if the supplied one clashes.
418 * - It will generate multiple @PG records if we have multiple PG chains.
419 *
420 * Call it as per sam_hdr_add() with a series of key,value pairs ending
421 * in NULL.
422 *
423 * @return
424 * Returns 0 on success;
425 * -1 on failure
426 */
427 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...);
428
429 /*!
430 * A function to help with construction of CL tags in @PG records.
431 * Takes an argc, argv pair and returns a single space-separated string.
432 * This string should be deallocated by the calling function.
433 *
434 * @return
435 * Returns malloced char * on success;
436 * NULL on failure
437 */
438 char *stringify_argv(int argc, char *argv[]);
439
440 #ifdef __cplusplus
441 }
442 #endif
443
444 #endif /* _SAM_HDR_H_ */