0
|
1 /*
|
|
2 Author: petr.danecek@sanger
|
|
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
|
|
4
|
|
5 Assumptions, approximations and other issues:
|
|
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
|
|
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
|
|
8 - coverage distribution ignores softclips and deletions
|
|
9 - some stats require sorted BAMs
|
|
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
|
|
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
|
|
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
|
|
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
|
|
14 good enough to include the read in the stats.
|
|
15
|
|
16 */
|
|
17
|
|
18 #define BAMCHECK_VERSION "2012-09-04"
|
|
19
|
|
20 #define _ISOC99_SOURCE
|
|
21 #include <stdio.h>
|
|
22 #include <stdlib.h>
|
|
23 #include <stdarg.h>
|
|
24 #include <string.h>
|
|
25 #include <math.h>
|
|
26 #include <ctype.h>
|
|
27 #include <getopt.h>
|
|
28 #include <errno.h>
|
|
29 #include <assert.h>
|
|
30 #include "faidx.h"
|
|
31 #include "khash.h"
|
|
32 #include "sam.h"
|
|
33 #include "sam_header.h"
|
|
34 #include "razf.h"
|
|
35
|
|
36 #define BWA_MIN_RDLEN 35
|
|
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
|
|
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
|
|
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
|
|
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
|
|
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
|
|
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
|
|
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
|
|
44
|
|
45 typedef struct
|
|
46 {
|
|
47 int32_t line_len, line_blen;
|
|
48 int64_t len;
|
|
49 uint64_t offset;
|
|
50 }
|
|
51 faidx1_t;
|
|
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
|
|
53 KHASH_MAP_INIT_STR(kh_bam_tid, int)
|
|
54 KHASH_MAP_INIT_STR(kh_rg, const char *)
|
|
55 struct __faidx_t {
|
|
56 RAZF *rz;
|
|
57 int n, m;
|
|
58 char **name;
|
|
59 khash_t(kh_faidx) *hash;
|
|
60 };
|
|
61
|
|
62 typedef struct
|
|
63 {
|
|
64 float gc;
|
|
65 uint32_t depth;
|
|
66 }
|
|
67 gc_depth_t;
|
|
68
|
|
69 // For coverage distribution, a simple pileup
|
|
70 typedef struct
|
|
71 {
|
|
72 int64_t pos;
|
|
73 int size, start;
|
|
74 int *buffer;
|
|
75 }
|
|
76 round_buffer_t;
|
|
77
|
|
78 typedef struct { uint32_t from, to; } pos_t;
|
|
79 typedef struct
|
|
80 {
|
|
81 int npos,mpos,cpos;
|
|
82 pos_t *pos;
|
|
83 }
|
|
84 regions_t;
|
|
85
|
|
86 typedef struct
|
|
87 {
|
|
88 // Parameters
|
|
89 int trim_qual; // bwa trim quality
|
|
90
|
|
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
|
|
92 // insert size histogram holder
|
|
93 int nquals; // The number of quality bins
|
|
94 int nbases; // The maximum sequence length the allocated array can hold
|
|
95 int nisize; // The maximum insert size that the allocated array can hold
|
|
96 int ngc; // The size of gc_1st and gc_2nd
|
|
97 int nindels; // The maximum indel length for indel distribution
|
|
98
|
|
99 // Arrays for the histogram data
|
|
100 uint64_t *quals_1st, *quals_2nd;
|
|
101 uint64_t *gc_1st, *gc_2nd;
|
|
102 uint64_t *isize_inward, *isize_outward, *isize_other;
|
|
103 uint64_t *acgt_cycles;
|
|
104 uint64_t *read_lengths;
|
|
105 uint64_t *insertions, *deletions;
|
|
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
|
|
107
|
|
108 // The extremes encountered
|
|
109 int max_len; // Maximum read length
|
|
110 int max_qual; // Maximum quality
|
|
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
|
|
112 int is_sorted;
|
|
113
|
|
114 // Summary numbers
|
|
115 uint64_t total_len;
|
|
116 uint64_t total_len_dup;
|
|
117 uint64_t nreads_1st;
|
|
118 uint64_t nreads_2nd;
|
|
119 uint64_t nreads_filtered;
|
|
120 uint64_t nreads_dup;
|
|
121 uint64_t nreads_unmapped;
|
|
122 uint64_t nreads_unpaired;
|
|
123 uint64_t nreads_paired;
|
|
124 uint64_t nreads_anomalous;
|
|
125 uint64_t nreads_mq0;
|
|
126 uint64_t nbases_mapped;
|
|
127 uint64_t nbases_mapped_cigar;
|
|
128 uint64_t nbases_trimmed; // bwa trimmed bases
|
|
129 uint64_t nmismatches;
|
|
130 uint64_t nreads_QCfailed, nreads_secondary;
|
|
131
|
|
132 // GC-depth related data
|
|
133 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
|
|
134 gc_depth_t *gcd; // The GC-depth bins holder
|
|
135 int gcd_bin_size; // The size of GC-depth bin
|
|
136 uint32_t gcd_ref_size; // The approximate size of the genome
|
|
137 int32_t tid, gcd_pos; // Position of the current bin
|
|
138 int32_t pos; // Position of the last read
|
|
139
|
|
140 // Coverage distribution related data
|
|
141 int ncov; // The number of coverage bins
|
|
142 uint64_t *cov; // The coverage frequencies
|
|
143 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
|
|
144 round_buffer_t cov_rbuf; // Pileup round buffer
|
|
145
|
|
146 // Mismatches by read cycle
|
|
147 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
|
|
148 int mrseq_buf; // The size of the buffer
|
|
149 int32_t rseq_pos; // The coordinate of the first base in the buffer
|
|
150 int32_t nrseq_buf; // The used part of the buffer
|
|
151 uint64_t *mpc_buf; // Mismatches per cycle
|
|
152
|
|
153 // Filters
|
|
154 int filter_readlen;
|
|
155
|
|
156 // Target regions
|
|
157 int nregions, reg_from,reg_to;
|
|
158 regions_t *regions;
|
|
159
|
|
160 // Auxiliary data
|
|
161 int flag_require, flag_filter;
|
|
162 double sum_qual; // For calculating average quality value
|
|
163 samfile_t *sam;
|
|
164 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
|
|
165 faidx_t *fai; // Reference sequence for GC-depth graph
|
|
166 int argc; // Command line arguments to be printed on the output
|
|
167 char **argv;
|
|
168 }
|
|
169 stats_t;
|
|
170
|
|
171 void error(const char *format, ...);
|
|
172 void bam_init_header_hash(bam_header_t *header);
|
|
173 int is_in_regions(bam1_t *bam_line, stats_t *stats);
|
|
174
|
|
175
|
|
176 // Coverage distribution methods
|
|
177 inline int coverage_idx(int min, int max, int n, int step, int depth)
|
|
178 {
|
|
179 if ( depth < min )
|
|
180 return 0;
|
|
181
|
|
182 if ( depth > max )
|
|
183 return n-1;
|
|
184
|
|
185 return 1 + (depth - min) / step;
|
|
186 }
|
|
187
|
|
188 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
|
|
189 {
|
|
190 return (offset + (pos-refpos) % size) % size;
|
|
191 }
|
|
192
|
|
193 void round_buffer_flush(stats_t *stats, int64_t pos)
|
|
194 {
|
|
195 int ibuf,idp;
|
|
196
|
|
197 if ( pos==stats->cov_rbuf.pos )
|
|
198 return;
|
|
199
|
|
200 int64_t new_pos = pos;
|
|
201 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
|
|
202 {
|
|
203 // Flush the whole buffer, but in sequential order,
|
|
204 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
|
|
205 }
|
|
206
|
|
207 if ( pos < stats->cov_rbuf.pos )
|
|
208 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
|
|
209
|
|
210 int ifrom = stats->cov_rbuf.start;
|
|
211 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
|
|
212 if ( ifrom>ito )
|
|
213 {
|
|
214 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
|
|
215 {
|
|
216 if ( !stats->cov_rbuf.buffer[ibuf] )
|
|
217 continue;
|
|
218 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
|
|
219 stats->cov[idp]++;
|
|
220 stats->cov_rbuf.buffer[ibuf] = 0;
|
|
221 }
|
|
222 ifrom = 0;
|
|
223 }
|
|
224 for (ibuf=ifrom; ibuf<=ito; ibuf++)
|
|
225 {
|
|
226 if ( !stats->cov_rbuf.buffer[ibuf] )
|
|
227 continue;
|
|
228 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
|
|
229 stats->cov[idp]++;
|
|
230 stats->cov_rbuf.buffer[ibuf] = 0;
|
|
231 }
|
|
232 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
|
|
233 stats->cov_rbuf.pos = new_pos;
|
|
234 }
|
|
235
|
|
236 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
|
|
237 {
|
|
238 if ( to-from >= rbuf->size )
|
|
239 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
|
|
240 if ( from < rbuf->pos )
|
|
241 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
|
|
242
|
|
243 int ifrom,ito,ibuf;
|
|
244 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
|
|
245 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
|
|
246 if ( ifrom>ito )
|
|
247 {
|
|
248 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
|
|
249 rbuf->buffer[ibuf]++;
|
|
250 ifrom = 0;
|
|
251 }
|
|
252 for (ibuf=ifrom; ibuf<=ito; ibuf++)
|
|
253 rbuf->buffer[ibuf]++;
|
|
254 }
|
|
255
|
|
256 // Calculate the number of bases in the read trimmed by BWA
|
|
257 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
|
|
258 {
|
|
259 if ( len<BWA_MIN_RDLEN ) return 0;
|
|
260
|
|
261 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
|
|
262 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
|
|
263 int max_trimmed = len - BWA_MIN_RDLEN + 1;
|
|
264 int l, sum=0, max_sum=0, max_l=0;
|
|
265
|
|
266 for (l=0; l<max_trimmed; l++)
|
|
267 {
|
|
268 sum += trim_qual - quals[ reverse ? l : len-1-l ];
|
|
269 if ( sum<0 ) break;
|
|
270 if ( sum>max_sum )
|
|
271 {
|
|
272 max_sum = sum;
|
|
273 // This is the correct way, but bwa clips from some reason one base less
|
|
274 // max_l = l+1;
|
|
275 max_l = l;
|
|
276 }
|
|
277 }
|
|
278 return max_l;
|
|
279 }
|
|
280
|
|
281
|
|
282 void count_indels(stats_t *stats,bam1_t *bam_line)
|
|
283 {
|
|
284 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
|
|
285 int is_1st = IS_READ1(bam_line) ? 1 : 0;
|
|
286 int icig;
|
|
287 int icycle = 0;
|
|
288 int read_len = bam_line->core.l_qseq;
|
|
289 for (icig=0; icig<bam_line->core.n_cigar; icig++)
|
|
290 {
|
|
291 // Conversion from uint32_t to MIDNSHP
|
|
292 // 0123456
|
|
293 // MIDNSHP
|
|
294 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
|
|
295 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
|
|
296
|
|
297 if ( cig==1 )
|
|
298 {
|
|
299 int idx = is_fwd ? icycle : read_len-icycle-ncig;
|
|
300 if ( idx<0 )
|
|
301 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
|
|
302 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
|
|
303 if ( is_1st )
|
|
304 stats->ins_cycles_1st[idx]++;
|
|
305 else
|
|
306 stats->ins_cycles_2nd[idx]++;
|
|
307 icycle += ncig;
|
|
308 if ( ncig<=stats->nindels )
|
|
309 stats->insertions[ncig-1]++;
|
|
310 continue;
|
|
311 }
|
|
312 if ( cig==2 )
|
|
313 {
|
|
314 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
|
|
315 if ( idx<0 ) continue; // discard meaningless deletions
|
|
316 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
|
|
317 if ( is_1st )
|
|
318 stats->del_cycles_1st[idx]++;
|
|
319 else
|
|
320 stats->del_cycles_2nd[idx]++;
|
|
321 if ( ncig<=stats->nindels )
|
|
322 stats->deletions[ncig-1]++;
|
|
323 continue;
|
|
324 }
|
|
325 if ( cig!=3 && cig!=5 )
|
|
326 icycle += ncig;
|
|
327 }
|
|
328 }
|
|
329
|
|
330 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
|
|
331 {
|
|
332 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
|
|
333 int icig,iread=0,icycle=0;
|
|
334 int iref = bam_line->core.pos - stats->rseq_pos;
|
|
335 int read_len = bam_line->core.l_qseq;
|
|
336 uint8_t *read = bam1_seq(bam_line);
|
|
337 uint8_t *quals = bam1_qual(bam_line);
|
|
338 uint64_t *mpc_buf = stats->mpc_buf;
|
|
339 for (icig=0; icig<bam_line->core.n_cigar; icig++)
|
|
340 {
|
|
341 // Conversion from uint32_t to MIDNSHP
|
|
342 // 0123456
|
|
343 // MIDNSHP
|
|
344 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
|
|
345 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
|
|
346 if ( cig==1 )
|
|
347 {
|
|
348 iread += ncig;
|
|
349 icycle += ncig;
|
|
350 continue;
|
|
351 }
|
|
352 if ( cig==2 )
|
|
353 {
|
|
354 iref += ncig;
|
|
355 continue;
|
|
356 }
|
|
357 if ( cig==4 )
|
|
358 {
|
|
359 icycle += ncig;
|
|
360 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
|
|
361 // iref += ncig;
|
|
362 iread += ncig;
|
|
363 continue;
|
|
364 }
|
|
365 if ( cig==5 )
|
|
366 {
|
|
367 icycle += ncig;
|
|
368 continue;
|
|
369 }
|
|
370 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
|
|
371 // chunk of refseq in memory. Not very frequent and not noticable in the stats.
|
|
372 if ( cig==3 || cig==5 ) continue;
|
|
373 if ( cig!=0 )
|
|
374 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
|
|
375
|
|
376 if ( ncig+iref > stats->nrseq_buf )
|
|
377 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
|
|
378
|
|
379 int im;
|
|
380 for (im=0; im<ncig; im++)
|
|
381 {
|
|
382 uint8_t cread = bam1_seqi(read,iread);
|
|
383 uint8_t cref = stats->rseq_buf[iref];
|
|
384
|
|
385 // ---------------15
|
|
386 // =ACMGRSVTWYHKDBN
|
|
387 if ( cread==15 )
|
|
388 {
|
|
389 int idx = is_fwd ? icycle : read_len-icycle-1;
|
|
390 if ( idx>stats->max_len )
|
|
391 error("mpc: %d>%d\n",idx,stats->max_len);
|
|
392 idx = idx*stats->nquals;
|
|
393 if ( idx>=stats->nquals*stats->nbases )
|
|
394 error("FIXME: mpc_buf overflow\n");
|
|
395 mpc_buf[idx]++;
|
|
396 }
|
|
397 else if ( cref && cread && cref!=cread )
|
|
398 {
|
|
399 uint8_t qual = quals[iread] + 1;
|
|
400 if ( qual>=stats->nquals )
|
|
401 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
|
|
402
|
|
403 int idx = is_fwd ? icycle : read_len-icycle-1;
|
|
404 if ( idx>stats->max_len )
|
|
405 error("mpc: %d>%d\n",idx,stats->max_len);
|
|
406
|
|
407 idx = idx*stats->nquals + qual;
|
|
408 if ( idx>=stats->nquals*stats->nbases )
|
|
409 error("FIXME: mpc_buf overflow\n");
|
|
410 mpc_buf[idx]++;
|
|
411 }
|
|
412
|
|
413 iref++;
|
|
414 iread++;
|
|
415 icycle++;
|
|
416 }
|
|
417 }
|
|
418 }
|
|
419
|
|
420 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
|
|
421 {
|
|
422 khash_t(kh_faidx) *h;
|
|
423 khiter_t iter;
|
|
424 faidx1_t val;
|
|
425 char *chr, c;
|
|
426 faidx_t *fai = stats->fai;
|
|
427
|
|
428 h = fai->hash;
|
|
429 chr = stats->sam->header->target_name[tid];
|
|
430
|
|
431 // ID of the sequence name
|
|
432 iter = kh_get(kh_faidx, h, chr);
|
|
433 if (iter == kh_end(h))
|
|
434 error("No such reference sequence [%s]?\n", chr);
|
|
435 val = kh_value(h, iter);
|
|
436
|
|
437 // Check the boundaries
|
|
438 if (pos >= val.len)
|
|
439 error("Was the bam file mapped with the reference sequence supplied?"
|
|
440 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
|
|
441 int size = stats->mrseq_buf;
|
|
442 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
|
|
443 if (size+pos > val.len) size = val.len-pos;
|
|
444
|
|
445 // Position the razf reader
|
|
446 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
|
|
447
|
|
448 uint8_t *ptr = stats->rseq_buf;
|
|
449 int nread = 0;
|
|
450 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
|
|
451 {
|
|
452 if ( !isgraph(c) )
|
|
453 continue;
|
|
454
|
|
455 // Conversion between uint8_t coding and ACGT
|
|
456 // -12-4---8-------
|
|
457 // =ACMGRSVTWYHKDBN
|
|
458 if ( c=='A' || c=='a' )
|
|
459 *ptr = 1;
|
|
460 else if ( c=='C' || c=='c' )
|
|
461 *ptr = 2;
|
|
462 else if ( c=='G' || c=='g' )
|
|
463 *ptr = 4;
|
|
464 else if ( c=='T' || c=='t' )
|
|
465 *ptr = 8;
|
|
466 else
|
|
467 *ptr = 0;
|
|
468 ptr++;
|
|
469 nread++;
|
|
470 }
|
|
471 if ( nread < stats->mrseq_buf )
|
|
472 {
|
|
473 memset(ptr,0, stats->mrseq_buf - nread);
|
|
474 nread = stats->mrseq_buf;
|
|
475 }
|
|
476 stats->nrseq_buf = nread;
|
|
477 stats->rseq_pos = pos;
|
|
478 stats->tid = tid;
|
|
479 }
|
|
480
|
|
481 float fai_gc_content(stats_t *stats, int pos, int len)
|
|
482 {
|
|
483 uint32_t gc,count,c;
|
|
484 int i = pos - stats->rseq_pos, ito = i + len;
|
|
485 assert( i>=0 && ito<=stats->nrseq_buf );
|
|
486
|
|
487 // Count GC content
|
|
488 gc = count = 0;
|
|
489 for (; i<ito; i++)
|
|
490 {
|
|
491 c = stats->rseq_buf[i];
|
|
492 if ( c==2 || c==4 )
|
|
493 {
|
|
494 gc++;
|
|
495 count++;
|
|
496 }
|
|
497 else if ( c==1 || c==8 )
|
|
498 count++;
|
|
499 }
|
|
500 return count ? (float)gc/count : 0;
|
|
501 }
|
|
502
|
|
503 void realloc_rseq_buffer(stats_t *stats)
|
|
504 {
|
|
505 int n = stats->nbases*10;
|
|
506 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
|
|
507 if ( stats->mrseq_buf<n )
|
|
508 {
|
|
509 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
|
|
510 stats->mrseq_buf = n;
|
|
511 }
|
|
512 }
|
|
513
|
|
514 void realloc_gcd_buffer(stats_t *stats, int seq_len)
|
|
515 {
|
|
516 if ( seq_len >= stats->gcd_bin_size )
|
|
517 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
|
|
518
|
|
519 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
|
|
520 if ( n <= stats->igcd )
|
|
521 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
|
|
522
|
|
523 if ( n > stats->ngcd )
|
|
524 {
|
|
525 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
|
|
526 if ( !stats->gcd )
|
|
527 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
|
|
528 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
|
|
529 stats->ngcd = n;
|
|
530 }
|
|
531
|
|
532 realloc_rseq_buffer(stats);
|
|
533 }
|
|
534
|
|
535 void realloc_buffers(stats_t *stats, int seq_len)
|
|
536 {
|
|
537 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
|
|
538
|
|
539 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
|
|
540 if ( !stats->quals_1st )
|
|
541 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
|
|
542 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
|
|
543
|
|
544 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
|
|
545 if ( !stats->quals_2nd )
|
|
546 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
|
|
547 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
|
|
548
|
|
549 if ( stats->mpc_buf )
|
|
550 {
|
|
551 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
|
|
552 if ( !stats->mpc_buf )
|
|
553 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
|
|
554 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
|
|
555 }
|
|
556
|
|
557 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
|
|
558 if ( !stats->acgt_cycles )
|
|
559 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
|
|
560 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
|
|
561
|
|
562 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
|
|
563 if ( !stats->read_lengths )
|
|
564 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
|
|
565 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
566
|
|
567 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
|
|
568 if ( !stats->insertions )
|
|
569 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
|
|
570 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
571
|
|
572 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
|
|
573 if ( !stats->deletions )
|
|
574 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
|
|
575 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
576
|
|
577 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
|
|
578 if ( !stats->ins_cycles_1st )
|
|
579 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
|
|
580 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
581
|
|
582 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
|
|
583 if ( !stats->ins_cycles_2nd )
|
|
584 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
|
|
585 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
586
|
|
587 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
|
|
588 if ( !stats->del_cycles_1st )
|
|
589 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
|
|
590 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
591
|
|
592 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
|
|
593 if ( !stats->del_cycles_2nd )
|
|
594 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
|
|
595 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
|
|
596
|
|
597 stats->nbases = n;
|
|
598
|
|
599 // Realloc the coverage distribution buffer
|
|
600 int *rbuffer = calloc(sizeof(int),seq_len*5);
|
|
601 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
|
|
602 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
|
|
603 if ( stats->cov_rbuf.start>1 )
|
|
604 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
|
|
605 stats->cov_rbuf.start = 0;
|
|
606 free(stats->cov_rbuf.buffer);
|
|
607 stats->cov_rbuf.buffer = rbuffer;
|
|
608 stats->cov_rbuf.size = seq_len*5;
|
|
609
|
|
610 realloc_rseq_buffer(stats);
|
|
611 }
|
|
612
|
|
613 void collect_stats(bam1_t *bam_line, stats_t *stats)
|
|
614 {
|
|
615 if ( stats->rg_hash )
|
|
616 {
|
|
617 const uint8_t *rg = bam_aux_get(bam_line, "RG");
|
|
618 if ( !rg ) return;
|
|
619 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
|
|
620 if ( k == kh_end(stats->rg_hash) ) return;
|
|
621 }
|
|
622 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
|
|
623 {
|
|
624 stats->nreads_filtered++;
|
|
625 return;
|
|
626 }
|
|
627 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
|
|
628 {
|
|
629 stats->nreads_filtered++;
|
|
630 return;
|
|
631 }
|
|
632 if ( !is_in_regions(bam_line,stats) )
|
|
633 return;
|
|
634 if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
|
|
635 return;
|
|
636
|
|
637 if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
|
|
638 if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
|
|
639
|
|
640 int seq_len = bam_line->core.l_qseq;
|
|
641 if ( !seq_len ) return;
|
|
642
|
|
643 if ( seq_len >= stats->nbases )
|
|
644 realloc_buffers(stats,seq_len);
|
|
645 if ( stats->max_len<seq_len )
|
|
646 stats->max_len = seq_len;
|
|
647
|
|
648 stats->read_lengths[seq_len]++;
|
|
649
|
|
650 // Count GC and ACGT per cycle
|
|
651 uint8_t base, *seq = bam1_seq(bam_line);
|
|
652 int gc_count = 0;
|
|
653 int i;
|
|
654 int reverse = IS_REVERSE(bam_line);
|
|
655 for (i=0; i<seq_len; i++)
|
|
656 {
|
|
657 // Conversion from uint8_t coding to ACGT
|
|
658 // -12-4---8-------
|
|
659 // =ACMGRSVTWYHKDBN
|
|
660 // 01 2 3
|
|
661 base = bam1_seqi(seq,i);
|
|
662 base /= 2;
|
|
663 if ( base==1 || base==2 ) gc_count++;
|
|
664 else if ( base>2 ) base=3;
|
|
665 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
|
|
666 error("FIXME: acgt_cycles\n");
|
|
667 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
|
|
668 }
|
|
669 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
|
|
670 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
|
|
671 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
|
|
672
|
|
673 // Determine which array (1st or 2nd read) will these stats go to,
|
|
674 // trim low quality bases from end the same way BWA does,
|
|
675 // fill GC histogram
|
|
676 uint64_t *quals;
|
|
677 uint8_t *bam_quals = bam1_qual(bam_line);
|
|
678 if ( bam_line->core.flag&BAM_FREAD2 )
|
|
679 {
|
|
680 quals = stats->quals_2nd;
|
|
681 stats->nreads_2nd++;
|
|
682 for (i=gc_idx_min; i<gc_idx_max; i++)
|
|
683 stats->gc_2nd[i]++;
|
|
684 }
|
|
685 else
|
|
686 {
|
|
687 quals = stats->quals_1st;
|
|
688 stats->nreads_1st++;
|
|
689 for (i=gc_idx_min; i<gc_idx_max; i++)
|
|
690 stats->gc_1st[i]++;
|
|
691 }
|
|
692 if ( stats->trim_qual>0 )
|
|
693 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
|
|
694
|
|
695 // Quality histogram and average quality
|
|
696 for (i=0; i<seq_len; i++)
|
|
697 {
|
|
698 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
|
|
699 if ( qual>=stats->nquals )
|
|
700 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
|
|
701 if ( qual>stats->max_qual )
|
|
702 stats->max_qual = qual;
|
|
703
|
|
704 quals[ i*stats->nquals+qual ]++;
|
|
705 stats->sum_qual += qual;
|
|
706 }
|
|
707
|
|
708 // Look at the flags and increment appropriate counters (mapped, paired, etc)
|
|
709 if ( IS_UNMAPPED(bam_line) )
|
|
710 stats->nreads_unmapped++;
|
|
711 else
|
|
712 {
|
|
713 if ( !bam_line->core.qual )
|
|
714 stats->nreads_mq0++;
|
|
715
|
|
716 count_indels(stats,bam_line);
|
|
717
|
|
718 if ( !IS_PAIRED(bam_line) )
|
|
719 stats->nreads_unpaired++;
|
|
720 else
|
|
721 {
|
|
722 stats->nreads_paired++;
|
|
723
|
|
724 if ( bam_line->core.tid!=bam_line->core.mtid )
|
|
725 stats->nreads_anomalous++;
|
|
726
|
|
727 // The insert size is tricky, because for long inserts the libraries are
|
|
728 // prepared differently and the pairs point in other direction. BWA does
|
|
729 // not set the paired flag for them. Similar thing is true also for 454
|
|
730 // reads. Mates mapped to different chromosomes have isize==0.
|
|
731 int32_t isize = bam_line->core.isize;
|
|
732 if ( isize<0 ) isize = -isize;
|
|
733 if ( isize >= stats->nisize )
|
|
734 isize = stats->nisize-1;
|
|
735 if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
|
|
736 {
|
|
737 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
|
|
738 int is_fst = IS_READ1(bam_line) ? 1 : -1;
|
|
739 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
|
|
740 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
|
|
741
|
|
742 if ( is_fwd*is_mfwd>0 )
|
|
743 stats->isize_other[isize]++;
|
|
744 else if ( is_fst*pos_fst>0 )
|
|
745 {
|
|
746 if ( is_fst*is_fwd>0 )
|
|
747 stats->isize_inward[isize]++;
|
|
748 else
|
|
749 stats->isize_outward[isize]++;
|
|
750 }
|
|
751 else if ( is_fst*pos_fst<0 )
|
|
752 {
|
|
753 if ( is_fst*is_fwd>0 )
|
|
754 stats->isize_outward[isize]++;
|
|
755 else
|
|
756 stats->isize_inward[isize]++;
|
|
757 }
|
|
758 }
|
|
759 }
|
|
760
|
|
761 // Number of mismatches
|
|
762 uint8_t *nm = bam_aux_get(bam_line,"NM");
|
|
763 if (nm)
|
|
764 stats->nmismatches += bam_aux2i(nm);
|
|
765
|
|
766 // Number of mapped bases from cigar
|
|
767 // Conversion from uint32_t to MIDNSHP
|
|
768 // 012-4--
|
|
769 // MIDNSHP
|
|
770 if ( bam_line->core.n_cigar == 0)
|
|
771 error("FIXME: mapped read with no cigar?\n");
|
|
772 int readlen=seq_len;
|
|
773 if ( stats->regions )
|
|
774 {
|
|
775 // Count only on-target bases
|
|
776 int iref = bam_line->core.pos + 1;
|
|
777 for (i=0; i<bam_line->core.n_cigar; i++)
|
|
778 {
|
|
779 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
|
|
780 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
|
|
781 if ( cig==2 ) readlen += ncig;
|
|
782 else if ( cig==0 )
|
|
783 {
|
|
784 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
|
|
785 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
|
|
786 if ( ncig<0 ) ncig = 0;
|
|
787 stats->nbases_mapped_cigar += ncig;
|
|
788 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
|
|
789 }
|
|
790 else if ( cig==1 )
|
|
791 {
|
|
792 iref += ncig;
|
|
793 if ( iref>=stats->reg_from && iref<=stats->reg_to )
|
|
794 stats->nbases_mapped_cigar += ncig;
|
|
795 }
|
|
796 }
|
|
797 }
|
|
798 else
|
|
799 {
|
|
800 // Count the whole read
|
|
801 for (i=0; i<bam_line->core.n_cigar; i++)
|
|
802 {
|
|
803 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
|
|
804 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
|
|
805 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
|
|
806 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
|
|
807 }
|
|
808 }
|
|
809 stats->nbases_mapped += seq_len;
|
|
810
|
|
811 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
|
|
812 stats->is_sorted = 0;
|
|
813 stats->pos = bam_line->core.pos;
|
|
814
|
|
815 if ( stats->is_sorted )
|
|
816 {
|
|
817 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
|
|
818 round_buffer_flush(stats,-1);
|
|
819
|
|
820 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
|
|
821 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
|
|
822 // 20kbp, so the effect is negligible.
|
|
823 if ( stats->fai )
|
|
824 {
|
|
825 int inc_ref = 0, inc_gcd = 0;
|
|
826 // First pass or new chromosome
|
|
827 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
|
|
828 // Read goes beyond the end of the rseq buffer
|
|
829 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
|
|
830 // Read overlaps the next gcd bin
|
|
831 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
|
|
832 {
|
|
833 inc_gcd = 1;
|
|
834 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
|
|
835 }
|
|
836 if ( inc_gcd )
|
|
837 {
|
|
838 stats->igcd++;
|
|
839 if ( stats->igcd >= stats->ngcd )
|
|
840 realloc_gcd_buffer(stats, readlen);
|
|
841 if ( inc_ref )
|
|
842 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
|
|
843 stats->gcd_pos = bam_line->core.pos;
|
|
844 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
|
|
845 }
|
|
846
|
|
847 count_mismatches_per_cycle(stats,bam_line);
|
|
848 }
|
|
849 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
|
|
850 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
|
|
851 {
|
|
852 // First pass or a new chromosome
|
|
853 stats->tid = bam_line->core.tid;
|
|
854 stats->gcd_pos = bam_line->core.pos;
|
|
855 stats->igcd++;
|
|
856 if ( stats->igcd >= stats->ngcd )
|
|
857 realloc_gcd_buffer(stats, readlen);
|
|
858 }
|
|
859 stats->gcd[ stats->igcd ].depth++;
|
|
860 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
|
|
861 if ( !stats->fai )
|
|
862 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
|
|
863
|
|
864 // Coverage distribution graph
|
|
865 round_buffer_flush(stats,bam_line->core.pos);
|
|
866 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
|
|
867 }
|
|
868 }
|
|
869
|
|
870 stats->total_len += seq_len;
|
|
871 if ( IS_DUP(bam_line) )
|
|
872 {
|
|
873 stats->total_len_dup += seq_len;
|
|
874 stats->nreads_dup++;
|
|
875 }
|
|
876 }
|
|
877
|
|
878 // Sort by GC and depth
|
|
879 #define GCD_t(x) ((gc_depth_t *)x)
|
|
880 static int gcd_cmp(const void *a, const void *b)
|
|
881 {
|
|
882 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
|
|
883 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
|
|
884 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
|
|
885 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
|
|
886 return 0;
|
|
887 }
|
|
888 #undef GCD_t
|
|
889
|
|
890 float gcd_percentile(gc_depth_t *gcd, int N, int p)
|
|
891 {
|
|
892 float n,d;
|
|
893 int k;
|
|
894
|
|
895 n = p*(N+1)/100;
|
|
896 k = n;
|
|
897 if ( k<=0 )
|
|
898 return gcd[0].depth;
|
|
899 if ( k>=N )
|
|
900 return gcd[N-1].depth;
|
|
901
|
|
902 d = n - k;
|
|
903 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
|
|
904 }
|
|
905
|
|
906 void output_stats(stats_t *stats)
|
|
907 {
|
|
908 // Calculate average insert size and standard deviation (from the main bulk data only)
|
|
909 int isize, ibulk=0;
|
|
910 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
|
|
911 for (isize=0; isize<stats->nisize; isize++)
|
|
912 {
|
|
913 // Each pair was counted twice
|
|
914 stats->isize_inward[isize] *= 0.5;
|
|
915 stats->isize_outward[isize] *= 0.5;
|
|
916 stats->isize_other[isize] *= 0.5;
|
|
917
|
|
918 nisize_inward += stats->isize_inward[isize];
|
|
919 nisize_outward += stats->isize_outward[isize];
|
|
920 nisize_other += stats->isize_other[isize];
|
|
921 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
|
|
922 }
|
|
923
|
|
924 double bulk=0, avg_isize=0, sd_isize=0;
|
|
925 for (isize=0; isize<stats->nisize; isize++)
|
|
926 {
|
|
927 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
|
|
928 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
|
|
929
|
|
930 if ( bulk/nisize > stats->isize_main_bulk )
|
|
931 {
|
|
932 ibulk = isize+1;
|
|
933 nisize = bulk;
|
|
934 break;
|
|
935 }
|
|
936 }
|
|
937 avg_isize /= nisize ? nisize : 1;
|
|
938 for (isize=1; isize<ibulk; isize++)
|
|
939 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
|
|
940 sd_isize = sqrt(sd_isize);
|
|
941
|
|
942
|
|
943 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
|
|
944 printf("# The command line was: %s",stats->argv[0]);
|
|
945 int i;
|
|
946 for (i=1; i<stats->argc; i++)
|
|
947 printf(" %s",stats->argv[i]);
|
|
948 printf("\n");
|
|
949 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
|
|
950 printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
|
|
951 printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
|
|
952 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
|
|
953 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
|
|
954 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
|
|
955 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
|
|
956 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
|
|
957 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
|
|
958 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
|
|
959 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
|
|
960 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
|
|
961 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
|
|
962 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
|
|
963 printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
|
|
964 printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
|
|
965 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
|
|
966 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
|
|
967 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
|
|
968 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
|
|
969 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
|
|
970 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
|
|
971 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
|
|
972 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
|
|
973 printf("SN\taverage length:\t%.0f\n", avg_read_length);
|
|
974 printf("SN\tmaximum length:\t%d\n", stats->max_len);
|
|
975 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
|
|
976 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
|
|
977 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
|
|
978 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
|
|
979 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
|
|
980 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
|
|
981 printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
|
|
982
|
|
983 int ibase,iqual;
|
|
984 if ( stats->max_len<stats->nbases ) stats->max_len++;
|
|
985 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
|
|
986 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
|
|
987 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
|
|
988 for (ibase=0; ibase<stats->max_len; ibase++)
|
|
989 {
|
|
990 printf("FFQ\t%d",ibase+1);
|
|
991 for (iqual=0; iqual<=stats->max_qual; iqual++)
|
|
992 {
|
|
993 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
|
|
994 }
|
|
995 printf("\n");
|
|
996 }
|
|
997 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
|
|
998 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
|
|
999 for (ibase=0; ibase<stats->max_len; ibase++)
|
|
1000 {
|
|
1001 printf("LFQ\t%d",ibase+1);
|
|
1002 for (iqual=0; iqual<=stats->max_qual; iqual++)
|
|
1003 {
|
|
1004 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
|
|
1005 }
|
|
1006 printf("\n");
|
|
1007 }
|
|
1008 if ( stats->mpc_buf )
|
|
1009 {
|
|
1010 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
|
|
1011 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
|
|
1012 printf("# is the number of N's and the rest is the number of mismatches\n");
|
|
1013 for (ibase=0; ibase<stats->max_len; ibase++)
|
|
1014 {
|
|
1015 printf("MPC\t%d",ibase+1);
|
|
1016 for (iqual=0; iqual<=stats->max_qual; iqual++)
|
|
1017 {
|
|
1018 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
|
|
1019 }
|
|
1020 printf("\n");
|
|
1021 }
|
|
1022 }
|
|
1023 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
|
|
1024 int ibase_prev = 0;
|
|
1025 for (ibase=0; ibase<stats->ngc; ibase++)
|
|
1026 {
|
|
1027 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
|
|
1028 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
|
|
1029 ibase_prev = ibase;
|
|
1030 }
|
|
1031 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
|
|
1032 ibase_prev = 0;
|
|
1033 for (ibase=0; ibase<stats->ngc; ibase++)
|
|
1034 {
|
|
1035 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
|
|
1036 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
|
|
1037 ibase_prev = ibase;
|
|
1038 }
|
|
1039 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
|
|
1040 for (ibase=0; ibase<stats->max_len; ibase++)
|
|
1041 {
|
|
1042 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
|
|
1043 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
|
|
1044 if ( ! sum ) continue;
|
|
1045 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
|
|
1046 }
|
|
1047 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
|
|
1048 for (isize=0; isize<ibulk; isize++)
|
|
1049 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
|
|
1050 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
|
|
1051
|
|
1052 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
|
|
1053 int ilen;
|
|
1054 for (ilen=0; ilen<stats->max_len; ilen++)
|
|
1055 {
|
|
1056 if ( stats->read_lengths[ilen]>0 )
|
|
1057 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
|
|
1058 }
|
|
1059
|
|
1060 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
|
|
1061 for (ilen=0; ilen<stats->nindels; ilen++)
|
|
1062 {
|
|
1063 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
|
|
1064 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
|
|
1065 }
|
|
1066
|
|
1067 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
|
|
1068 for (ilen=0; ilen<=stats->nbases; ilen++)
|
|
1069 {
|
|
1070 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
|
|
1071 // the index of the cycle of the first inserted base (also 1-based)
|
|
1072 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
|
|
1073 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
|
|
1074 }
|
|
1075
|
|
1076 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
|
|
1077 if ( stats->cov[0] )
|
|
1078 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
|
|
1079 int icov;
|
|
1080 for (icov=1; icov<stats->ncov-1; icov++)
|
|
1081 if ( stats->cov[icov] )
|
|
1082 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
|
|
1083 if ( stats->cov[stats->ncov-1] )
|
|
1084 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
|
|
1085
|
|
1086 // Calculate average GC content, then sort by GC and depth
|
|
1087 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
|
|
1088 uint32_t igcd;
|
|
1089 for (igcd=0; igcd<stats->igcd; igcd++)
|
|
1090 {
|
|
1091 if ( stats->fai )
|
|
1092 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
|
|
1093 else
|
|
1094 if ( stats->gcd[igcd].depth )
|
|
1095 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
|
|
1096 }
|
|
1097 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
|
|
1098 igcd = 0;
|
|
1099 while ( igcd < stats->igcd )
|
|
1100 {
|
|
1101 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
|
|
1102 uint32_t nbins=0, itmp=igcd;
|
|
1103 float gc = stats->gcd[igcd].gc;
|
|
1104 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
|
|
1105 {
|
|
1106 nbins++;
|
|
1107 itmp++;
|
|
1108 }
|
|
1109 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
|
|
1110 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
|
|
1111 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
|
|
1112 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
|
|
1113 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
|
|
1114 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
|
|
1115 );
|
|
1116 igcd += nbins;
|
|
1117 }
|
|
1118 }
|
|
1119
|
|
1120 size_t mygetline(char **line, size_t *n, FILE *fp)
|
|
1121 {
|
|
1122 if (line == NULL || n == NULL || fp == NULL)
|
|
1123 {
|
|
1124 errno = EINVAL;
|
|
1125 return -1;
|
|
1126 }
|
|
1127 if (*n==0 || !*line)
|
|
1128 {
|
|
1129 *line = NULL;
|
|
1130 *n = 0;
|
|
1131 }
|
|
1132
|
|
1133 size_t nread=0;
|
|
1134 int c;
|
|
1135 while ((c=getc(fp))!= EOF && c!='\n')
|
|
1136 {
|
|
1137 if ( ++nread>=*n )
|
|
1138 {
|
|
1139 *n += 255;
|
|
1140 *line = realloc(*line, sizeof(char)*(*n));
|
|
1141 }
|
|
1142 (*line)[nread-1] = c;
|
|
1143 }
|
|
1144 if ( nread>=*n )
|
|
1145 {
|
|
1146 *n += 255;
|
|
1147 *line = realloc(*line, sizeof(char)*(*n));
|
|
1148 }
|
|
1149 (*line)[nread] = 0;
|
|
1150 return nread>0 ? nread : -1;
|
|
1151
|
|
1152 }
|
|
1153
|
|
1154 void init_regions(stats_t *stats, char *file)
|
|
1155 {
|
|
1156 khiter_t iter;
|
|
1157 khash_t(kh_bam_tid) *header_hash;
|
|
1158
|
|
1159 bam_init_header_hash(stats->sam->header);
|
|
1160 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
|
|
1161
|
|
1162 FILE *fp = fopen(file,"r");
|
|
1163 if ( !fp ) error("%s: %s\n",file,strerror(errno));
|
|
1164
|
|
1165 char *line = NULL;
|
|
1166 size_t len = 0;
|
|
1167 ssize_t nread;
|
|
1168 int warned = 0;
|
|
1169 int prev_tid=-1, prev_pos=-1;
|
|
1170 while ((nread = mygetline(&line, &len, fp)) != -1)
|
|
1171 {
|
|
1172 if ( line[0] == '#' ) continue;
|
|
1173
|
|
1174 int i = 0;
|
|
1175 while ( i<nread && !isspace(line[i]) ) i++;
|
|
1176 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
|
|
1177 line[i] = 0;
|
|
1178
|
|
1179 iter = kh_get(kh_bam_tid, header_hash, line);
|
|
1180 int tid = kh_val(header_hash, iter);
|
|
1181 if ( iter == kh_end(header_hash) )
|
|
1182 {
|
|
1183 if ( !warned )
|
|
1184 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
|
|
1185 warned = 1;
|
|
1186 continue;
|
|
1187 }
|
|
1188
|
|
1189 if ( tid >= stats->nregions )
|
|
1190 {
|
|
1191 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
|
|
1192 int j;
|
|
1193 for (j=stats->nregions; j<stats->nregions+100; j++)
|
|
1194 {
|
|
1195 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
|
|
1196 stats->regions[j].pos = NULL;
|
|
1197 }
|
|
1198 stats->nregions += 100;
|
|
1199 }
|
|
1200 int npos = stats->regions[tid].npos;
|
|
1201 if ( npos >= stats->regions[tid].mpos )
|
|
1202 {
|
|
1203 stats->regions[tid].mpos += 1000;
|
|
1204 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
|
|
1205 }
|
|
1206
|
|
1207 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
|
|
1208 if ( prev_tid==-1 || prev_tid!=tid )
|
|
1209 {
|
|
1210 prev_tid = tid;
|
|
1211 prev_pos = stats->regions[tid].pos[npos].from;
|
|
1212 }
|
|
1213 if ( prev_pos>stats->regions[tid].pos[npos].from )
|
|
1214 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
|
|
1215 stats->regions[tid].npos++;
|
|
1216 }
|
|
1217 if (line) free(line);
|
|
1218 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
|
|
1219 fclose(fp);
|
|
1220 }
|
|
1221
|
|
1222 void destroy_regions(stats_t *stats)
|
|
1223 {
|
|
1224 int i;
|
|
1225 for (i=0; i<stats->nregions; i++)
|
|
1226 {
|
|
1227 if ( !stats->regions[i].mpos ) continue;
|
|
1228 free(stats->regions[i].pos);
|
|
1229 }
|
|
1230 if ( stats->regions ) free(stats->regions);
|
|
1231 }
|
|
1232
|
|
1233 static int fetch_read(const bam1_t *bam_line, void *data)
|
|
1234 {
|
|
1235 collect_stats((bam1_t*)bam_line,(stats_t*)data);
|
|
1236 return 1;
|
|
1237 }
|
|
1238
|
|
1239 void reset_regions(stats_t *stats)
|
|
1240 {
|
|
1241 int i;
|
|
1242 for (i=0; i<stats->nregions; i++)
|
|
1243 stats->regions[i].cpos = 0;
|
|
1244 }
|
|
1245
|
|
1246 int is_in_regions(bam1_t *bam_line, stats_t *stats)
|
|
1247 {
|
|
1248 if ( !stats->regions ) return 1;
|
|
1249
|
|
1250 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
|
|
1251 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
|
|
1252
|
|
1253 regions_t *reg = &stats->regions[bam_line->core.tid];
|
|
1254 if ( reg->cpos==reg->npos ) return 0; // done for this chr
|
|
1255
|
|
1256 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
|
|
1257 // even small overlap is enough to include the read in the stats.
|
|
1258 int i = reg->cpos;
|
|
1259 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
|
|
1260 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
|
|
1261 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
|
|
1262 reg->cpos = i;
|
|
1263 stats->reg_from = reg->pos[i].from;
|
|
1264 stats->reg_to = reg->pos[i].to;
|
|
1265
|
|
1266 return 1;
|
|
1267 }
|
|
1268
|
|
1269 void init_group_id(stats_t *stats, char *id)
|
|
1270 {
|
|
1271 if ( !stats->sam->header->dict )
|
|
1272 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
|
|
1273 void *iter = stats->sam->header->dict;
|
|
1274 const char *key, *val;
|
|
1275 int n = 0;
|
|
1276 stats->rg_hash = kh_init(kh_rg);
|
|
1277 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
|
|
1278 {
|
|
1279 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
|
|
1280 {
|
|
1281 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
|
|
1282 if ( k != kh_end(stats->rg_hash) )
|
|
1283 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
|
|
1284 int ret;
|
|
1285 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
|
|
1286 kh_value(stats->rg_hash, k) = val;
|
|
1287 n++;
|
|
1288 }
|
|
1289 }
|
|
1290 if ( !n )
|
|
1291 error("The sample or read group \"%s\" not present.\n", id);
|
|
1292 }
|
|
1293
|
|
1294
|
|
1295 void error(const char *format, ...)
|
|
1296 {
|
|
1297 if ( !format )
|
|
1298 {
|
|
1299 printf("Version: %s\n", BAMCHECK_VERSION);
|
|
1300 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
|
|
1301 printf("Usage: bamcheck [OPTIONS] file.bam\n");
|
|
1302 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
|
|
1303 printf("Options:\n");
|
|
1304 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
|
|
1305 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
|
|
1306 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
|
|
1307 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
|
|
1308 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
|
|
1309 printf(" -h, --help This help message\n");
|
|
1310 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
|
|
1311 printf(" -I, --id <string> Include only listed read group or sample name\n");
|
|
1312 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
|
|
1313 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
|
|
1314 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
|
|
1315 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
|
|
1316 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
|
|
1317 printf(" -s, --sam Input is SAM\n");
|
|
1318 printf("\n");
|
|
1319 }
|
|
1320 else
|
|
1321 {
|
|
1322 va_list ap;
|
|
1323 va_start(ap, format);
|
|
1324 vfprintf(stderr, format, ap);
|
|
1325 va_end(ap);
|
|
1326 }
|
|
1327 exit(-1);
|
|
1328 }
|
|
1329
|
|
1330 int main(int argc, char *argv[])
|
|
1331 {
|
|
1332 char *targets = NULL;
|
|
1333 char *bam_fname = NULL;
|
|
1334 char *group_id = NULL;
|
|
1335 samfile_t *sam = NULL;
|
|
1336 char in_mode[5];
|
|
1337
|
|
1338 stats_t *stats = calloc(1,sizeof(stats_t));
|
|
1339 stats->ngc = 200;
|
|
1340 stats->nquals = 256;
|
|
1341 stats->nbases = 300;
|
|
1342 stats->nisize = 8000;
|
|
1343 stats->max_len = 30;
|
|
1344 stats->max_qual = 40;
|
|
1345 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
|
|
1346 stats->gcd_bin_size = 20e3;
|
|
1347 stats->gcd_ref_size = 4.2e9;
|
|
1348 stats->rseq_pos = -1;
|
|
1349 stats->tid = stats->gcd_pos = -1;
|
|
1350 stats->igcd = 0;
|
|
1351 stats->is_sorted = 1;
|
|
1352 stats->cov_min = 1;
|
|
1353 stats->cov_max = 1000;
|
|
1354 stats->cov_step = 1;
|
|
1355 stats->argc = argc;
|
|
1356 stats->argv = argv;
|
|
1357 stats->filter_readlen = -1;
|
|
1358 stats->nindels = stats->nbases;
|
|
1359
|
|
1360 strcpy(in_mode, "rb");
|
|
1361
|
|
1362 static struct option loptions[] =
|
|
1363 {
|
|
1364 {"help",0,0,'h'},
|
|
1365 {"remove-dups",0,0,'d'},
|
|
1366 {"sam",0,0,'s'},
|
|
1367 {"ref-seq",1,0,'r'},
|
|
1368 {"coverage",1,0,'c'},
|
|
1369 {"read-length",1,0,'l'},
|
|
1370 {"insert-size",1,0,'i'},
|
|
1371 {"most-inserts",1,0,'m'},
|
|
1372 {"trim-quality",1,0,'q'},
|
|
1373 {"target-regions",0,0,'t'},
|
|
1374 {"required-flag",1,0,'f'},
|
|
1375 {"filtering-flag",0,0,'F'},
|
|
1376 {"id",1,0,'I'},
|
|
1377 {"GC-depth",1,0,1},
|
|
1378 {0,0,0,0}
|
|
1379 };
|
|
1380 int opt;
|
|
1381 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
|
|
1382 {
|
|
1383 switch (opt)
|
|
1384 {
|
|
1385 case 'f': stats->flag_require=strtol(optarg,0,0); break;
|
|
1386 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
|
|
1387 case 'd': stats->flag_filter|=BAM_FDUP; break;
|
|
1388 case 's': strcpy(in_mode, "r"); break;
|
|
1389 case 'r': stats->fai = fai_load(optarg);
|
|
1390 if (stats->fai==0)
|
|
1391 error("Could not load faidx: %s\n", optarg);
|
|
1392 break;
|
|
1393 case 1 : {
|
|
1394 float flen,fbin;
|
|
1395 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
|
|
1396 error("Unable to parse --GC-depth %s\n", optarg);
|
|
1397 stats->gcd_bin_size = fbin;
|
|
1398 stats->gcd_ref_size = flen;
|
|
1399 }
|
|
1400 break;
|
|
1401 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
|
|
1402 error("Unable to parse -c %s\n", optarg);
|
|
1403 break;
|
|
1404 case 'l': stats->filter_readlen = atoi(optarg); break;
|
|
1405 case 'i': stats->nisize = atoi(optarg); break;
|
|
1406 case 'm': stats->isize_main_bulk = atof(optarg); break;
|
|
1407 case 'q': stats->trim_qual = atoi(optarg); break;
|
|
1408 case 't': targets = optarg; break;
|
|
1409 case 'I': group_id = optarg; break;
|
|
1410 case '?':
|
|
1411 case 'h': error(NULL);
|
|
1412 default: error("Unknown argument: %s\n", optarg);
|
|
1413 }
|
|
1414 }
|
|
1415 if ( optind<argc )
|
|
1416 bam_fname = argv[optind++];
|
|
1417
|
|
1418 if ( !bam_fname )
|
|
1419 {
|
|
1420 if ( isatty(fileno((FILE *)stdin)) )
|
|
1421 error(NULL);
|
|
1422 bam_fname = "-";
|
|
1423 }
|
|
1424
|
|
1425 // Init structures
|
|
1426 // .. coverage bins and round buffer
|
|
1427 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
|
|
1428 {
|
|
1429 stats->cov_step = stats->cov_max - stats->cov_min;
|
|
1430 if ( stats->cov_step <= 0 )
|
|
1431 stats->cov_step = 1;
|
|
1432 }
|
|
1433 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
|
|
1434 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
|
|
1435 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
|
|
1436 stats->cov_rbuf.size = stats->nbases*5;
|
|
1437 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
|
|
1438 // .. bam
|
|
1439 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
|
|
1440 error("Failed to open: %s\n", bam_fname);
|
|
1441 stats->sam = sam;
|
|
1442 if ( group_id ) init_group_id(stats, group_id);
|
|
1443 bam1_t *bam_line = bam_init1();
|
|
1444 // .. arrays
|
|
1445 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
|
|
1446 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
|
|
1447 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
|
|
1448 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
|
|
1449 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
|
|
1450 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
|
|
1451 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
|
|
1452 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
|
|
1453 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
|
|
1454 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
|
|
1455 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
|
|
1456 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
|
|
1457 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
|
|
1458 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
|
|
1459 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
|
|
1460 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
|
|
1461 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
|
|
1462 realloc_rseq_buffer(stats);
|
|
1463 if ( targets )
|
|
1464 init_regions(stats, targets);
|
|
1465
|
|
1466 // Collect statistics
|
|
1467 if ( optind<argc )
|
|
1468 {
|
|
1469 // Collect stats in selected regions only
|
|
1470 bam_index_t *bam_idx = bam_index_load(bam_fname);
|
|
1471 if (bam_idx == 0)
|
|
1472 error("Random alignment retrieval only works for indexed BAM files.\n");
|
|
1473
|
|
1474 int i;
|
|
1475 for (i=optind; i<argc; i++)
|
|
1476 {
|
|
1477 int tid, beg, end;
|
|
1478 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
|
|
1479 if ( tid < 0 ) continue;
|
|
1480 reset_regions(stats);
|
|
1481 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
|
|
1482 }
|
|
1483 bam_index_destroy(bam_idx);
|
|
1484 }
|
|
1485 else
|
|
1486 {
|
|
1487 // Stream through the entire BAM ignoring off-target regions if -t is given
|
|
1488 while (samread(sam,bam_line) >= 0)
|
|
1489 collect_stats(bam_line,stats);
|
|
1490 }
|
|
1491 round_buffer_flush(stats,-1);
|
|
1492
|
|
1493 output_stats(stats);
|
|
1494
|
|
1495 bam_destroy1(bam_line);
|
|
1496 samclose(stats->sam);
|
|
1497 if (stats->fai) fai_destroy(stats->fai);
|
|
1498 free(stats->cov_rbuf.buffer); free(stats->cov);
|
|
1499 free(stats->quals_1st); free(stats->quals_2nd);
|
|
1500 free(stats->gc_1st); free(stats->gc_2nd);
|
|
1501 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
|
|
1502 free(stats->gcd);
|
|
1503 free(stats->rseq_buf);
|
|
1504 free(stats->mpc_buf);
|
|
1505 free(stats->acgt_cycles);
|
|
1506 free(stats->read_lengths);
|
|
1507 free(stats->insertions);
|
|
1508 free(stats->deletions);
|
|
1509 free(stats->ins_cycles_1st);
|
|
1510 free(stats->ins_cycles_2nd);
|
|
1511 free(stats->del_cycles_1st);
|
|
1512 free(stats->del_cycles_2nd);
|
|
1513 destroy_regions(stats);
|
|
1514 free(stats);
|
|
1515 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
|
|
1516
|
|
1517 return 0;
|
|
1518 }
|
|
1519
|
|
1520
|
|
1521
|