comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/bamcheck.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 /*
2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
4
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
15
16 */
17
18 #define BAMCHECK_VERSION "2012-09-04"
19
20 #define _ISOC99_SOURCE
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <stdarg.h>
24 #include <string.h>
25 #include <math.h>
26 #include <ctype.h>
27 #include <getopt.h>
28 #include <errno.h>
29 #include <assert.h>
30 #include "faidx.h"
31 #include "khash.h"
32 #include "sam.h"
33 #include "sam_header.h"
34 #include "razf.h"
35
36 #define BWA_MIN_RDLEN 35
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
44
45 typedef struct
46 {
47 int32_t line_len, line_blen;
48 int64_t len;
49 uint64_t offset;
50 }
51 faidx1_t;
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
53 KHASH_MAP_INIT_STR(kh_bam_tid, int)
54 KHASH_MAP_INIT_STR(kh_rg, const char *)
55 struct __faidx_t {
56 RAZF *rz;
57 int n, m;
58 char **name;
59 khash_t(kh_faidx) *hash;
60 };
61
62 typedef struct
63 {
64 float gc;
65 uint32_t depth;
66 }
67 gc_depth_t;
68
69 // For coverage distribution, a simple pileup
70 typedef struct
71 {
72 int64_t pos;
73 int size, start;
74 int *buffer;
75 }
76 round_buffer_t;
77
78 typedef struct { uint32_t from, to; } pos_t;
79 typedef struct
80 {
81 int npos,mpos,cpos;
82 pos_t *pos;
83 }
84 regions_t;
85
86 typedef struct
87 {
88 // Parameters
89 int trim_qual; // bwa trim quality
90
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
92 // insert size histogram holder
93 int nquals; // The number of quality bins
94 int nbases; // The maximum sequence length the allocated array can hold
95 int nisize; // The maximum insert size that the allocated array can hold
96 int ngc; // The size of gc_1st and gc_2nd
97 int nindels; // The maximum indel length for indel distribution
98
99 // Arrays for the histogram data
100 uint64_t *quals_1st, *quals_2nd;
101 uint64_t *gc_1st, *gc_2nd;
102 uint64_t *isize_inward, *isize_outward, *isize_other;
103 uint64_t *acgt_cycles;
104 uint64_t *read_lengths;
105 uint64_t *insertions, *deletions;
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
107
108 // The extremes encountered
109 int max_len; // Maximum read length
110 int max_qual; // Maximum quality
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
112 int is_sorted;
113
114 // Summary numbers
115 uint64_t total_len;
116 uint64_t total_len_dup;
117 uint64_t nreads_1st;
118 uint64_t nreads_2nd;
119 uint64_t nreads_filtered;
120 uint64_t nreads_dup;
121 uint64_t nreads_unmapped;
122 uint64_t nreads_unpaired;
123 uint64_t nreads_paired;
124 uint64_t nreads_anomalous;
125 uint64_t nreads_mq0;
126 uint64_t nbases_mapped;
127 uint64_t nbases_mapped_cigar;
128 uint64_t nbases_trimmed; // bwa trimmed bases
129 uint64_t nmismatches;
130 uint64_t nreads_QCfailed, nreads_secondary;
131
132 // GC-depth related data
133 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
134 gc_depth_t *gcd; // The GC-depth bins holder
135 int gcd_bin_size; // The size of GC-depth bin
136 uint32_t gcd_ref_size; // The approximate size of the genome
137 int32_t tid, gcd_pos; // Position of the current bin
138 int32_t pos; // Position of the last read
139
140 // Coverage distribution related data
141 int ncov; // The number of coverage bins
142 uint64_t *cov; // The coverage frequencies
143 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
144 round_buffer_t cov_rbuf; // Pileup round buffer
145
146 // Mismatches by read cycle
147 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
148 int mrseq_buf; // The size of the buffer
149 int32_t rseq_pos; // The coordinate of the first base in the buffer
150 int32_t nrseq_buf; // The used part of the buffer
151 uint64_t *mpc_buf; // Mismatches per cycle
152
153 // Filters
154 int filter_readlen;
155
156 // Target regions
157 int nregions, reg_from,reg_to;
158 regions_t *regions;
159
160 // Auxiliary data
161 int flag_require, flag_filter;
162 double sum_qual; // For calculating average quality value
163 samfile_t *sam;
164 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
165 faidx_t *fai; // Reference sequence for GC-depth graph
166 int argc; // Command line arguments to be printed on the output
167 char **argv;
168 }
169 stats_t;
170
171 void error(const char *format, ...);
172 void bam_init_header_hash(bam_header_t *header);
173 int is_in_regions(bam1_t *bam_line, stats_t *stats);
174
175
176 // Coverage distribution methods
177 inline int coverage_idx(int min, int max, int n, int step, int depth)
178 {
179 if ( depth < min )
180 return 0;
181
182 if ( depth > max )
183 return n-1;
184
185 return 1 + (depth - min) / step;
186 }
187
188 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
189 {
190 return (offset + (pos-refpos) % size) % size;
191 }
192
193 void round_buffer_flush(stats_t *stats, int64_t pos)
194 {
195 int ibuf,idp;
196
197 if ( pos==stats->cov_rbuf.pos )
198 return;
199
200 int64_t new_pos = pos;
201 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
202 {
203 // Flush the whole buffer, but in sequential order,
204 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
205 }
206
207 if ( pos < stats->cov_rbuf.pos )
208 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
209
210 int ifrom = stats->cov_rbuf.start;
211 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
212 if ( ifrom>ito )
213 {
214 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
215 {
216 if ( !stats->cov_rbuf.buffer[ibuf] )
217 continue;
218 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
219 stats->cov[idp]++;
220 stats->cov_rbuf.buffer[ibuf] = 0;
221 }
222 ifrom = 0;
223 }
224 for (ibuf=ifrom; ibuf<=ito; ibuf++)
225 {
226 if ( !stats->cov_rbuf.buffer[ibuf] )
227 continue;
228 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
229 stats->cov[idp]++;
230 stats->cov_rbuf.buffer[ibuf] = 0;
231 }
232 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
233 stats->cov_rbuf.pos = new_pos;
234 }
235
236 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
237 {
238 if ( to-from >= rbuf->size )
239 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
240 if ( from < rbuf->pos )
241 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
242
243 int ifrom,ito,ibuf;
244 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
245 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
246 if ( ifrom>ito )
247 {
248 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
249 rbuf->buffer[ibuf]++;
250 ifrom = 0;
251 }
252 for (ibuf=ifrom; ibuf<=ito; ibuf++)
253 rbuf->buffer[ibuf]++;
254 }
255
256 // Calculate the number of bases in the read trimmed by BWA
257 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
258 {
259 if ( len<BWA_MIN_RDLEN ) return 0;
260
261 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
262 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
263 int max_trimmed = len - BWA_MIN_RDLEN + 1;
264 int l, sum=0, max_sum=0, max_l=0;
265
266 for (l=0; l<max_trimmed; l++)
267 {
268 sum += trim_qual - quals[ reverse ? l : len-1-l ];
269 if ( sum<0 ) break;
270 if ( sum>max_sum )
271 {
272 max_sum = sum;
273 // This is the correct way, but bwa clips from some reason one base less
274 // max_l = l+1;
275 max_l = l;
276 }
277 }
278 return max_l;
279 }
280
281
282 void count_indels(stats_t *stats,bam1_t *bam_line)
283 {
284 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
285 int is_1st = IS_READ1(bam_line) ? 1 : 0;
286 int icig;
287 int icycle = 0;
288 int read_len = bam_line->core.l_qseq;
289 for (icig=0; icig<bam_line->core.n_cigar; icig++)
290 {
291 // Conversion from uint32_t to MIDNSHP
292 // 0123456
293 // MIDNSHP
294 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
295 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
296
297 if ( cig==1 )
298 {
299 int idx = is_fwd ? icycle : read_len-icycle-ncig;
300 if ( idx<0 )
301 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
302 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
303 if ( is_1st )
304 stats->ins_cycles_1st[idx]++;
305 else
306 stats->ins_cycles_2nd[idx]++;
307 icycle += ncig;
308 if ( ncig<=stats->nindels )
309 stats->insertions[ncig-1]++;
310 continue;
311 }
312 if ( cig==2 )
313 {
314 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
315 if ( idx<0 ) continue; // discard meaningless deletions
316 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
317 if ( is_1st )
318 stats->del_cycles_1st[idx]++;
319 else
320 stats->del_cycles_2nd[idx]++;
321 if ( ncig<=stats->nindels )
322 stats->deletions[ncig-1]++;
323 continue;
324 }
325 if ( cig!=3 && cig!=5 )
326 icycle += ncig;
327 }
328 }
329
330 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
331 {
332 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
333 int icig,iread=0,icycle=0;
334 int iref = bam_line->core.pos - stats->rseq_pos;
335 int read_len = bam_line->core.l_qseq;
336 uint8_t *read = bam1_seq(bam_line);
337 uint8_t *quals = bam1_qual(bam_line);
338 uint64_t *mpc_buf = stats->mpc_buf;
339 for (icig=0; icig<bam_line->core.n_cigar; icig++)
340 {
341 // Conversion from uint32_t to MIDNSHP
342 // 0123456
343 // MIDNSHP
344 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
345 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
346 if ( cig==1 )
347 {
348 iread += ncig;
349 icycle += ncig;
350 continue;
351 }
352 if ( cig==2 )
353 {
354 iref += ncig;
355 continue;
356 }
357 if ( cig==4 )
358 {
359 icycle += ncig;
360 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
361 // iref += ncig;
362 iread += ncig;
363 continue;
364 }
365 if ( cig==5 )
366 {
367 icycle += ncig;
368 continue;
369 }
370 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
371 // chunk of refseq in memory. Not very frequent and not noticable in the stats.
372 if ( cig==3 || cig==5 ) continue;
373 if ( cig!=0 )
374 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
375
376 if ( ncig+iref > stats->nrseq_buf )
377 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
378
379 int im;
380 for (im=0; im<ncig; im++)
381 {
382 uint8_t cread = bam1_seqi(read,iread);
383 uint8_t cref = stats->rseq_buf[iref];
384
385 // ---------------15
386 // =ACMGRSVTWYHKDBN
387 if ( cread==15 )
388 {
389 int idx = is_fwd ? icycle : read_len-icycle-1;
390 if ( idx>stats->max_len )
391 error("mpc: %d>%d\n",idx,stats->max_len);
392 idx = idx*stats->nquals;
393 if ( idx>=stats->nquals*stats->nbases )
394 error("FIXME: mpc_buf overflow\n");
395 mpc_buf[idx]++;
396 }
397 else if ( cref && cread && cref!=cread )
398 {
399 uint8_t qual = quals[iread] + 1;
400 if ( qual>=stats->nquals )
401 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
402
403 int idx = is_fwd ? icycle : read_len-icycle-1;
404 if ( idx>stats->max_len )
405 error("mpc: %d>%d\n",idx,stats->max_len);
406
407 idx = idx*stats->nquals + qual;
408 if ( idx>=stats->nquals*stats->nbases )
409 error("FIXME: mpc_buf overflow\n");
410 mpc_buf[idx]++;
411 }
412
413 iref++;
414 iread++;
415 icycle++;
416 }
417 }
418 }
419
420 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
421 {
422 khash_t(kh_faidx) *h;
423 khiter_t iter;
424 faidx1_t val;
425 char *chr, c;
426 faidx_t *fai = stats->fai;
427
428 h = fai->hash;
429 chr = stats->sam->header->target_name[tid];
430
431 // ID of the sequence name
432 iter = kh_get(kh_faidx, h, chr);
433 if (iter == kh_end(h))
434 error("No such reference sequence [%s]?\n", chr);
435 val = kh_value(h, iter);
436
437 // Check the boundaries
438 if (pos >= val.len)
439 error("Was the bam file mapped with the reference sequence supplied?"
440 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
441 int size = stats->mrseq_buf;
442 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
443 if (size+pos > val.len) size = val.len-pos;
444
445 // Position the razf reader
446 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
447
448 uint8_t *ptr = stats->rseq_buf;
449 int nread = 0;
450 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
451 {
452 if ( !isgraph(c) )
453 continue;
454
455 // Conversion between uint8_t coding and ACGT
456 // -12-4---8-------
457 // =ACMGRSVTWYHKDBN
458 if ( c=='A' || c=='a' )
459 *ptr = 1;
460 else if ( c=='C' || c=='c' )
461 *ptr = 2;
462 else if ( c=='G' || c=='g' )
463 *ptr = 4;
464 else if ( c=='T' || c=='t' )
465 *ptr = 8;
466 else
467 *ptr = 0;
468 ptr++;
469 nread++;
470 }
471 if ( nread < stats->mrseq_buf )
472 {
473 memset(ptr,0, stats->mrseq_buf - nread);
474 nread = stats->mrseq_buf;
475 }
476 stats->nrseq_buf = nread;
477 stats->rseq_pos = pos;
478 stats->tid = tid;
479 }
480
481 float fai_gc_content(stats_t *stats, int pos, int len)
482 {
483 uint32_t gc,count,c;
484 int i = pos - stats->rseq_pos, ito = i + len;
485 assert( i>=0 && ito<=stats->nrseq_buf );
486
487 // Count GC content
488 gc = count = 0;
489 for (; i<ito; i++)
490 {
491 c = stats->rseq_buf[i];
492 if ( c==2 || c==4 )
493 {
494 gc++;
495 count++;
496 }
497 else if ( c==1 || c==8 )
498 count++;
499 }
500 return count ? (float)gc/count : 0;
501 }
502
503 void realloc_rseq_buffer(stats_t *stats)
504 {
505 int n = stats->nbases*10;
506 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
507 if ( stats->mrseq_buf<n )
508 {
509 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
510 stats->mrseq_buf = n;
511 }
512 }
513
514 void realloc_gcd_buffer(stats_t *stats, int seq_len)
515 {
516 if ( seq_len >= stats->gcd_bin_size )
517 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
518
519 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
520 if ( n <= stats->igcd )
521 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
522
523 if ( n > stats->ngcd )
524 {
525 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
526 if ( !stats->gcd )
527 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
528 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
529 stats->ngcd = n;
530 }
531
532 realloc_rseq_buffer(stats);
533 }
534
535 void realloc_buffers(stats_t *stats, int seq_len)
536 {
537 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
538
539 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
540 if ( !stats->quals_1st )
541 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
542 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
543
544 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
545 if ( !stats->quals_2nd )
546 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
547 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
548
549 if ( stats->mpc_buf )
550 {
551 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
552 if ( !stats->mpc_buf )
553 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
554 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
555 }
556
557 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
558 if ( !stats->acgt_cycles )
559 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
560 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
561
562 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
563 if ( !stats->read_lengths )
564 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
565 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
566
567 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
568 if ( !stats->insertions )
569 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
570 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
571
572 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
573 if ( !stats->deletions )
574 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
575 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
576
577 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
578 if ( !stats->ins_cycles_1st )
579 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
580 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
581
582 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
583 if ( !stats->ins_cycles_2nd )
584 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
585 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
586
587 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
588 if ( !stats->del_cycles_1st )
589 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
590 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
591
592 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
593 if ( !stats->del_cycles_2nd )
594 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
595 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
596
597 stats->nbases = n;
598
599 // Realloc the coverage distribution buffer
600 int *rbuffer = calloc(sizeof(int),seq_len*5);
601 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
602 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
603 if ( stats->cov_rbuf.start>1 )
604 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
605 stats->cov_rbuf.start = 0;
606 free(stats->cov_rbuf.buffer);
607 stats->cov_rbuf.buffer = rbuffer;
608 stats->cov_rbuf.size = seq_len*5;
609
610 realloc_rseq_buffer(stats);
611 }
612
613 void collect_stats(bam1_t *bam_line, stats_t *stats)
614 {
615 if ( stats->rg_hash )
616 {
617 const uint8_t *rg = bam_aux_get(bam_line, "RG");
618 if ( !rg ) return;
619 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
620 if ( k == kh_end(stats->rg_hash) ) return;
621 }
622 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
623 {
624 stats->nreads_filtered++;
625 return;
626 }
627 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
628 {
629 stats->nreads_filtered++;
630 return;
631 }
632 if ( !is_in_regions(bam_line,stats) )
633 return;
634 if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
635 return;
636
637 if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
638 if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
639
640 int seq_len = bam_line->core.l_qseq;
641 if ( !seq_len ) return;
642
643 if ( seq_len >= stats->nbases )
644 realloc_buffers(stats,seq_len);
645 if ( stats->max_len<seq_len )
646 stats->max_len = seq_len;
647
648 stats->read_lengths[seq_len]++;
649
650 // Count GC and ACGT per cycle
651 uint8_t base, *seq = bam1_seq(bam_line);
652 int gc_count = 0;
653 int i;
654 int reverse = IS_REVERSE(bam_line);
655 for (i=0; i<seq_len; i++)
656 {
657 // Conversion from uint8_t coding to ACGT
658 // -12-4---8-------
659 // =ACMGRSVTWYHKDBN
660 // 01 2 3
661 base = bam1_seqi(seq,i);
662 base /= 2;
663 if ( base==1 || base==2 ) gc_count++;
664 else if ( base>2 ) base=3;
665 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
666 error("FIXME: acgt_cycles\n");
667 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
668 }
669 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
670 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
671 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
672
673 // Determine which array (1st or 2nd read) will these stats go to,
674 // trim low quality bases from end the same way BWA does,
675 // fill GC histogram
676 uint64_t *quals;
677 uint8_t *bam_quals = bam1_qual(bam_line);
678 if ( bam_line->core.flag&BAM_FREAD2 )
679 {
680 quals = stats->quals_2nd;
681 stats->nreads_2nd++;
682 for (i=gc_idx_min; i<gc_idx_max; i++)
683 stats->gc_2nd[i]++;
684 }
685 else
686 {
687 quals = stats->quals_1st;
688 stats->nreads_1st++;
689 for (i=gc_idx_min; i<gc_idx_max; i++)
690 stats->gc_1st[i]++;
691 }
692 if ( stats->trim_qual>0 )
693 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
694
695 // Quality histogram and average quality
696 for (i=0; i<seq_len; i++)
697 {
698 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
699 if ( qual>=stats->nquals )
700 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
701 if ( qual>stats->max_qual )
702 stats->max_qual = qual;
703
704 quals[ i*stats->nquals+qual ]++;
705 stats->sum_qual += qual;
706 }
707
708 // Look at the flags and increment appropriate counters (mapped, paired, etc)
709 if ( IS_UNMAPPED(bam_line) )
710 stats->nreads_unmapped++;
711 else
712 {
713 if ( !bam_line->core.qual )
714 stats->nreads_mq0++;
715
716 count_indels(stats,bam_line);
717
718 if ( !IS_PAIRED(bam_line) )
719 stats->nreads_unpaired++;
720 else
721 {
722 stats->nreads_paired++;
723
724 if ( bam_line->core.tid!=bam_line->core.mtid )
725 stats->nreads_anomalous++;
726
727 // The insert size is tricky, because for long inserts the libraries are
728 // prepared differently and the pairs point in other direction. BWA does
729 // not set the paired flag for them. Similar thing is true also for 454
730 // reads. Mates mapped to different chromosomes have isize==0.
731 int32_t isize = bam_line->core.isize;
732 if ( isize<0 ) isize = -isize;
733 if ( isize >= stats->nisize )
734 isize = stats->nisize-1;
735 if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
736 {
737 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
738 int is_fst = IS_READ1(bam_line) ? 1 : -1;
739 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
740 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
741
742 if ( is_fwd*is_mfwd>0 )
743 stats->isize_other[isize]++;
744 else if ( is_fst*pos_fst>0 )
745 {
746 if ( is_fst*is_fwd>0 )
747 stats->isize_inward[isize]++;
748 else
749 stats->isize_outward[isize]++;
750 }
751 else if ( is_fst*pos_fst<0 )
752 {
753 if ( is_fst*is_fwd>0 )
754 stats->isize_outward[isize]++;
755 else
756 stats->isize_inward[isize]++;
757 }
758 }
759 }
760
761 // Number of mismatches
762 uint8_t *nm = bam_aux_get(bam_line,"NM");
763 if (nm)
764 stats->nmismatches += bam_aux2i(nm);
765
766 // Number of mapped bases from cigar
767 // Conversion from uint32_t to MIDNSHP
768 // 012-4--
769 // MIDNSHP
770 if ( bam_line->core.n_cigar == 0)
771 error("FIXME: mapped read with no cigar?\n");
772 int readlen=seq_len;
773 if ( stats->regions )
774 {
775 // Count only on-target bases
776 int iref = bam_line->core.pos + 1;
777 for (i=0; i<bam_line->core.n_cigar; i++)
778 {
779 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
780 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
781 if ( cig==2 ) readlen += ncig;
782 else if ( cig==0 )
783 {
784 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
785 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
786 if ( ncig<0 ) ncig = 0;
787 stats->nbases_mapped_cigar += ncig;
788 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
789 }
790 else if ( cig==1 )
791 {
792 iref += ncig;
793 if ( iref>=stats->reg_from && iref<=stats->reg_to )
794 stats->nbases_mapped_cigar += ncig;
795 }
796 }
797 }
798 else
799 {
800 // Count the whole read
801 for (i=0; i<bam_line->core.n_cigar; i++)
802 {
803 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
804 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
805 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
806 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
807 }
808 }
809 stats->nbases_mapped += seq_len;
810
811 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
812 stats->is_sorted = 0;
813 stats->pos = bam_line->core.pos;
814
815 if ( stats->is_sorted )
816 {
817 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
818 round_buffer_flush(stats,-1);
819
820 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
821 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
822 // 20kbp, so the effect is negligible.
823 if ( stats->fai )
824 {
825 int inc_ref = 0, inc_gcd = 0;
826 // First pass or new chromosome
827 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
828 // Read goes beyond the end of the rseq buffer
829 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
830 // Read overlaps the next gcd bin
831 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
832 {
833 inc_gcd = 1;
834 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
835 }
836 if ( inc_gcd )
837 {
838 stats->igcd++;
839 if ( stats->igcd >= stats->ngcd )
840 realloc_gcd_buffer(stats, readlen);
841 if ( inc_ref )
842 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
843 stats->gcd_pos = bam_line->core.pos;
844 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
845 }
846
847 count_mismatches_per_cycle(stats,bam_line);
848 }
849 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
850 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
851 {
852 // First pass or a new chromosome
853 stats->tid = bam_line->core.tid;
854 stats->gcd_pos = bam_line->core.pos;
855 stats->igcd++;
856 if ( stats->igcd >= stats->ngcd )
857 realloc_gcd_buffer(stats, readlen);
858 }
859 stats->gcd[ stats->igcd ].depth++;
860 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
861 if ( !stats->fai )
862 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
863
864 // Coverage distribution graph
865 round_buffer_flush(stats,bam_line->core.pos);
866 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
867 }
868 }
869
870 stats->total_len += seq_len;
871 if ( IS_DUP(bam_line) )
872 {
873 stats->total_len_dup += seq_len;
874 stats->nreads_dup++;
875 }
876 }
877
878 // Sort by GC and depth
879 #define GCD_t(x) ((gc_depth_t *)x)
880 static int gcd_cmp(const void *a, const void *b)
881 {
882 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
883 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
884 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
885 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
886 return 0;
887 }
888 #undef GCD_t
889
890 float gcd_percentile(gc_depth_t *gcd, int N, int p)
891 {
892 float n,d;
893 int k;
894
895 n = p*(N+1)/100;
896 k = n;
897 if ( k<=0 )
898 return gcd[0].depth;
899 if ( k>=N )
900 return gcd[N-1].depth;
901
902 d = n - k;
903 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
904 }
905
906 void output_stats(stats_t *stats)
907 {
908 // Calculate average insert size and standard deviation (from the main bulk data only)
909 int isize, ibulk=0;
910 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
911 for (isize=0; isize<stats->nisize; isize++)
912 {
913 // Each pair was counted twice
914 stats->isize_inward[isize] *= 0.5;
915 stats->isize_outward[isize] *= 0.5;
916 stats->isize_other[isize] *= 0.5;
917
918 nisize_inward += stats->isize_inward[isize];
919 nisize_outward += stats->isize_outward[isize];
920 nisize_other += stats->isize_other[isize];
921 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
922 }
923
924 double bulk=0, avg_isize=0, sd_isize=0;
925 for (isize=0; isize<stats->nisize; isize++)
926 {
927 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
928 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
929
930 if ( bulk/nisize > stats->isize_main_bulk )
931 {
932 ibulk = isize+1;
933 nisize = bulk;
934 break;
935 }
936 }
937 avg_isize /= nisize ? nisize : 1;
938 for (isize=1; isize<ibulk; isize++)
939 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
940 sd_isize = sqrt(sd_isize);
941
942
943 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
944 printf("# The command line was: %s",stats->argv[0]);
945 int i;
946 for (i=1; i<stats->argc; i++)
947 printf(" %s",stats->argv[i]);
948 printf("\n");
949 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
950 printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
951 printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
952 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
953 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
954 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
955 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
956 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
957 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
958 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
959 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
960 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
961 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
962 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
963 printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
964 printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
965 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
966 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
967 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
968 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
969 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
970 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
971 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
972 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
973 printf("SN\taverage length:\t%.0f\n", avg_read_length);
974 printf("SN\tmaximum length:\t%d\n", stats->max_len);
975 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
976 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
977 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
978 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
979 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
980 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
981 printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
982
983 int ibase,iqual;
984 if ( stats->max_len<stats->nbases ) stats->max_len++;
985 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
986 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
987 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
988 for (ibase=0; ibase<stats->max_len; ibase++)
989 {
990 printf("FFQ\t%d",ibase+1);
991 for (iqual=0; iqual<=stats->max_qual; iqual++)
992 {
993 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
994 }
995 printf("\n");
996 }
997 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
998 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
999 for (ibase=0; ibase<stats->max_len; ibase++)
1000 {
1001 printf("LFQ\t%d",ibase+1);
1002 for (iqual=0; iqual<=stats->max_qual; iqual++)
1003 {
1004 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
1005 }
1006 printf("\n");
1007 }
1008 if ( stats->mpc_buf )
1009 {
1010 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
1011 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
1012 printf("# is the number of N's and the rest is the number of mismatches\n");
1013 for (ibase=0; ibase<stats->max_len; ibase++)
1014 {
1015 printf("MPC\t%d",ibase+1);
1016 for (iqual=0; iqual<=stats->max_qual; iqual++)
1017 {
1018 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
1019 }
1020 printf("\n");
1021 }
1022 }
1023 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
1024 int ibase_prev = 0;
1025 for (ibase=0; ibase<stats->ngc; ibase++)
1026 {
1027 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
1028 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
1029 ibase_prev = ibase;
1030 }
1031 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
1032 ibase_prev = 0;
1033 for (ibase=0; ibase<stats->ngc; ibase++)
1034 {
1035 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
1036 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
1037 ibase_prev = ibase;
1038 }
1039 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
1040 for (ibase=0; ibase<stats->max_len; ibase++)
1041 {
1042 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
1043 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
1044 if ( ! sum ) continue;
1045 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
1046 }
1047 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
1048 for (isize=0; isize<ibulk; isize++)
1049 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
1050 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
1051
1052 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
1053 int ilen;
1054 for (ilen=0; ilen<stats->max_len; ilen++)
1055 {
1056 if ( stats->read_lengths[ilen]>0 )
1057 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
1058 }
1059
1060 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
1061 for (ilen=0; ilen<stats->nindels; ilen++)
1062 {
1063 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
1064 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1065 }
1066
1067 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1068 for (ilen=0; ilen<=stats->nbases; ilen++)
1069 {
1070 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
1071 // the index of the cycle of the first inserted base (also 1-based)
1072 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1073 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1074 }
1075
1076 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1077 if ( stats->cov[0] )
1078 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1079 int icov;
1080 for (icov=1; icov<stats->ncov-1; icov++)
1081 if ( stats->cov[icov] )
1082 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1083 if ( stats->cov[stats->ncov-1] )
1084 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1085
1086 // Calculate average GC content, then sort by GC and depth
1087 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1088 uint32_t igcd;
1089 for (igcd=0; igcd<stats->igcd; igcd++)
1090 {
1091 if ( stats->fai )
1092 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1093 else
1094 if ( stats->gcd[igcd].depth )
1095 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1096 }
1097 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1098 igcd = 0;
1099 while ( igcd < stats->igcd )
1100 {
1101 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1102 uint32_t nbins=0, itmp=igcd;
1103 float gc = stats->gcd[igcd].gc;
1104 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1105 {
1106 nbins++;
1107 itmp++;
1108 }
1109 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1110 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1111 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1112 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1113 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1114 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1115 );
1116 igcd += nbins;
1117 }
1118 }
1119
1120 size_t mygetline(char **line, size_t *n, FILE *fp)
1121 {
1122 if (line == NULL || n == NULL || fp == NULL)
1123 {
1124 errno = EINVAL;
1125 return -1;
1126 }
1127 if (*n==0 || !*line)
1128 {
1129 *line = NULL;
1130 *n = 0;
1131 }
1132
1133 size_t nread=0;
1134 int c;
1135 while ((c=getc(fp))!= EOF && c!='\n')
1136 {
1137 if ( ++nread>=*n )
1138 {
1139 *n += 255;
1140 *line = realloc(*line, sizeof(char)*(*n));
1141 }
1142 (*line)[nread-1] = c;
1143 }
1144 if ( nread>=*n )
1145 {
1146 *n += 255;
1147 *line = realloc(*line, sizeof(char)*(*n));
1148 }
1149 (*line)[nread] = 0;
1150 return nread>0 ? nread : -1;
1151
1152 }
1153
1154 void init_regions(stats_t *stats, char *file)
1155 {
1156 khiter_t iter;
1157 khash_t(kh_bam_tid) *header_hash;
1158
1159 bam_init_header_hash(stats->sam->header);
1160 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1161
1162 FILE *fp = fopen(file,"r");
1163 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1164
1165 char *line = NULL;
1166 size_t len = 0;
1167 ssize_t nread;
1168 int warned = 0;
1169 int prev_tid=-1, prev_pos=-1;
1170 while ((nread = mygetline(&line, &len, fp)) != -1)
1171 {
1172 if ( line[0] == '#' ) continue;
1173
1174 int i = 0;
1175 while ( i<nread && !isspace(line[i]) ) i++;
1176 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1177 line[i] = 0;
1178
1179 iter = kh_get(kh_bam_tid, header_hash, line);
1180 int tid = kh_val(header_hash, iter);
1181 if ( iter == kh_end(header_hash) )
1182 {
1183 if ( !warned )
1184 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1185 warned = 1;
1186 continue;
1187 }
1188
1189 if ( tid >= stats->nregions )
1190 {
1191 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1192 int j;
1193 for (j=stats->nregions; j<stats->nregions+100; j++)
1194 {
1195 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1196 stats->regions[j].pos = NULL;
1197 }
1198 stats->nregions += 100;
1199 }
1200 int npos = stats->regions[tid].npos;
1201 if ( npos >= stats->regions[tid].mpos )
1202 {
1203 stats->regions[tid].mpos += 1000;
1204 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1205 }
1206
1207 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1208 if ( prev_tid==-1 || prev_tid!=tid )
1209 {
1210 prev_tid = tid;
1211 prev_pos = stats->regions[tid].pos[npos].from;
1212 }
1213 if ( prev_pos>stats->regions[tid].pos[npos].from )
1214 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1215 stats->regions[tid].npos++;
1216 }
1217 if (line) free(line);
1218 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1219 fclose(fp);
1220 }
1221
1222 void destroy_regions(stats_t *stats)
1223 {
1224 int i;
1225 for (i=0; i<stats->nregions; i++)
1226 {
1227 if ( !stats->regions[i].mpos ) continue;
1228 free(stats->regions[i].pos);
1229 }
1230 if ( stats->regions ) free(stats->regions);
1231 }
1232
1233 static int fetch_read(const bam1_t *bam_line, void *data)
1234 {
1235 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1236 return 1;
1237 }
1238
1239 void reset_regions(stats_t *stats)
1240 {
1241 int i;
1242 for (i=0; i<stats->nregions; i++)
1243 stats->regions[i].cpos = 0;
1244 }
1245
1246 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1247 {
1248 if ( !stats->regions ) return 1;
1249
1250 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1251 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1252
1253 regions_t *reg = &stats->regions[bam_line->core.tid];
1254 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1255
1256 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1257 // even small overlap is enough to include the read in the stats.
1258 int i = reg->cpos;
1259 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1260 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1261 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1262 reg->cpos = i;
1263 stats->reg_from = reg->pos[i].from;
1264 stats->reg_to = reg->pos[i].to;
1265
1266 return 1;
1267 }
1268
1269 void init_group_id(stats_t *stats, char *id)
1270 {
1271 if ( !stats->sam->header->dict )
1272 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1273 void *iter = stats->sam->header->dict;
1274 const char *key, *val;
1275 int n = 0;
1276 stats->rg_hash = kh_init(kh_rg);
1277 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1278 {
1279 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1280 {
1281 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1282 if ( k != kh_end(stats->rg_hash) )
1283 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1284 int ret;
1285 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1286 kh_value(stats->rg_hash, k) = val;
1287 n++;
1288 }
1289 }
1290 if ( !n )
1291 error("The sample or read group \"%s\" not present.\n", id);
1292 }
1293
1294
1295 void error(const char *format, ...)
1296 {
1297 if ( !format )
1298 {
1299 printf("Version: %s\n", BAMCHECK_VERSION);
1300 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1301 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1302 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1303 printf("Options:\n");
1304 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1305 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1306 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1307 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1308 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
1309 printf(" -h, --help This help message\n");
1310 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1311 printf(" -I, --id <string> Include only listed read group or sample name\n");
1312 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1313 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1314 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1315 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1316 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1317 printf(" -s, --sam Input is SAM\n");
1318 printf("\n");
1319 }
1320 else
1321 {
1322 va_list ap;
1323 va_start(ap, format);
1324 vfprintf(stderr, format, ap);
1325 va_end(ap);
1326 }
1327 exit(-1);
1328 }
1329
1330 int main(int argc, char *argv[])
1331 {
1332 char *targets = NULL;
1333 char *bam_fname = NULL;
1334 char *group_id = NULL;
1335 samfile_t *sam = NULL;
1336 char in_mode[5];
1337
1338 stats_t *stats = calloc(1,sizeof(stats_t));
1339 stats->ngc = 200;
1340 stats->nquals = 256;
1341 stats->nbases = 300;
1342 stats->nisize = 8000;
1343 stats->max_len = 30;
1344 stats->max_qual = 40;
1345 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1346 stats->gcd_bin_size = 20e3;
1347 stats->gcd_ref_size = 4.2e9;
1348 stats->rseq_pos = -1;
1349 stats->tid = stats->gcd_pos = -1;
1350 stats->igcd = 0;
1351 stats->is_sorted = 1;
1352 stats->cov_min = 1;
1353 stats->cov_max = 1000;
1354 stats->cov_step = 1;
1355 stats->argc = argc;
1356 stats->argv = argv;
1357 stats->filter_readlen = -1;
1358 stats->nindels = stats->nbases;
1359
1360 strcpy(in_mode, "rb");
1361
1362 static struct option loptions[] =
1363 {
1364 {"help",0,0,'h'},
1365 {"remove-dups",0,0,'d'},
1366 {"sam",0,0,'s'},
1367 {"ref-seq",1,0,'r'},
1368 {"coverage",1,0,'c'},
1369 {"read-length",1,0,'l'},
1370 {"insert-size",1,0,'i'},
1371 {"most-inserts",1,0,'m'},
1372 {"trim-quality",1,0,'q'},
1373 {"target-regions",0,0,'t'},
1374 {"required-flag",1,0,'f'},
1375 {"filtering-flag",0,0,'F'},
1376 {"id",1,0,'I'},
1377 {"GC-depth",1,0,1},
1378 {0,0,0,0}
1379 };
1380 int opt;
1381 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
1382 {
1383 switch (opt)
1384 {
1385 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1386 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1387 case 'd': stats->flag_filter|=BAM_FDUP; break;
1388 case 's': strcpy(in_mode, "r"); break;
1389 case 'r': stats->fai = fai_load(optarg);
1390 if (stats->fai==0)
1391 error("Could not load faidx: %s\n", optarg);
1392 break;
1393 case 1 : {
1394 float flen,fbin;
1395 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
1396 error("Unable to parse --GC-depth %s\n", optarg);
1397 stats->gcd_bin_size = fbin;
1398 stats->gcd_ref_size = flen;
1399 }
1400 break;
1401 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1402 error("Unable to parse -c %s\n", optarg);
1403 break;
1404 case 'l': stats->filter_readlen = atoi(optarg); break;
1405 case 'i': stats->nisize = atoi(optarg); break;
1406 case 'm': stats->isize_main_bulk = atof(optarg); break;
1407 case 'q': stats->trim_qual = atoi(optarg); break;
1408 case 't': targets = optarg; break;
1409 case 'I': group_id = optarg; break;
1410 case '?':
1411 case 'h': error(NULL);
1412 default: error("Unknown argument: %s\n", optarg);
1413 }
1414 }
1415 if ( optind<argc )
1416 bam_fname = argv[optind++];
1417
1418 if ( !bam_fname )
1419 {
1420 if ( isatty(fileno((FILE *)stdin)) )
1421 error(NULL);
1422 bam_fname = "-";
1423 }
1424
1425 // Init structures
1426 // .. coverage bins and round buffer
1427 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1428 {
1429 stats->cov_step = stats->cov_max - stats->cov_min;
1430 if ( stats->cov_step <= 0 )
1431 stats->cov_step = 1;
1432 }
1433 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1434 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1435 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1436 stats->cov_rbuf.size = stats->nbases*5;
1437 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1438 // .. bam
1439 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1440 error("Failed to open: %s\n", bam_fname);
1441 stats->sam = sam;
1442 if ( group_id ) init_group_id(stats, group_id);
1443 bam1_t *bam_line = bam_init1();
1444 // .. arrays
1445 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1446 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1447 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1448 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1449 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1450 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1451 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1452 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1453 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1454 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1455 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1456 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1457 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1458 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1459 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1460 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1461 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1462 realloc_rseq_buffer(stats);
1463 if ( targets )
1464 init_regions(stats, targets);
1465
1466 // Collect statistics
1467 if ( optind<argc )
1468 {
1469 // Collect stats in selected regions only
1470 bam_index_t *bam_idx = bam_index_load(bam_fname);
1471 if (bam_idx == 0)
1472 error("Random alignment retrieval only works for indexed BAM files.\n");
1473
1474 int i;
1475 for (i=optind; i<argc; i++)
1476 {
1477 int tid, beg, end;
1478 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1479 if ( tid < 0 ) continue;
1480 reset_regions(stats);
1481 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1482 }
1483 bam_index_destroy(bam_idx);
1484 }
1485 else
1486 {
1487 // Stream through the entire BAM ignoring off-target regions if -t is given
1488 while (samread(sam,bam_line) >= 0)
1489 collect_stats(bam_line,stats);
1490 }
1491 round_buffer_flush(stats,-1);
1492
1493 output_stats(stats);
1494
1495 bam_destroy1(bam_line);
1496 samclose(stats->sam);
1497 if (stats->fai) fai_destroy(stats->fai);
1498 free(stats->cov_rbuf.buffer); free(stats->cov);
1499 free(stats->quals_1st); free(stats->quals_2nd);
1500 free(stats->gc_1st); free(stats->gc_2nd);
1501 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1502 free(stats->gcd);
1503 free(stats->rseq_buf);
1504 free(stats->mpc_buf);
1505 free(stats->acgt_cycles);
1506 free(stats->read_lengths);
1507 free(stats->insertions);
1508 free(stats->deletions);
1509 free(stats->ins_cycles_1st);
1510 free(stats->ins_cycles_2nd);
1511 free(stats->del_cycles_1st);
1512 free(stats->del_cycles_2nd);
1513 destroy_regions(stats);
1514 free(stats);
1515 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
1516
1517 return 0;
1518 }
1519
1520
1521