Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/bamcheck.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 /* | |
2 Author: petr.danecek@sanger | |
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread | |
4 | |
5 Assumptions, approximations and other issues: | |
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented. | |
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k). | |
8 - coverage distribution ignores softclips and deletions | |
9 - some stats require sorted BAMs | |
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths. | |
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq) | |
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar) | |
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is | |
14 good enough to include the read in the stats. | |
15 | |
16 */ | |
17 | |
18 #define BAMCHECK_VERSION "2012-09-04" | |
19 | |
20 #define _ISOC99_SOURCE | |
21 #include <stdio.h> | |
22 #include <stdlib.h> | |
23 #include <stdarg.h> | |
24 #include <string.h> | |
25 #include <math.h> | |
26 #include <ctype.h> | |
27 #include <getopt.h> | |
28 #include <errno.h> | |
29 #include <assert.h> | |
30 #include "faidx.h" | |
31 #include "khash.h" | |
32 #include "sam.h" | |
33 #include "sam_header.h" | |
34 #include "razf.h" | |
35 | |
36 #define BWA_MIN_RDLEN 35 | |
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) | |
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) | |
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) | |
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) | |
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) | |
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) | |
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) | |
44 | |
45 typedef struct | |
46 { | |
47 int32_t line_len, line_blen; | |
48 int64_t len; | |
49 uint64_t offset; | |
50 } | |
51 faidx1_t; | |
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) | |
53 KHASH_MAP_INIT_STR(kh_bam_tid, int) | |
54 KHASH_MAP_INIT_STR(kh_rg, const char *) | |
55 struct __faidx_t { | |
56 RAZF *rz; | |
57 int n, m; | |
58 char **name; | |
59 khash_t(kh_faidx) *hash; | |
60 }; | |
61 | |
62 typedef struct | |
63 { | |
64 float gc; | |
65 uint32_t depth; | |
66 } | |
67 gc_depth_t; | |
68 | |
69 // For coverage distribution, a simple pileup | |
70 typedef struct | |
71 { | |
72 int64_t pos; | |
73 int size, start; | |
74 int *buffer; | |
75 } | |
76 round_buffer_t; | |
77 | |
78 typedef struct { uint32_t from, to; } pos_t; | |
79 typedef struct | |
80 { | |
81 int npos,mpos,cpos; | |
82 pos_t *pos; | |
83 } | |
84 regions_t; | |
85 | |
86 typedef struct | |
87 { | |
88 // Parameters | |
89 int trim_qual; // bwa trim quality | |
90 | |
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd), | |
92 // insert size histogram holder | |
93 int nquals; // The number of quality bins | |
94 int nbases; // The maximum sequence length the allocated array can hold | |
95 int nisize; // The maximum insert size that the allocated array can hold | |
96 int ngc; // The size of gc_1st and gc_2nd | |
97 int nindels; // The maximum indel length for indel distribution | |
98 | |
99 // Arrays for the histogram data | |
100 uint64_t *quals_1st, *quals_2nd; | |
101 uint64_t *gc_1st, *gc_2nd; | |
102 uint64_t *isize_inward, *isize_outward, *isize_other; | |
103 uint64_t *acgt_cycles; | |
104 uint64_t *read_lengths; | |
105 uint64_t *insertions, *deletions; | |
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; | |
107 | |
108 // The extremes encountered | |
109 int max_len; // Maximum read length | |
110 int max_qual; // Maximum quality | |
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part | |
112 int is_sorted; | |
113 | |
114 // Summary numbers | |
115 uint64_t total_len; | |
116 uint64_t total_len_dup; | |
117 uint64_t nreads_1st; | |
118 uint64_t nreads_2nd; | |
119 uint64_t nreads_filtered; | |
120 uint64_t nreads_dup; | |
121 uint64_t nreads_unmapped; | |
122 uint64_t nreads_unpaired; | |
123 uint64_t nreads_paired; | |
124 uint64_t nreads_anomalous; | |
125 uint64_t nreads_mq0; | |
126 uint64_t nbases_mapped; | |
127 uint64_t nbases_mapped_cigar; | |
128 uint64_t nbases_trimmed; // bwa trimmed bases | |
129 uint64_t nmismatches; | |
130 uint64_t nreads_QCfailed, nreads_secondary; | |
131 | |
132 // GC-depth related data | |
133 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin | |
134 gc_depth_t *gcd; // The GC-depth bins holder | |
135 int gcd_bin_size; // The size of GC-depth bin | |
136 uint32_t gcd_ref_size; // The approximate size of the genome | |
137 int32_t tid, gcd_pos; // Position of the current bin | |
138 int32_t pos; // Position of the last read | |
139 | |
140 // Coverage distribution related data | |
141 int ncov; // The number of coverage bins | |
142 uint64_t *cov; // The coverage frequencies | |
143 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins | |
144 round_buffer_t cov_rbuf; // Pileup round buffer | |
145 | |
146 // Mismatches by read cycle | |
147 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against | |
148 int mrseq_buf; // The size of the buffer | |
149 int32_t rseq_pos; // The coordinate of the first base in the buffer | |
150 int32_t nrseq_buf; // The used part of the buffer | |
151 uint64_t *mpc_buf; // Mismatches per cycle | |
152 | |
153 // Filters | |
154 int filter_readlen; | |
155 | |
156 // Target regions | |
157 int nregions, reg_from,reg_to; | |
158 regions_t *regions; | |
159 | |
160 // Auxiliary data | |
161 int flag_require, flag_filter; | |
162 double sum_qual; // For calculating average quality value | |
163 samfile_t *sam; | |
164 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated | |
165 faidx_t *fai; // Reference sequence for GC-depth graph | |
166 int argc; // Command line arguments to be printed on the output | |
167 char **argv; | |
168 } | |
169 stats_t; | |
170 | |
171 void error(const char *format, ...); | |
172 void bam_init_header_hash(bam_header_t *header); | |
173 int is_in_regions(bam1_t *bam_line, stats_t *stats); | |
174 | |
175 | |
176 // Coverage distribution methods | |
177 inline int coverage_idx(int min, int max, int n, int step, int depth) | |
178 { | |
179 if ( depth < min ) | |
180 return 0; | |
181 | |
182 if ( depth > max ) | |
183 return n-1; | |
184 | |
185 return 1 + (depth - min) / step; | |
186 } | |
187 | |
188 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) | |
189 { | |
190 return (offset + (pos-refpos) % size) % size; | |
191 } | |
192 | |
193 void round_buffer_flush(stats_t *stats, int64_t pos) | |
194 { | |
195 int ibuf,idp; | |
196 | |
197 if ( pos==stats->cov_rbuf.pos ) | |
198 return; | |
199 | |
200 int64_t new_pos = pos; | |
201 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) | |
202 { | |
203 // Flush the whole buffer, but in sequential order, | |
204 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1; | |
205 } | |
206 | |
207 if ( pos < stats->cov_rbuf.pos ) | |
208 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); | |
209 | |
210 int ifrom = stats->cov_rbuf.start; | |
211 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); | |
212 if ( ifrom>ito ) | |
213 { | |
214 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++) | |
215 { | |
216 if ( !stats->cov_rbuf.buffer[ibuf] ) | |
217 continue; | |
218 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); | |
219 stats->cov[idp]++; | |
220 stats->cov_rbuf.buffer[ibuf] = 0; | |
221 } | |
222 ifrom = 0; | |
223 } | |
224 for (ibuf=ifrom; ibuf<=ito; ibuf++) | |
225 { | |
226 if ( !stats->cov_rbuf.buffer[ibuf] ) | |
227 continue; | |
228 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); | |
229 stats->cov[idp]++; | |
230 stats->cov_rbuf.buffer[ibuf] = 0; | |
231 } | |
232 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); | |
233 stats->cov_rbuf.pos = new_pos; | |
234 } | |
235 | |
236 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) | |
237 { | |
238 if ( to-from >= rbuf->size ) | |
239 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); | |
240 if ( from < rbuf->pos ) | |
241 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); | |
242 | |
243 int ifrom,ito,ibuf; | |
244 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); | |
245 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); | |
246 if ( ifrom>ito ) | |
247 { | |
248 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++) | |
249 rbuf->buffer[ibuf]++; | |
250 ifrom = 0; | |
251 } | |
252 for (ibuf=ifrom; ibuf<=ito; ibuf++) | |
253 rbuf->buffer[ibuf]++; | |
254 } | |
255 | |
256 // Calculate the number of bases in the read trimmed by BWA | |
257 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) | |
258 { | |
259 if ( len<BWA_MIN_RDLEN ) return 0; | |
260 | |
261 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN, | |
262 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c). | |
263 int max_trimmed = len - BWA_MIN_RDLEN + 1; | |
264 int l, sum=0, max_sum=0, max_l=0; | |
265 | |
266 for (l=0; l<max_trimmed; l++) | |
267 { | |
268 sum += trim_qual - quals[ reverse ? l : len-1-l ]; | |
269 if ( sum<0 ) break; | |
270 if ( sum>max_sum ) | |
271 { | |
272 max_sum = sum; | |
273 // This is the correct way, but bwa clips from some reason one base less | |
274 // max_l = l+1; | |
275 max_l = l; | |
276 } | |
277 } | |
278 return max_l; | |
279 } | |
280 | |
281 | |
282 void count_indels(stats_t *stats,bam1_t *bam_line) | |
283 { | |
284 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; | |
285 int is_1st = IS_READ1(bam_line) ? 1 : 0; | |
286 int icig; | |
287 int icycle = 0; | |
288 int read_len = bam_line->core.l_qseq; | |
289 for (icig=0; icig<bam_line->core.n_cigar; icig++) | |
290 { | |
291 // Conversion from uint32_t to MIDNSHP | |
292 // 0123456 | |
293 // MIDNSHP | |
294 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; | |
295 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; | |
296 | |
297 if ( cig==1 ) | |
298 { | |
299 int idx = is_fwd ? icycle : read_len-icycle-ncig; | |
300 if ( idx<0 ) | |
301 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); | |
302 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); | |
303 if ( is_1st ) | |
304 stats->ins_cycles_1st[idx]++; | |
305 else | |
306 stats->ins_cycles_2nd[idx]++; | |
307 icycle += ncig; | |
308 if ( ncig<=stats->nindels ) | |
309 stats->insertions[ncig-1]++; | |
310 continue; | |
311 } | |
312 if ( cig==2 ) | |
313 { | |
314 int idx = is_fwd ? icycle-1 : read_len-icycle-1; | |
315 if ( idx<0 ) continue; // discard meaningless deletions | |
316 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); | |
317 if ( is_1st ) | |
318 stats->del_cycles_1st[idx]++; | |
319 else | |
320 stats->del_cycles_2nd[idx]++; | |
321 if ( ncig<=stats->nindels ) | |
322 stats->deletions[ncig-1]++; | |
323 continue; | |
324 } | |
325 if ( cig!=3 && cig!=5 ) | |
326 icycle += ncig; | |
327 } | |
328 } | |
329 | |
330 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) | |
331 { | |
332 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; | |
333 int icig,iread=0,icycle=0; | |
334 int iref = bam_line->core.pos - stats->rseq_pos; | |
335 int read_len = bam_line->core.l_qseq; | |
336 uint8_t *read = bam1_seq(bam_line); | |
337 uint8_t *quals = bam1_qual(bam_line); | |
338 uint64_t *mpc_buf = stats->mpc_buf; | |
339 for (icig=0; icig<bam_line->core.n_cigar; icig++) | |
340 { | |
341 // Conversion from uint32_t to MIDNSHP | |
342 // 0123456 | |
343 // MIDNSHP | |
344 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; | |
345 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; | |
346 if ( cig==1 ) | |
347 { | |
348 iread += ncig; | |
349 icycle += ncig; | |
350 continue; | |
351 } | |
352 if ( cig==2 ) | |
353 { | |
354 iref += ncig; | |
355 continue; | |
356 } | |
357 if ( cig==4 ) | |
358 { | |
359 icycle += ncig; | |
360 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence | |
361 // iref += ncig; | |
362 iread += ncig; | |
363 continue; | |
364 } | |
365 if ( cig==5 ) | |
366 { | |
367 icycle += ncig; | |
368 continue; | |
369 } | |
370 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large | |
371 // chunk of refseq in memory. Not very frequent and not noticable in the stats. | |
372 if ( cig==3 || cig==5 ) continue; | |
373 if ( cig!=0 ) | |
374 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); | |
375 | |
376 if ( ncig+iref > stats->nrseq_buf ) | |
377 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1); | |
378 | |
379 int im; | |
380 for (im=0; im<ncig; im++) | |
381 { | |
382 uint8_t cread = bam1_seqi(read,iread); | |
383 uint8_t cref = stats->rseq_buf[iref]; | |
384 | |
385 // ---------------15 | |
386 // =ACMGRSVTWYHKDBN | |
387 if ( cread==15 ) | |
388 { | |
389 int idx = is_fwd ? icycle : read_len-icycle-1; | |
390 if ( idx>stats->max_len ) | |
391 error("mpc: %d>%d\n",idx,stats->max_len); | |
392 idx = idx*stats->nquals; | |
393 if ( idx>=stats->nquals*stats->nbases ) | |
394 error("FIXME: mpc_buf overflow\n"); | |
395 mpc_buf[idx]++; | |
396 } | |
397 else if ( cref && cread && cref!=cread ) | |
398 { | |
399 uint8_t qual = quals[iread] + 1; | |
400 if ( qual>=stats->nquals ) | |
401 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); | |
402 | |
403 int idx = is_fwd ? icycle : read_len-icycle-1; | |
404 if ( idx>stats->max_len ) | |
405 error("mpc: %d>%d\n",idx,stats->max_len); | |
406 | |
407 idx = idx*stats->nquals + qual; | |
408 if ( idx>=stats->nquals*stats->nbases ) | |
409 error("FIXME: mpc_buf overflow\n"); | |
410 mpc_buf[idx]++; | |
411 } | |
412 | |
413 iref++; | |
414 iread++; | |
415 icycle++; | |
416 } | |
417 } | |
418 } | |
419 | |
420 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) | |
421 { | |
422 khash_t(kh_faidx) *h; | |
423 khiter_t iter; | |
424 faidx1_t val; | |
425 char *chr, c; | |
426 faidx_t *fai = stats->fai; | |
427 | |
428 h = fai->hash; | |
429 chr = stats->sam->header->target_name[tid]; | |
430 | |
431 // ID of the sequence name | |
432 iter = kh_get(kh_faidx, h, chr); | |
433 if (iter == kh_end(h)) | |
434 error("No such reference sequence [%s]?\n", chr); | |
435 val = kh_value(h, iter); | |
436 | |
437 // Check the boundaries | |
438 if (pos >= val.len) | |
439 error("Was the bam file mapped with the reference sequence supplied?" | |
440 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len); | |
441 int size = stats->mrseq_buf; | |
442 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's. | |
443 if (size+pos > val.len) size = val.len-pos; | |
444 | |
445 // Position the razf reader | |
446 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET); | |
447 | |
448 uint8_t *ptr = stats->rseq_buf; | |
449 int nread = 0; | |
450 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err ) | |
451 { | |
452 if ( !isgraph(c) ) | |
453 continue; | |
454 | |
455 // Conversion between uint8_t coding and ACGT | |
456 // -12-4---8------- | |
457 // =ACMGRSVTWYHKDBN | |
458 if ( c=='A' || c=='a' ) | |
459 *ptr = 1; | |
460 else if ( c=='C' || c=='c' ) | |
461 *ptr = 2; | |
462 else if ( c=='G' || c=='g' ) | |
463 *ptr = 4; | |
464 else if ( c=='T' || c=='t' ) | |
465 *ptr = 8; | |
466 else | |
467 *ptr = 0; | |
468 ptr++; | |
469 nread++; | |
470 } | |
471 if ( nread < stats->mrseq_buf ) | |
472 { | |
473 memset(ptr,0, stats->mrseq_buf - nread); | |
474 nread = stats->mrseq_buf; | |
475 } | |
476 stats->nrseq_buf = nread; | |
477 stats->rseq_pos = pos; | |
478 stats->tid = tid; | |
479 } | |
480 | |
481 float fai_gc_content(stats_t *stats, int pos, int len) | |
482 { | |
483 uint32_t gc,count,c; | |
484 int i = pos - stats->rseq_pos, ito = i + len; | |
485 assert( i>=0 && ito<=stats->nrseq_buf ); | |
486 | |
487 // Count GC content | |
488 gc = count = 0; | |
489 for (; i<ito; i++) | |
490 { | |
491 c = stats->rseq_buf[i]; | |
492 if ( c==2 || c==4 ) | |
493 { | |
494 gc++; | |
495 count++; | |
496 } | |
497 else if ( c==1 || c==8 ) | |
498 count++; | |
499 } | |
500 return count ? (float)gc/count : 0; | |
501 } | |
502 | |
503 void realloc_rseq_buffer(stats_t *stats) | |
504 { | |
505 int n = stats->nbases*10; | |
506 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size; | |
507 if ( stats->mrseq_buf<n ) | |
508 { | |
509 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); | |
510 stats->mrseq_buf = n; | |
511 } | |
512 } | |
513 | |
514 void realloc_gcd_buffer(stats_t *stats, int seq_len) | |
515 { | |
516 if ( seq_len >= stats->gcd_bin_size ) | |
517 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len); | |
518 | |
519 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len); | |
520 if ( n <= stats->igcd ) | |
521 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n"); | |
522 | |
523 if ( n > stats->ngcd ) | |
524 { | |
525 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t)); | |
526 if ( !stats->gcd ) | |
527 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n); | |
528 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t)); | |
529 stats->ngcd = n; | |
530 } | |
531 | |
532 realloc_rseq_buffer(stats); | |
533 } | |
534 | |
535 void realloc_buffers(stats_t *stats, int seq_len) | |
536 { | |
537 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases; | |
538 | |
539 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t)); | |
540 if ( !stats->quals_1st ) | |
541 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); | |
542 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); | |
543 | |
544 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t)); | |
545 if ( !stats->quals_2nd ) | |
546 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); | |
547 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); | |
548 | |
549 if ( stats->mpc_buf ) | |
550 { | |
551 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t)); | |
552 if ( !stats->mpc_buf ) | |
553 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); | |
554 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); | |
555 } | |
556 | |
557 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t)); | |
558 if ( !stats->acgt_cycles ) | |
559 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t)); | |
560 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t)); | |
561 | |
562 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); | |
563 if ( !stats->read_lengths ) | |
564 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); | |
565 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
566 | |
567 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t)); | |
568 if ( !stats->insertions ) | |
569 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); | |
570 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
571 | |
572 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t)); | |
573 if ( !stats->deletions ) | |
574 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); | |
575 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
576 | |
577 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t)); | |
578 if ( !stats->ins_cycles_1st ) | |
579 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); | |
580 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
581 | |
582 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t)); | |
583 if ( !stats->ins_cycles_2nd ) | |
584 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); | |
585 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
586 | |
587 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t)); | |
588 if ( !stats->del_cycles_1st ) | |
589 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); | |
590 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
591 | |
592 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t)); | |
593 if ( !stats->del_cycles_2nd ) | |
594 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); | |
595 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); | |
596 | |
597 stats->nbases = n; | |
598 | |
599 // Realloc the coverage distribution buffer | |
600 int *rbuffer = calloc(sizeof(int),seq_len*5); | |
601 n = stats->cov_rbuf.size-stats->cov_rbuf.start; | |
602 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); | |
603 if ( stats->cov_rbuf.start>1 ) | |
604 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start); | |
605 stats->cov_rbuf.start = 0; | |
606 free(stats->cov_rbuf.buffer); | |
607 stats->cov_rbuf.buffer = rbuffer; | |
608 stats->cov_rbuf.size = seq_len*5; | |
609 | |
610 realloc_rseq_buffer(stats); | |
611 } | |
612 | |
613 void collect_stats(bam1_t *bam_line, stats_t *stats) | |
614 { | |
615 if ( stats->rg_hash ) | |
616 { | |
617 const uint8_t *rg = bam_aux_get(bam_line, "RG"); | |
618 if ( !rg ) return; | |
619 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); | |
620 if ( k == kh_end(stats->rg_hash) ) return; | |
621 } | |
622 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) | |
623 { | |
624 stats->nreads_filtered++; | |
625 return; | |
626 } | |
627 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) | |
628 { | |
629 stats->nreads_filtered++; | |
630 return; | |
631 } | |
632 if ( !is_in_regions(bam_line,stats) ) | |
633 return; | |
634 if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) | |
635 return; | |
636 | |
637 if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; | |
638 if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++; | |
639 | |
640 int seq_len = bam_line->core.l_qseq; | |
641 if ( !seq_len ) return; | |
642 | |
643 if ( seq_len >= stats->nbases ) | |
644 realloc_buffers(stats,seq_len); | |
645 if ( stats->max_len<seq_len ) | |
646 stats->max_len = seq_len; | |
647 | |
648 stats->read_lengths[seq_len]++; | |
649 | |
650 // Count GC and ACGT per cycle | |
651 uint8_t base, *seq = bam1_seq(bam_line); | |
652 int gc_count = 0; | |
653 int i; | |
654 int reverse = IS_REVERSE(bam_line); | |
655 for (i=0; i<seq_len; i++) | |
656 { | |
657 // Conversion from uint8_t coding to ACGT | |
658 // -12-4---8------- | |
659 // =ACMGRSVTWYHKDBN | |
660 // 01 2 3 | |
661 base = bam1_seqi(seq,i); | |
662 base /= 2; | |
663 if ( base==1 || base==2 ) gc_count++; | |
664 else if ( base>2 ) base=3; | |
665 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) | |
666 error("FIXME: acgt_cycles\n"); | |
667 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++; | |
668 } | |
669 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len; | |
670 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len; | |
671 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1; | |
672 | |
673 // Determine which array (1st or 2nd read) will these stats go to, | |
674 // trim low quality bases from end the same way BWA does, | |
675 // fill GC histogram | |
676 uint64_t *quals; | |
677 uint8_t *bam_quals = bam1_qual(bam_line); | |
678 if ( bam_line->core.flag&BAM_FREAD2 ) | |
679 { | |
680 quals = stats->quals_2nd; | |
681 stats->nreads_2nd++; | |
682 for (i=gc_idx_min; i<gc_idx_max; i++) | |
683 stats->gc_2nd[i]++; | |
684 } | |
685 else | |
686 { | |
687 quals = stats->quals_1st; | |
688 stats->nreads_1st++; | |
689 for (i=gc_idx_min; i<gc_idx_max; i++) | |
690 stats->gc_1st[i]++; | |
691 } | |
692 if ( stats->trim_qual>0 ) | |
693 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse); | |
694 | |
695 // Quality histogram and average quality | |
696 for (i=0; i<seq_len; i++) | |
697 { | |
698 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i]; | |
699 if ( qual>=stats->nquals ) | |
700 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); | |
701 if ( qual>stats->max_qual ) | |
702 stats->max_qual = qual; | |
703 | |
704 quals[ i*stats->nquals+qual ]++; | |
705 stats->sum_qual += qual; | |
706 } | |
707 | |
708 // Look at the flags and increment appropriate counters (mapped, paired, etc) | |
709 if ( IS_UNMAPPED(bam_line) ) | |
710 stats->nreads_unmapped++; | |
711 else | |
712 { | |
713 if ( !bam_line->core.qual ) | |
714 stats->nreads_mq0++; | |
715 | |
716 count_indels(stats,bam_line); | |
717 | |
718 if ( !IS_PAIRED(bam_line) ) | |
719 stats->nreads_unpaired++; | |
720 else | |
721 { | |
722 stats->nreads_paired++; | |
723 | |
724 if ( bam_line->core.tid!=bam_line->core.mtid ) | |
725 stats->nreads_anomalous++; | |
726 | |
727 // The insert size is tricky, because for long inserts the libraries are | |
728 // prepared differently and the pairs point in other direction. BWA does | |
729 // not set the paired flag for them. Similar thing is true also for 454 | |
730 // reads. Mates mapped to different chromosomes have isize==0. | |
731 int32_t isize = bam_line->core.isize; | |
732 if ( isize<0 ) isize = -isize; | |
733 if ( isize >= stats->nisize ) | |
734 isize = stats->nisize-1; | |
735 if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) | |
736 { | |
737 int pos_fst = bam_line->core.mpos - bam_line->core.pos; | |
738 int is_fst = IS_READ1(bam_line) ? 1 : -1; | |
739 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; | |
740 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1; | |
741 | |
742 if ( is_fwd*is_mfwd>0 ) | |
743 stats->isize_other[isize]++; | |
744 else if ( is_fst*pos_fst>0 ) | |
745 { | |
746 if ( is_fst*is_fwd>0 ) | |
747 stats->isize_inward[isize]++; | |
748 else | |
749 stats->isize_outward[isize]++; | |
750 } | |
751 else if ( is_fst*pos_fst<0 ) | |
752 { | |
753 if ( is_fst*is_fwd>0 ) | |
754 stats->isize_outward[isize]++; | |
755 else | |
756 stats->isize_inward[isize]++; | |
757 } | |
758 } | |
759 } | |
760 | |
761 // Number of mismatches | |
762 uint8_t *nm = bam_aux_get(bam_line,"NM"); | |
763 if (nm) | |
764 stats->nmismatches += bam_aux2i(nm); | |
765 | |
766 // Number of mapped bases from cigar | |
767 // Conversion from uint32_t to MIDNSHP | |
768 // 012-4-- | |
769 // MIDNSHP | |
770 if ( bam_line->core.n_cigar == 0) | |
771 error("FIXME: mapped read with no cigar?\n"); | |
772 int readlen=seq_len; | |
773 if ( stats->regions ) | |
774 { | |
775 // Count only on-target bases | |
776 int iref = bam_line->core.pos + 1; | |
777 for (i=0; i<bam_line->core.n_cigar; i++) | |
778 { | |
779 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK; | |
780 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; | |
781 if ( cig==2 ) readlen += ncig; | |
782 else if ( cig==0 ) | |
783 { | |
784 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref; | |
785 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to; | |
786 if ( ncig<0 ) ncig = 0; | |
787 stats->nbases_mapped_cigar += ncig; | |
788 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; | |
789 } | |
790 else if ( cig==1 ) | |
791 { | |
792 iref += ncig; | |
793 if ( iref>=stats->reg_from && iref<=stats->reg_to ) | |
794 stats->nbases_mapped_cigar += ncig; | |
795 } | |
796 } | |
797 } | |
798 else | |
799 { | |
800 // Count the whole read | |
801 for (i=0; i<bam_line->core.n_cigar; i++) | |
802 { | |
803 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) | |
804 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; | |
805 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) | |
806 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; | |
807 } | |
808 } | |
809 stats->nbases_mapped += seq_len; | |
810 | |
811 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos ) | |
812 stats->is_sorted = 0; | |
813 stats->pos = bam_line->core.pos; | |
814 | |
815 if ( stats->is_sorted ) | |
816 { | |
817 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid ) | |
818 round_buffer_flush(stats,-1); | |
819 | |
820 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins | |
821 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is | |
822 // 20kbp, so the effect is negligible. | |
823 if ( stats->fai ) | |
824 { | |
825 int inc_ref = 0, inc_gcd = 0; | |
826 // First pass or new chromosome | |
827 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; } | |
828 // Read goes beyond the end of the rseq buffer | |
829 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; } | |
830 // Read overlaps the next gcd bin | |
831 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen ) | |
832 { | |
833 inc_gcd = 1; | |
834 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1; | |
835 } | |
836 if ( inc_gcd ) | |
837 { | |
838 stats->igcd++; | |
839 if ( stats->igcd >= stats->ngcd ) | |
840 realloc_gcd_buffer(stats, readlen); | |
841 if ( inc_ref ) | |
842 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); | |
843 stats->gcd_pos = bam_line->core.pos; | |
844 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size); | |
845 } | |
846 | |
847 count_mismatches_per_cycle(stats,bam_line); | |
848 } | |
849 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin | |
850 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) | |
851 { | |
852 // First pass or a new chromosome | |
853 stats->tid = bam_line->core.tid; | |
854 stats->gcd_pos = bam_line->core.pos; | |
855 stats->igcd++; | |
856 if ( stats->igcd >= stats->ngcd ) | |
857 realloc_gcd_buffer(stats, readlen); | |
858 } | |
859 stats->gcd[ stats->igcd ].depth++; | |
860 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK) | |
861 if ( !stats->fai ) | |
862 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len; | |
863 | |
864 // Coverage distribution graph | |
865 round_buffer_flush(stats,bam_line->core.pos); | |
866 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1); | |
867 } | |
868 } | |
869 | |
870 stats->total_len += seq_len; | |
871 if ( IS_DUP(bam_line) ) | |
872 { | |
873 stats->total_len_dup += seq_len; | |
874 stats->nreads_dup++; | |
875 } | |
876 } | |
877 | |
878 // Sort by GC and depth | |
879 #define GCD_t(x) ((gc_depth_t *)x) | |
880 static int gcd_cmp(const void *a, const void *b) | |
881 { | |
882 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1; | |
883 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1; | |
884 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1; | |
885 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1; | |
886 return 0; | |
887 } | |
888 #undef GCD_t | |
889 | |
890 float gcd_percentile(gc_depth_t *gcd, int N, int p) | |
891 { | |
892 float n,d; | |
893 int k; | |
894 | |
895 n = p*(N+1)/100; | |
896 k = n; | |
897 if ( k<=0 ) | |
898 return gcd[0].depth; | |
899 if ( k>=N ) | |
900 return gcd[N-1].depth; | |
901 | |
902 d = n - k; | |
903 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth); | |
904 } | |
905 | |
906 void output_stats(stats_t *stats) | |
907 { | |
908 // Calculate average insert size and standard deviation (from the main bulk data only) | |
909 int isize, ibulk=0; | |
910 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0; | |
911 for (isize=0; isize<stats->nisize; isize++) | |
912 { | |
913 // Each pair was counted twice | |
914 stats->isize_inward[isize] *= 0.5; | |
915 stats->isize_outward[isize] *= 0.5; | |
916 stats->isize_other[isize] *= 0.5; | |
917 | |
918 nisize_inward += stats->isize_inward[isize]; | |
919 nisize_outward += stats->isize_outward[isize]; | |
920 nisize_other += stats->isize_other[isize]; | |
921 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; | |
922 } | |
923 | |
924 double bulk=0, avg_isize=0, sd_isize=0; | |
925 for (isize=0; isize<stats->nisize; isize++) | |
926 { | |
927 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; | |
928 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]); | |
929 | |
930 if ( bulk/nisize > stats->isize_main_bulk ) | |
931 { | |
932 ibulk = isize+1; | |
933 nisize = bulk; | |
934 break; | |
935 } | |
936 } | |
937 avg_isize /= nisize ? nisize : 1; | |
938 for (isize=1; isize<ibulk; isize++) | |
939 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize; | |
940 sd_isize = sqrt(sd_isize); | |
941 | |
942 | |
943 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION); | |
944 printf("# The command line was: %s",stats->argv[0]); | |
945 int i; | |
946 for (i=1; i<stats->argc; i++) | |
947 printf(" %s",stats->argv[i]); | |
948 printf("\n"); | |
949 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); | |
950 printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); | |
951 printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); | |
952 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); | |
953 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); | |
954 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); | |
955 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); | |
956 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); | |
957 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired)); | |
958 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped); | |
959 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired); | |
960 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); | |
961 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); | |
962 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); | |
963 printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); | |
964 printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); | |
965 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); | |
966 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); | |
967 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); | |
968 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed); | |
969 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); | |
970 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches); | |
971 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); | |
972 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; | |
973 printf("SN\taverage length:\t%.0f\n", avg_read_length); | |
974 printf("SN\tmaximum length:\t%d\n", stats->max_len); | |
975 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); | |
976 printf("SN\tinsert size average:\t%.1f\n", avg_isize); | |
977 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); | |
978 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); | |
979 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); | |
980 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); | |
981 printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); | |
982 | |
983 int ibase,iqual; | |
984 if ( stats->max_len<stats->nbases ) stats->max_len++; | |
985 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++; | |
986 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n"); | |
987 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); | |
988 for (ibase=0; ibase<stats->max_len; ibase++) | |
989 { | |
990 printf("FFQ\t%d",ibase+1); | |
991 for (iqual=0; iqual<=stats->max_qual; iqual++) | |
992 { | |
993 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]); | |
994 } | |
995 printf("\n"); | |
996 } | |
997 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n"); | |
998 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); | |
999 for (ibase=0; ibase<stats->max_len; ibase++) | |
1000 { | |
1001 printf("LFQ\t%d",ibase+1); | |
1002 for (iqual=0; iqual<=stats->max_qual; iqual++) | |
1003 { | |
1004 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]); | |
1005 } | |
1006 printf("\n"); | |
1007 } | |
1008 if ( stats->mpc_buf ) | |
1009 { | |
1010 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n"); | |
1011 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n"); | |
1012 printf("# is the number of N's and the rest is the number of mismatches\n"); | |
1013 for (ibase=0; ibase<stats->max_len; ibase++) | |
1014 { | |
1015 printf("MPC\t%d",ibase+1); | |
1016 for (iqual=0; iqual<=stats->max_qual; iqual++) | |
1017 { | |
1018 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]); | |
1019 } | |
1020 printf("\n"); | |
1021 } | |
1022 } | |
1023 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n"); | |
1024 int ibase_prev = 0; | |
1025 for (ibase=0; ibase<stats->ngc; ibase++) | |
1026 { | |
1027 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; | |
1028 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]); | |
1029 ibase_prev = ibase; | |
1030 } | |
1031 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); | |
1032 ibase_prev = 0; | |
1033 for (ibase=0; ibase<stats->ngc; ibase++) | |
1034 { | |
1035 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; | |
1036 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]); | |
1037 ibase_prev = ibase; | |
1038 } | |
1039 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); | |
1040 for (ibase=0; ibase<stats->max_len; ibase++) | |
1041 { | |
1042 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]); | |
1043 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3]; | |
1044 if ( ! sum ) continue; | |
1045 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); | |
1046 } | |
1047 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); | |
1048 for (isize=0; isize<ibulk; isize++) | |
1049 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), | |
1050 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); | |
1051 | |
1052 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); | |
1053 int ilen; | |
1054 for (ilen=0; ilen<stats->max_len; ilen++) | |
1055 { | |
1056 if ( stats->read_lengths[ilen]>0 ) | |
1057 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]); | |
1058 } | |
1059 | |
1060 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); | |
1061 for (ilen=0; ilen<stats->nindels; ilen++) | |
1062 { | |
1063 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) | |
1064 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]); | |
1065 } | |
1066 | |
1067 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n"); | |
1068 for (ilen=0; ilen<=stats->nbases; ilen++) | |
1069 { | |
1070 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions | |
1071 // the index of the cycle of the first inserted base (also 1-based) | |
1072 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 ) | |
1073 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); | |
1074 } | |
1075 | |
1076 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); | |
1077 if ( stats->cov[0] ) | |
1078 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]); | |
1079 int icov; | |
1080 for (icov=1; icov<stats->ncov-1; icov++) | |
1081 if ( stats->cov[icov] ) | |
1082 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]); | |
1083 if ( stats->cov[stats->ncov-1] ) | |
1084 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]); | |
1085 | |
1086 // Calculate average GC content, then sort by GC and depth | |
1087 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); | |
1088 uint32_t igcd; | |
1089 for (igcd=0; igcd<stats->igcd; igcd++) | |
1090 { | |
1091 if ( stats->fai ) | |
1092 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc); | |
1093 else | |
1094 if ( stats->gcd[igcd].depth ) | |
1095 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); | |
1096 } | |
1097 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); | |
1098 igcd = 0; | |
1099 while ( igcd < stats->igcd ) | |
1100 { | |
1101 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print | |
1102 uint32_t nbins=0, itmp=igcd; | |
1103 float gc = stats->gcd[igcd].gc; | |
1104 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) | |
1105 { | |
1106 nbins++; | |
1107 itmp++; | |
1108 } | |
1109 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), | |
1110 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size, | |
1111 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, | |
1112 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, | |
1113 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, | |
1114 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size | |
1115 ); | |
1116 igcd += nbins; | |
1117 } | |
1118 } | |
1119 | |
1120 size_t mygetline(char **line, size_t *n, FILE *fp) | |
1121 { | |
1122 if (line == NULL || n == NULL || fp == NULL) | |
1123 { | |
1124 errno = EINVAL; | |
1125 return -1; | |
1126 } | |
1127 if (*n==0 || !*line) | |
1128 { | |
1129 *line = NULL; | |
1130 *n = 0; | |
1131 } | |
1132 | |
1133 size_t nread=0; | |
1134 int c; | |
1135 while ((c=getc(fp))!= EOF && c!='\n') | |
1136 { | |
1137 if ( ++nread>=*n ) | |
1138 { | |
1139 *n += 255; | |
1140 *line = realloc(*line, sizeof(char)*(*n)); | |
1141 } | |
1142 (*line)[nread-1] = c; | |
1143 } | |
1144 if ( nread>=*n ) | |
1145 { | |
1146 *n += 255; | |
1147 *line = realloc(*line, sizeof(char)*(*n)); | |
1148 } | |
1149 (*line)[nread] = 0; | |
1150 return nread>0 ? nread : -1; | |
1151 | |
1152 } | |
1153 | |
1154 void init_regions(stats_t *stats, char *file) | |
1155 { | |
1156 khiter_t iter; | |
1157 khash_t(kh_bam_tid) *header_hash; | |
1158 | |
1159 bam_init_header_hash(stats->sam->header); | |
1160 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; | |
1161 | |
1162 FILE *fp = fopen(file,"r"); | |
1163 if ( !fp ) error("%s: %s\n",file,strerror(errno)); | |
1164 | |
1165 char *line = NULL; | |
1166 size_t len = 0; | |
1167 ssize_t nread; | |
1168 int warned = 0; | |
1169 int prev_tid=-1, prev_pos=-1; | |
1170 while ((nread = mygetline(&line, &len, fp)) != -1) | |
1171 { | |
1172 if ( line[0] == '#' ) continue; | |
1173 | |
1174 int i = 0; | |
1175 while ( i<nread && !isspace(line[i]) ) i++; | |
1176 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line); | |
1177 line[i] = 0; | |
1178 | |
1179 iter = kh_get(kh_bam_tid, header_hash, line); | |
1180 int tid = kh_val(header_hash, iter); | |
1181 if ( iter == kh_end(header_hash) ) | |
1182 { | |
1183 if ( !warned ) | |
1184 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line); | |
1185 warned = 1; | |
1186 continue; | |
1187 } | |
1188 | |
1189 if ( tid >= stats->nregions ) | |
1190 { | |
1191 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); | |
1192 int j; | |
1193 for (j=stats->nregions; j<stats->nregions+100; j++) | |
1194 { | |
1195 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; | |
1196 stats->regions[j].pos = NULL; | |
1197 } | |
1198 stats->nregions += 100; | |
1199 } | |
1200 int npos = stats->regions[tid].npos; | |
1201 if ( npos >= stats->regions[tid].mpos ) | |
1202 { | |
1203 stats->regions[tid].mpos += 1000; | |
1204 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); | |
1205 } | |
1206 | |
1207 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n"); | |
1208 if ( prev_tid==-1 || prev_tid!=tid ) | |
1209 { | |
1210 prev_tid = tid; | |
1211 prev_pos = stats->regions[tid].pos[npos].from; | |
1212 } | |
1213 if ( prev_pos>stats->regions[tid].pos[npos].from ) | |
1214 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos); | |
1215 stats->regions[tid].npos++; | |
1216 } | |
1217 if (line) free(line); | |
1218 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n"); | |
1219 fclose(fp); | |
1220 } | |
1221 | |
1222 void destroy_regions(stats_t *stats) | |
1223 { | |
1224 int i; | |
1225 for (i=0; i<stats->nregions; i++) | |
1226 { | |
1227 if ( !stats->regions[i].mpos ) continue; | |
1228 free(stats->regions[i].pos); | |
1229 } | |
1230 if ( stats->regions ) free(stats->regions); | |
1231 } | |
1232 | |
1233 static int fetch_read(const bam1_t *bam_line, void *data) | |
1234 { | |
1235 collect_stats((bam1_t*)bam_line,(stats_t*)data); | |
1236 return 1; | |
1237 } | |
1238 | |
1239 void reset_regions(stats_t *stats) | |
1240 { | |
1241 int i; | |
1242 for (i=0; i<stats->nregions; i++) | |
1243 stats->regions[i].cpos = 0; | |
1244 } | |
1245 | |
1246 int is_in_regions(bam1_t *bam_line, stats_t *stats) | |
1247 { | |
1248 if ( !stats->regions ) return 1; | |
1249 | |
1250 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0; | |
1251 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); | |
1252 | |
1253 regions_t *reg = &stats->regions[bam_line->core.tid]; | |
1254 if ( reg->cpos==reg->npos ) return 0; // done for this chr | |
1255 | |
1256 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, | |
1257 // even small overlap is enough to include the read in the stats. | |
1258 int i = reg->cpos; | |
1259 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++; | |
1260 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } | |
1261 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0; | |
1262 reg->cpos = i; | |
1263 stats->reg_from = reg->pos[i].from; | |
1264 stats->reg_to = reg->pos[i].to; | |
1265 | |
1266 return 1; | |
1267 } | |
1268 | |
1269 void init_group_id(stats_t *stats, char *id) | |
1270 { | |
1271 if ( !stats->sam->header->dict ) | |
1272 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); | |
1273 void *iter = stats->sam->header->dict; | |
1274 const char *key, *val; | |
1275 int n = 0; | |
1276 stats->rg_hash = kh_init(kh_rg); | |
1277 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) | |
1278 { | |
1279 if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) | |
1280 { | |
1281 khiter_t k = kh_get(kh_rg, stats->rg_hash, key); | |
1282 if ( k != kh_end(stats->rg_hash) ) | |
1283 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); | |
1284 int ret; | |
1285 k = kh_put(kh_rg, stats->rg_hash, key, &ret); | |
1286 kh_value(stats->rg_hash, k) = val; | |
1287 n++; | |
1288 } | |
1289 } | |
1290 if ( !n ) | |
1291 error("The sample or read group \"%s\" not present.\n", id); | |
1292 } | |
1293 | |
1294 | |
1295 void error(const char *format, ...) | |
1296 { | |
1297 if ( !format ) | |
1298 { | |
1299 printf("Version: %s\n", BAMCHECK_VERSION); | |
1300 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n"); | |
1301 printf("Usage: bamcheck [OPTIONS] file.bam\n"); | |
1302 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n"); | |
1303 printf("Options:\n"); | |
1304 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n"); | |
1305 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); | |
1306 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n"); | |
1307 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n"); | |
1308 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n"); | |
1309 printf(" -h, --help This help message\n"); | |
1310 printf(" -i, --insert-size <int> Maximum insert size [8000]\n"); | |
1311 printf(" -I, --id <string> Include only listed read group or sample name\n"); | |
1312 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n"); | |
1313 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n"); | |
1314 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n"); | |
1315 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n"); | |
1316 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); | |
1317 printf(" -s, --sam Input is SAM\n"); | |
1318 printf("\n"); | |
1319 } | |
1320 else | |
1321 { | |
1322 va_list ap; | |
1323 va_start(ap, format); | |
1324 vfprintf(stderr, format, ap); | |
1325 va_end(ap); | |
1326 } | |
1327 exit(-1); | |
1328 } | |
1329 | |
1330 int main(int argc, char *argv[]) | |
1331 { | |
1332 char *targets = NULL; | |
1333 char *bam_fname = NULL; | |
1334 char *group_id = NULL; | |
1335 samfile_t *sam = NULL; | |
1336 char in_mode[5]; | |
1337 | |
1338 stats_t *stats = calloc(1,sizeof(stats_t)); | |
1339 stats->ngc = 200; | |
1340 stats->nquals = 256; | |
1341 stats->nbases = 300; | |
1342 stats->nisize = 8000; | |
1343 stats->max_len = 30; | |
1344 stats->max_qual = 40; | |
1345 stats->isize_main_bulk = 0.99; // There are always outliers at the far end | |
1346 stats->gcd_bin_size = 20e3; | |
1347 stats->gcd_ref_size = 4.2e9; | |
1348 stats->rseq_pos = -1; | |
1349 stats->tid = stats->gcd_pos = -1; | |
1350 stats->igcd = 0; | |
1351 stats->is_sorted = 1; | |
1352 stats->cov_min = 1; | |
1353 stats->cov_max = 1000; | |
1354 stats->cov_step = 1; | |
1355 stats->argc = argc; | |
1356 stats->argv = argv; | |
1357 stats->filter_readlen = -1; | |
1358 stats->nindels = stats->nbases; | |
1359 | |
1360 strcpy(in_mode, "rb"); | |
1361 | |
1362 static struct option loptions[] = | |
1363 { | |
1364 {"help",0,0,'h'}, | |
1365 {"remove-dups",0,0,'d'}, | |
1366 {"sam",0,0,'s'}, | |
1367 {"ref-seq",1,0,'r'}, | |
1368 {"coverage",1,0,'c'}, | |
1369 {"read-length",1,0,'l'}, | |
1370 {"insert-size",1,0,'i'}, | |
1371 {"most-inserts",1,0,'m'}, | |
1372 {"trim-quality",1,0,'q'}, | |
1373 {"target-regions",0,0,'t'}, | |
1374 {"required-flag",1,0,'f'}, | |
1375 {"filtering-flag",0,0,'F'}, | |
1376 {"id",1,0,'I'}, | |
1377 {"GC-depth",1,0,1}, | |
1378 {0,0,0,0} | |
1379 }; | |
1380 int opt; | |
1381 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 ) | |
1382 { | |
1383 switch (opt) | |
1384 { | |
1385 case 'f': stats->flag_require=strtol(optarg,0,0); break; | |
1386 case 'F': stats->flag_filter=strtol(optarg,0,0); break; | |
1387 case 'd': stats->flag_filter|=BAM_FDUP; break; | |
1388 case 's': strcpy(in_mode, "r"); break; | |
1389 case 'r': stats->fai = fai_load(optarg); | |
1390 if (stats->fai==0) | |
1391 error("Could not load faidx: %s\n", optarg); | |
1392 break; | |
1393 case 1 : { | |
1394 float flen,fbin; | |
1395 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 ) | |
1396 error("Unable to parse --GC-depth %s\n", optarg); | |
1397 stats->gcd_bin_size = fbin; | |
1398 stats->gcd_ref_size = flen; | |
1399 } | |
1400 break; | |
1401 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) | |
1402 error("Unable to parse -c %s\n", optarg); | |
1403 break; | |
1404 case 'l': stats->filter_readlen = atoi(optarg); break; | |
1405 case 'i': stats->nisize = atoi(optarg); break; | |
1406 case 'm': stats->isize_main_bulk = atof(optarg); break; | |
1407 case 'q': stats->trim_qual = atoi(optarg); break; | |
1408 case 't': targets = optarg; break; | |
1409 case 'I': group_id = optarg; break; | |
1410 case '?': | |
1411 case 'h': error(NULL); | |
1412 default: error("Unknown argument: %s\n", optarg); | |
1413 } | |
1414 } | |
1415 if ( optind<argc ) | |
1416 bam_fname = argv[optind++]; | |
1417 | |
1418 if ( !bam_fname ) | |
1419 { | |
1420 if ( isatty(fileno((FILE *)stdin)) ) | |
1421 error(NULL); | |
1422 bam_fname = "-"; | |
1423 } | |
1424 | |
1425 // Init structures | |
1426 // .. coverage bins and round buffer | |
1427 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 ) | |
1428 { | |
1429 stats->cov_step = stats->cov_max - stats->cov_min; | |
1430 if ( stats->cov_step <= 0 ) | |
1431 stats->cov_step = 1; | |
1432 } | |
1433 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step; | |
1434 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1; | |
1435 stats->cov = calloc(sizeof(uint64_t),stats->ncov); | |
1436 stats->cov_rbuf.size = stats->nbases*5; | |
1437 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); | |
1438 // .. bam | |
1439 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) | |
1440 error("Failed to open: %s\n", bam_fname); | |
1441 stats->sam = sam; | |
1442 if ( group_id ) init_group_id(stats, group_id); | |
1443 bam1_t *bam_line = bam_init1(); | |
1444 // .. arrays | |
1445 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); | |
1446 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); | |
1447 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); | |
1448 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); | |
1449 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t)); | |
1450 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); | |
1451 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); | |
1452 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); | |
1453 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; | |
1454 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); | |
1455 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); | |
1456 stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); | |
1457 stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); | |
1458 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); | |
1459 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); | |
1460 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); | |
1461 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); | |
1462 realloc_rseq_buffer(stats); | |
1463 if ( targets ) | |
1464 init_regions(stats, targets); | |
1465 | |
1466 // Collect statistics | |
1467 if ( optind<argc ) | |
1468 { | |
1469 // Collect stats in selected regions only | |
1470 bam_index_t *bam_idx = bam_index_load(bam_fname); | |
1471 if (bam_idx == 0) | |
1472 error("Random alignment retrieval only works for indexed BAM files.\n"); | |
1473 | |
1474 int i; | |
1475 for (i=optind; i<argc; i++) | |
1476 { | |
1477 int tid, beg, end; | |
1478 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end); | |
1479 if ( tid < 0 ) continue; | |
1480 reset_regions(stats); | |
1481 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read); | |
1482 } | |
1483 bam_index_destroy(bam_idx); | |
1484 } | |
1485 else | |
1486 { | |
1487 // Stream through the entire BAM ignoring off-target regions if -t is given | |
1488 while (samread(sam,bam_line) >= 0) | |
1489 collect_stats(bam_line,stats); | |
1490 } | |
1491 round_buffer_flush(stats,-1); | |
1492 | |
1493 output_stats(stats); | |
1494 | |
1495 bam_destroy1(bam_line); | |
1496 samclose(stats->sam); | |
1497 if (stats->fai) fai_destroy(stats->fai); | |
1498 free(stats->cov_rbuf.buffer); free(stats->cov); | |
1499 free(stats->quals_1st); free(stats->quals_2nd); | |
1500 free(stats->gc_1st); free(stats->gc_2nd); | |
1501 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other); | |
1502 free(stats->gcd); | |
1503 free(stats->rseq_buf); | |
1504 free(stats->mpc_buf); | |
1505 free(stats->acgt_cycles); | |
1506 free(stats->read_lengths); | |
1507 free(stats->insertions); | |
1508 free(stats->deletions); | |
1509 free(stats->ins_cycles_1st); | |
1510 free(stats->ins_cycles_2nd); | |
1511 free(stats->del_cycles_1st); | |
1512 free(stats->del_cycles_2nd); | |
1513 destroy_regions(stats); | |
1514 free(stats); | |
1515 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash); | |
1516 | |
1517 return 0; | |
1518 } | |
1519 | |
1520 | |
1521 |