annotate PsiCLASS-1.0.2/samtools-0.1.19/misc/bamcheck.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 /*
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 Author: petr.danecek@sanger
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 Assumptions, approximations and other issues:
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 - coverage distribution ignores softclips and deletions
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9 - some stats require sorted BAMs
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 good enough to include the read in the stats.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 */
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 #define BAMCHECK_VERSION "2012-09-04"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 #define _ISOC99_SOURCE
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 #include <stdarg.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 #include <math.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 #include <ctype.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 #include <getopt.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 #include <errno.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 #include <assert.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 #include "faidx.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 #include "khash.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 #include "sam.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 #include "sam_header.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 #include "razf.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 #define BWA_MIN_RDLEN 35
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 typedef struct
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47 int32_t line_len, line_blen;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 int64_t len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 uint64_t offset;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 faidx1_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 KHASH_MAP_INIT_STR(kh_bam_tid, int)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54 KHASH_MAP_INIT_STR(kh_rg, const char *)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 struct __faidx_t {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 RAZF *rz;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 int n, m;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 char **name;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 khash_t(kh_faidx) *hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 };
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 typedef struct
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 float gc;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 uint32_t depth;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 gc_depth_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 // For coverage distribution, a simple pileup
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 typedef struct
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 int64_t pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 int size, start;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 int *buffer;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 round_buffer_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 typedef struct { uint32_t from, to; } pos_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 typedef struct
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 int npos,mpos,cpos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 pos_t *pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 regions_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 typedef struct
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 // Parameters
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 int trim_qual; // bwa trim quality
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 // insert size histogram holder
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 int nquals; // The number of quality bins
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 int nbases; // The maximum sequence length the allocated array can hold
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 int nisize; // The maximum insert size that the allocated array can hold
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 int ngc; // The size of gc_1st and gc_2nd
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 int nindels; // The maximum indel length for indel distribution
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 // Arrays for the histogram data
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 uint64_t *quals_1st, *quals_2nd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 uint64_t *gc_1st, *gc_2nd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 uint64_t *isize_inward, *isize_outward, *isize_other;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 uint64_t *acgt_cycles;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 uint64_t *read_lengths;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 uint64_t *insertions, *deletions;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 // The extremes encountered
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 int max_len; // Maximum read length
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 int max_qual; // Maximum quality
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 int is_sorted;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 // Summary numbers
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 uint64_t total_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 uint64_t total_len_dup;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 uint64_t nreads_1st;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 uint64_t nreads_2nd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 uint64_t nreads_filtered;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 uint64_t nreads_dup;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 uint64_t nreads_unmapped;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 uint64_t nreads_unpaired;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 uint64_t nreads_paired;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 uint64_t nreads_anomalous;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 uint64_t nreads_mq0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 uint64_t nbases_mapped;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 uint64_t nbases_mapped_cigar;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 uint64_t nbases_trimmed; // bwa trimmed bases
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 uint64_t nmismatches;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 uint64_t nreads_QCfailed, nreads_secondary;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 // GC-depth related data
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 gc_depth_t *gcd; // The GC-depth bins holder
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 int gcd_bin_size; // The size of GC-depth bin
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 uint32_t gcd_ref_size; // The approximate size of the genome
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 int32_t tid, gcd_pos; // Position of the current bin
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 int32_t pos; // Position of the last read
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 // Coverage distribution related data
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 int ncov; // The number of coverage bins
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 uint64_t *cov; // The coverage frequencies
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 round_buffer_t cov_rbuf; // Pileup round buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 // Mismatches by read cycle
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 int mrseq_buf; // The size of the buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149 int32_t rseq_pos; // The coordinate of the first base in the buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 int32_t nrseq_buf; // The used part of the buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 uint64_t *mpc_buf; // Mismatches per cycle
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 // Filters
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 int filter_readlen;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 // Target regions
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 int nregions, reg_from,reg_to;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158 regions_t *regions;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 // Auxiliary data
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 int flag_require, flag_filter;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 double sum_qual; // For calculating average quality value
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 samfile_t *sam;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 faidx_t *fai; // Reference sequence for GC-depth graph
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166 int argc; // Command line arguments to be printed on the output
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 char **argv;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 stats_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171 void error(const char *format, ...);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172 void bam_init_header_hash(bam_header_t *header);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 int is_in_regions(bam1_t *bam_line, stats_t *stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176 // Coverage distribution methods
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 inline int coverage_idx(int min, int max, int n, int step, int depth)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179 if ( depth < min )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 if ( depth > max )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183 return n-1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185 return 1 + (depth - min) / step;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 return (offset + (pos-refpos) % size) % size;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 void round_buffer_flush(stats_t *stats, int64_t pos)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195 int ibuf,idp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 if ( pos==stats->cov_rbuf.pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198 return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200 int64_t new_pos = pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 // Flush the whole buffer, but in sequential order,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 if ( pos < stats->cov_rbuf.pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210 int ifrom = stats->cov_rbuf.start;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 if ( ifrom>ito )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 if ( !stats->cov_rbuf.buffer[ibuf] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
219 stats->cov[idp]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
220 stats->cov_rbuf.buffer[ibuf] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
221 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
222 ifrom = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
223 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
224 for (ibuf=ifrom; ibuf<=ito; ibuf++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
225 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
226 if ( !stats->cov_rbuf.buffer[ibuf] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
227 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
228 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
229 stats->cov[idp]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
230 stats->cov_rbuf.buffer[ibuf] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
231 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
233 stats->cov_rbuf.pos = new_pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
234 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
235
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
236 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
237 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
238 if ( to-from >= rbuf->size )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
239 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
240 if ( from < rbuf->pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
241 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
242
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
243 int ifrom,ito,ibuf;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
244 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
245 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
246 if ( ifrom>ito )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
247 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
248 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
249 rbuf->buffer[ibuf]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
250 ifrom = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
251 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
252 for (ibuf=ifrom; ibuf<=ito; ibuf++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
253 rbuf->buffer[ibuf]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
254 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
255
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
256 // Calculate the number of bases in the read trimmed by BWA
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
257 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
258 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
259 if ( len<BWA_MIN_RDLEN ) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
260
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
261 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
262 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
263 int max_trimmed = len - BWA_MIN_RDLEN + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
264 int l, sum=0, max_sum=0, max_l=0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
265
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
266 for (l=0; l<max_trimmed; l++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
267 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
268 sum += trim_qual - quals[ reverse ? l : len-1-l ];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
269 if ( sum<0 ) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
270 if ( sum>max_sum )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
271 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
272 max_sum = sum;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
273 // This is the correct way, but bwa clips from some reason one base less
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
274 // max_l = l+1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
275 max_l = l;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
276 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
277 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
278 return max_l;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
279 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
280
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
281
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
282 void count_indels(stats_t *stats,bam1_t *bam_line)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
283 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
284 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
285 int is_1st = IS_READ1(bam_line) ? 1 : 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
286 int icig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
287 int icycle = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
288 int read_len = bam_line->core.l_qseq;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
289 for (icig=0; icig<bam_line->core.n_cigar; icig++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
290 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
291 // Conversion from uint32_t to MIDNSHP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
292 // 0123456
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
293 // MIDNSHP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
294 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
295 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
296
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
297 if ( cig==1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
298 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
299 int idx = is_fwd ? icycle : read_len-icycle-ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
300 if ( idx<0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
301 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
302 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
303 if ( is_1st )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
304 stats->ins_cycles_1st[idx]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
305 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
306 stats->ins_cycles_2nd[idx]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
307 icycle += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
308 if ( ncig<=stats->nindels )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
309 stats->insertions[ncig-1]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
310 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
311 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
312 if ( cig==2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
313 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
314 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
315 if ( idx<0 ) continue; // discard meaningless deletions
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
316 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
317 if ( is_1st )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
318 stats->del_cycles_1st[idx]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
319 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
320 stats->del_cycles_2nd[idx]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
321 if ( ncig<=stats->nindels )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
322 stats->deletions[ncig-1]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
323 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
324 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
325 if ( cig!=3 && cig!=5 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
326 icycle += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
327 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
328 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
329
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
330 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
331 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
332 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
333 int icig,iread=0,icycle=0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
334 int iref = bam_line->core.pos - stats->rseq_pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
335 int read_len = bam_line->core.l_qseq;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
336 uint8_t *read = bam1_seq(bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
337 uint8_t *quals = bam1_qual(bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
338 uint64_t *mpc_buf = stats->mpc_buf;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
339 for (icig=0; icig<bam_line->core.n_cigar; icig++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
340 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
341 // Conversion from uint32_t to MIDNSHP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
342 // 0123456
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
343 // MIDNSHP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
344 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
345 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
346 if ( cig==1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
347 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
348 iread += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
349 icycle += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
350 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
351 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
352 if ( cig==2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
353 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
354 iref += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
355 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
356 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
357 if ( cig==4 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
358 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
359 icycle += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
360 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
361 // iref += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
362 iread += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
363 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
364 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
365 if ( cig==5 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
366 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
367 icycle += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
368 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
369 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
370 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
371 // chunk of refseq in memory. Not very frequent and not noticable in the stats.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
372 if ( cig==3 || cig==5 ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
373 if ( cig!=0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
374 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
375
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
376 if ( ncig+iref > stats->nrseq_buf )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
377 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
378
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
379 int im;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
380 for (im=0; im<ncig; im++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
381 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
382 uint8_t cread = bam1_seqi(read,iread);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
383 uint8_t cref = stats->rseq_buf[iref];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
384
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
385 // ---------------15
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
386 // =ACMGRSVTWYHKDBN
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
387 if ( cread==15 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
388 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
389 int idx = is_fwd ? icycle : read_len-icycle-1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
390 if ( idx>stats->max_len )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
391 error("mpc: %d>%d\n",idx,stats->max_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
392 idx = idx*stats->nquals;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
393 if ( idx>=stats->nquals*stats->nbases )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
394 error("FIXME: mpc_buf overflow\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
395 mpc_buf[idx]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
396 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
397 else if ( cref && cread && cref!=cread )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
398 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
399 uint8_t qual = quals[iread] + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
400 if ( qual>=stats->nquals )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
401 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
402
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
403 int idx = is_fwd ? icycle : read_len-icycle-1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
404 if ( idx>stats->max_len )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
405 error("mpc: %d>%d\n",idx,stats->max_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
406
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
407 idx = idx*stats->nquals + qual;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
408 if ( idx>=stats->nquals*stats->nbases )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
409 error("FIXME: mpc_buf overflow\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
410 mpc_buf[idx]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
411 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
412
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
413 iref++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
414 iread++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
415 icycle++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
416 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
417 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
418 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
419
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
420 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
421 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
422 khash_t(kh_faidx) *h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
423 khiter_t iter;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
424 faidx1_t val;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
425 char *chr, c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
426 faidx_t *fai = stats->fai;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
427
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
428 h = fai->hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
429 chr = stats->sam->header->target_name[tid];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
430
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
431 // ID of the sequence name
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
432 iter = kh_get(kh_faidx, h, chr);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
433 if (iter == kh_end(h))
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
434 error("No such reference sequence [%s]?\n", chr);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
435 val = kh_value(h, iter);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
436
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
437 // Check the boundaries
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
438 if (pos >= val.len)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
439 error("Was the bam file mapped with the reference sequence supplied?"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
440 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
441 int size = stats->mrseq_buf;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
442 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
443 if (size+pos > val.len) size = val.len-pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
444
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
445 // Position the razf reader
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
446 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
447
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
448 uint8_t *ptr = stats->rseq_buf;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
449 int nread = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
450 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
451 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
452 if ( !isgraph(c) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
453 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
454
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
455 // Conversion between uint8_t coding and ACGT
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
456 // -12-4---8-------
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
457 // =ACMGRSVTWYHKDBN
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
458 if ( c=='A' || c=='a' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
459 *ptr = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
460 else if ( c=='C' || c=='c' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
461 *ptr = 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
462 else if ( c=='G' || c=='g' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
463 *ptr = 4;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
464 else if ( c=='T' || c=='t' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
465 *ptr = 8;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
466 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
467 *ptr = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
468 ptr++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
469 nread++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
470 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
471 if ( nread < stats->mrseq_buf )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
472 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
473 memset(ptr,0, stats->mrseq_buf - nread);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
474 nread = stats->mrseq_buf;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
475 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
476 stats->nrseq_buf = nread;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
477 stats->rseq_pos = pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
478 stats->tid = tid;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
479 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
480
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
481 float fai_gc_content(stats_t *stats, int pos, int len)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
482 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
483 uint32_t gc,count,c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
484 int i = pos - stats->rseq_pos, ito = i + len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
485 assert( i>=0 && ito<=stats->nrseq_buf );
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
486
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
487 // Count GC content
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
488 gc = count = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
489 for (; i<ito; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
490 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
491 c = stats->rseq_buf[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
492 if ( c==2 || c==4 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
493 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
494 gc++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
495 count++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
496 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
497 else if ( c==1 || c==8 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
498 count++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
499 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
500 return count ? (float)gc/count : 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
501 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
502
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
503 void realloc_rseq_buffer(stats_t *stats)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
504 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
505 int n = stats->nbases*10;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
506 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
507 if ( stats->mrseq_buf<n )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
508 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
509 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
510 stats->mrseq_buf = n;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
511 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
512 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
513
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
514 void realloc_gcd_buffer(stats_t *stats, int seq_len)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
515 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
516 if ( seq_len >= stats->gcd_bin_size )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
517 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
518
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
519 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
520 if ( n <= stats->igcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
521 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
522
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
523 if ( n > stats->ngcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
524 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
525 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
526 if ( !stats->gcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
527 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
528 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
529 stats->ngcd = n;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
530 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
531
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
532 realloc_rseq_buffer(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
533 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
534
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
535 void realloc_buffers(stats_t *stats, int seq_len)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
536 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
537 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
538
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
539 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
540 if ( !stats->quals_1st )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
541 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
542 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
543
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
544 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
545 if ( !stats->quals_2nd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
546 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
547 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
548
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
549 if ( stats->mpc_buf )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
550 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
551 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
552 if ( !stats->mpc_buf )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
553 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
554 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
555 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
556
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
557 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
558 if ( !stats->acgt_cycles )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
559 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
560 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
561
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
562 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
563 if ( !stats->read_lengths )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
564 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
565 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
566
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
567 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
568 if ( !stats->insertions )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
569 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
570 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
571
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
572 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
573 if ( !stats->deletions )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
574 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
575 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
576
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
577 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
578 if ( !stats->ins_cycles_1st )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
579 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
580 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
581
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
582 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
583 if ( !stats->ins_cycles_2nd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
584 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
585 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
586
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
587 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
588 if ( !stats->del_cycles_1st )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
589 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
590 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
591
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
592 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
593 if ( !stats->del_cycles_2nd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
594 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
595 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
596
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
597 stats->nbases = n;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
598
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
599 // Realloc the coverage distribution buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
600 int *rbuffer = calloc(sizeof(int),seq_len*5);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
601 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
602 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
603 if ( stats->cov_rbuf.start>1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
604 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
605 stats->cov_rbuf.start = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
606 free(stats->cov_rbuf.buffer);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
607 stats->cov_rbuf.buffer = rbuffer;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
608 stats->cov_rbuf.size = seq_len*5;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
609
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
610 realloc_rseq_buffer(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
611 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
612
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
613 void collect_stats(bam1_t *bam_line, stats_t *stats)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
614 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
615 if ( stats->rg_hash )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
616 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
617 const uint8_t *rg = bam_aux_get(bam_line, "RG");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
618 if ( !rg ) return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
619 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
620 if ( k == kh_end(stats->rg_hash) ) return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
621 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
622 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
623 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
624 stats->nreads_filtered++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
625 return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
626 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
627 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
628 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
629 stats->nreads_filtered++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
630 return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
631 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
632 if ( !is_in_regions(bam_line,stats) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
633 return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
634 if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
635 return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
636
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
637 if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
638 if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
639
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
640 int seq_len = bam_line->core.l_qseq;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
641 if ( !seq_len ) return;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
642
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
643 if ( seq_len >= stats->nbases )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
644 realloc_buffers(stats,seq_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
645 if ( stats->max_len<seq_len )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
646 stats->max_len = seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
647
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
648 stats->read_lengths[seq_len]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
649
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
650 // Count GC and ACGT per cycle
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
651 uint8_t base, *seq = bam1_seq(bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
652 int gc_count = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
653 int i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
654 int reverse = IS_REVERSE(bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
655 for (i=0; i<seq_len; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
656 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
657 // Conversion from uint8_t coding to ACGT
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
658 // -12-4---8-------
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
659 // =ACMGRSVTWYHKDBN
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
660 // 01 2 3
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
661 base = bam1_seqi(seq,i);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
662 base /= 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
663 if ( base==1 || base==2 ) gc_count++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
664 else if ( base>2 ) base=3;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
665 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
666 error("FIXME: acgt_cycles\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
667 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
668 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
669 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
670 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
671 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
672
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
673 // Determine which array (1st or 2nd read) will these stats go to,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
674 // trim low quality bases from end the same way BWA does,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
675 // fill GC histogram
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
676 uint64_t *quals;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
677 uint8_t *bam_quals = bam1_qual(bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
678 if ( bam_line->core.flag&BAM_FREAD2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
679 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
680 quals = stats->quals_2nd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
681 stats->nreads_2nd++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
682 for (i=gc_idx_min; i<gc_idx_max; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
683 stats->gc_2nd[i]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
684 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
685 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
686 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
687 quals = stats->quals_1st;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
688 stats->nreads_1st++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
689 for (i=gc_idx_min; i<gc_idx_max; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
690 stats->gc_1st[i]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
691 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
692 if ( stats->trim_qual>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
693 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
694
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
695 // Quality histogram and average quality
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
696 for (i=0; i<seq_len; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
697 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
698 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
699 if ( qual>=stats->nquals )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
700 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
701 if ( qual>stats->max_qual )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
702 stats->max_qual = qual;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
703
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
704 quals[ i*stats->nquals+qual ]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
705 stats->sum_qual += qual;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
706 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
707
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
708 // Look at the flags and increment appropriate counters (mapped, paired, etc)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
709 if ( IS_UNMAPPED(bam_line) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
710 stats->nreads_unmapped++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
711 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
712 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
713 if ( !bam_line->core.qual )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
714 stats->nreads_mq0++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
715
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
716 count_indels(stats,bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
717
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
718 if ( !IS_PAIRED(bam_line) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
719 stats->nreads_unpaired++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
720 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
721 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
722 stats->nreads_paired++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
723
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
724 if ( bam_line->core.tid!=bam_line->core.mtid )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
725 stats->nreads_anomalous++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
726
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
727 // The insert size is tricky, because for long inserts the libraries are
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
728 // prepared differently and the pairs point in other direction. BWA does
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
729 // not set the paired flag for them. Similar thing is true also for 454
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
730 // reads. Mates mapped to different chromosomes have isize==0.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
731 int32_t isize = bam_line->core.isize;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
732 if ( isize<0 ) isize = -isize;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
733 if ( isize >= stats->nisize )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
734 isize = stats->nisize-1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
735 if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
736 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
737 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
738 int is_fst = IS_READ1(bam_line) ? 1 : -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
739 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
740 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
741
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
742 if ( is_fwd*is_mfwd>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
743 stats->isize_other[isize]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
744 else if ( is_fst*pos_fst>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
745 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
746 if ( is_fst*is_fwd>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
747 stats->isize_inward[isize]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
748 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
749 stats->isize_outward[isize]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
750 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
751 else if ( is_fst*pos_fst<0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
752 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
753 if ( is_fst*is_fwd>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
754 stats->isize_outward[isize]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
755 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
756 stats->isize_inward[isize]++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
757 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
758 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
759 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
760
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
761 // Number of mismatches
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
762 uint8_t *nm = bam_aux_get(bam_line,"NM");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
763 if (nm)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
764 stats->nmismatches += bam_aux2i(nm);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
765
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
766 // Number of mapped bases from cigar
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
767 // Conversion from uint32_t to MIDNSHP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
768 // 012-4--
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
769 // MIDNSHP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
770 if ( bam_line->core.n_cigar == 0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
771 error("FIXME: mapped read with no cigar?\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
772 int readlen=seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
773 if ( stats->regions )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
774 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
775 // Count only on-target bases
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
776 int iref = bam_line->core.pos + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
777 for (i=0; i<bam_line->core.n_cigar; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
778 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
779 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
780 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
781 if ( cig==2 ) readlen += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
782 else if ( cig==0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
783 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
784 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
785 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
786 if ( ncig<0 ) ncig = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
787 stats->nbases_mapped_cigar += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
788 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
789 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
790 else if ( cig==1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
791 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
792 iref += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
793 if ( iref>=stats->reg_from && iref<=stats->reg_to )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
794 stats->nbases_mapped_cigar += ncig;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
795 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
796 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
797 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
798 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
799 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
800 // Count the whole read
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
801 for (i=0; i<bam_line->core.n_cigar; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
802 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
803 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
804 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
805 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
806 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
807 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
808 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
809 stats->nbases_mapped += seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
810
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
811 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
812 stats->is_sorted = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
813 stats->pos = bam_line->core.pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
814
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
815 if ( stats->is_sorted )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
816 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
817 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
818 round_buffer_flush(stats,-1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
819
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
820 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
821 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
822 // 20kbp, so the effect is negligible.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
823 if ( stats->fai )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
824 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
825 int inc_ref = 0, inc_gcd = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
826 // First pass or new chromosome
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
827 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
828 // Read goes beyond the end of the rseq buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
829 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
830 // Read overlaps the next gcd bin
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
831 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
832 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
833 inc_gcd = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
834 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
835 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
836 if ( inc_gcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
837 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
838 stats->igcd++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
839 if ( stats->igcd >= stats->ngcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
840 realloc_gcd_buffer(stats, readlen);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
841 if ( inc_ref )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
842 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
843 stats->gcd_pos = bam_line->core.pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
844 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
845 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
846
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
847 count_mismatches_per_cycle(stats,bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
848 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
849 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
850 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
851 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
852 // First pass or a new chromosome
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
853 stats->tid = bam_line->core.tid;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
854 stats->gcd_pos = bam_line->core.pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
855 stats->igcd++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
856 if ( stats->igcd >= stats->ngcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
857 realloc_gcd_buffer(stats, readlen);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
858 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
859 stats->gcd[ stats->igcd ].depth++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
860 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
861 if ( !stats->fai )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
862 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
863
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
864 // Coverage distribution graph
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
865 round_buffer_flush(stats,bam_line->core.pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
866 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
867 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
868 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
869
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
870 stats->total_len += seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
871 if ( IS_DUP(bam_line) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
872 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
873 stats->total_len_dup += seq_len;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
874 stats->nreads_dup++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
875 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
876 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
877
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
878 // Sort by GC and depth
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
879 #define GCD_t(x) ((gc_depth_t *)x)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
880 static int gcd_cmp(const void *a, const void *b)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
881 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
882 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
883 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
884 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
885 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
886 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
887 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
888 #undef GCD_t
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
889
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
890 float gcd_percentile(gc_depth_t *gcd, int N, int p)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
891 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
892 float n,d;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
893 int k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
894
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
895 n = p*(N+1)/100;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
896 k = n;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
897 if ( k<=0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
898 return gcd[0].depth;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
899 if ( k>=N )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
900 return gcd[N-1].depth;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
901
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
902 d = n - k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
903 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
904 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
905
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
906 void output_stats(stats_t *stats)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
907 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
908 // Calculate average insert size and standard deviation (from the main bulk data only)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
909 int isize, ibulk=0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
910 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
911 for (isize=0; isize<stats->nisize; isize++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
912 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
913 // Each pair was counted twice
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
914 stats->isize_inward[isize] *= 0.5;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
915 stats->isize_outward[isize] *= 0.5;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
916 stats->isize_other[isize] *= 0.5;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
917
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
918 nisize_inward += stats->isize_inward[isize];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
919 nisize_outward += stats->isize_outward[isize];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
920 nisize_other += stats->isize_other[isize];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
921 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
922 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
923
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
924 double bulk=0, avg_isize=0, sd_isize=0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
925 for (isize=0; isize<stats->nisize; isize++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
926 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
927 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
928 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
929
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
930 if ( bulk/nisize > stats->isize_main_bulk )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
931 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
932 ibulk = isize+1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
933 nisize = bulk;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
934 break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
935 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
936 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
937 avg_isize /= nisize ? nisize : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
938 for (isize=1; isize<ibulk; isize++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
939 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
940 sd_isize = sqrt(sd_isize);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
941
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
942
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
943 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
944 printf("# The command line was: %s",stats->argv[0]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
945 int i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
946 for (i=1; i<stats->argc; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
947 printf(" %s",stats->argv[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
948 printf("\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
949 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
950 printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
951 printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
952 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
953 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
954 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
955 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
956 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
957 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
958 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
959 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
960 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
961 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
962 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
963 printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
964 printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
965 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
966 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
967 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
968 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
969 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
970 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
971 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
972 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
973 printf("SN\taverage length:\t%.0f\n", avg_read_length);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
974 printf("SN\tmaximum length:\t%d\n", stats->max_len);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
975 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
976 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
977 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
978 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
979 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
980 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
981 printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
982
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
983 int ibase,iqual;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
984 if ( stats->max_len<stats->nbases ) stats->max_len++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
985 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
986 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
987 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
988 for (ibase=0; ibase<stats->max_len; ibase++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
989 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
990 printf("FFQ\t%d",ibase+1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
991 for (iqual=0; iqual<=stats->max_qual; iqual++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
992 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
993 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
994 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
995 printf("\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
996 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
997 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
998 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
999 for (ibase=0; ibase<stats->max_len; ibase++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1000 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1001 printf("LFQ\t%d",ibase+1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1002 for (iqual=0; iqual<=stats->max_qual; iqual++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1003 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1004 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1005 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1006 printf("\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1007 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1008 if ( stats->mpc_buf )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1009 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1010 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1011 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1012 printf("# is the number of N's and the rest is the number of mismatches\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1013 for (ibase=0; ibase<stats->max_len; ibase++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1014 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1015 printf("MPC\t%d",ibase+1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1016 for (iqual=0; iqual<=stats->max_qual; iqual++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1017 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1018 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1019 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1020 printf("\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1021 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1022 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1023 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1024 int ibase_prev = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1025 for (ibase=0; ibase<stats->ngc; ibase++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1026 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1027 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1028 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1029 ibase_prev = ibase;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1030 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1031 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1032 ibase_prev = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1033 for (ibase=0; ibase<stats->ngc; ibase++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1034 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1035 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1036 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1037 ibase_prev = ibase;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1038 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1039 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1040 for (ibase=0; ibase<stats->max_len; ibase++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1041 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1042 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1043 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1044 if ( ! sum ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1045 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1046 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1047 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1048 for (isize=0; isize<ibulk; isize++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1049 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1050 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1051
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1052 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1053 int ilen;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1054 for (ilen=0; ilen<stats->max_len; ilen++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1055 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1056 if ( stats->read_lengths[ilen]>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1057 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1058 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1059
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1060 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1061 for (ilen=0; ilen<stats->nindels; ilen++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1062 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1063 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1064 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1065 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1066
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1067 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1068 for (ilen=0; ilen<=stats->nbases; ilen++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1069 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1070 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1071 // the index of the cycle of the first inserted base (also 1-based)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1072 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1073 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1074 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1075
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1076 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1077 if ( stats->cov[0] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1078 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1079 int icov;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1080 for (icov=1; icov<stats->ncov-1; icov++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1081 if ( stats->cov[icov] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1082 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1083 if ( stats->cov[stats->ncov-1] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1084 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1085
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1086 // Calculate average GC content, then sort by GC and depth
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1087 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1088 uint32_t igcd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1089 for (igcd=0; igcd<stats->igcd; igcd++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1090 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1091 if ( stats->fai )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1092 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1093 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1094 if ( stats->gcd[igcd].depth )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1095 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1096 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1097 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1098 igcd = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1099 while ( igcd < stats->igcd )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1100 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1101 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1102 uint32_t nbins=0, itmp=igcd;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1103 float gc = stats->gcd[igcd].gc;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1104 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1105 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1106 nbins++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1107 itmp++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1108 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1109 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1110 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1111 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1112 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1113 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1114 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1115 );
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1116 igcd += nbins;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1117 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1118 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1119
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1120 size_t mygetline(char **line, size_t *n, FILE *fp)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1121 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1122 if (line == NULL || n == NULL || fp == NULL)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1123 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1124 errno = EINVAL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1125 return -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1126 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1127 if (*n==0 || !*line)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1128 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1129 *line = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1130 *n = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1131 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1132
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1133 size_t nread=0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1134 int c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1135 while ((c=getc(fp))!= EOF && c!='\n')
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1136 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1137 if ( ++nread>=*n )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1138 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1139 *n += 255;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1140 *line = realloc(*line, sizeof(char)*(*n));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1141 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1142 (*line)[nread-1] = c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1143 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1144 if ( nread>=*n )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1145 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1146 *n += 255;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1147 *line = realloc(*line, sizeof(char)*(*n));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1148 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1149 (*line)[nread] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1150 return nread>0 ? nread : -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1151
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1152 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1153
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1154 void init_regions(stats_t *stats, char *file)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1155 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1156 khiter_t iter;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1157 khash_t(kh_bam_tid) *header_hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1158
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1159 bam_init_header_hash(stats->sam->header);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1160 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1161
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1162 FILE *fp = fopen(file,"r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1163 if ( !fp ) error("%s: %s\n",file,strerror(errno));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1164
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1165 char *line = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1166 size_t len = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1167 ssize_t nread;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1168 int warned = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1169 int prev_tid=-1, prev_pos=-1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1170 while ((nread = mygetline(&line, &len, fp)) != -1)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1171 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1172 if ( line[0] == '#' ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1173
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1174 int i = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1175 while ( i<nread && !isspace(line[i]) ) i++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1176 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1177 line[i] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1178
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1179 iter = kh_get(kh_bam_tid, header_hash, line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1180 int tid = kh_val(header_hash, iter);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1181 if ( iter == kh_end(header_hash) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1182 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1183 if ( !warned )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1184 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1185 warned = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1186 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1187 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1188
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1189 if ( tid >= stats->nregions )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1190 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1191 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1192 int j;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1193 for (j=stats->nregions; j<stats->nregions+100; j++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1194 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1195 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1196 stats->regions[j].pos = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1197 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1198 stats->nregions += 100;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1199 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1200 int npos = stats->regions[tid].npos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1201 if ( npos >= stats->regions[tid].mpos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1202 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1203 stats->regions[tid].mpos += 1000;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1204 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1205 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1206
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1207 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1208 if ( prev_tid==-1 || prev_tid!=tid )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1209 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1210 prev_tid = tid;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1211 prev_pos = stats->regions[tid].pos[npos].from;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1212 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1213 if ( prev_pos>stats->regions[tid].pos[npos].from )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1214 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1215 stats->regions[tid].npos++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1216 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1217 if (line) free(line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1218 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1219 fclose(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1220 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1221
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1222 void destroy_regions(stats_t *stats)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1223 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1224 int i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1225 for (i=0; i<stats->nregions; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1226 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1227 if ( !stats->regions[i].mpos ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1228 free(stats->regions[i].pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1229 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1230 if ( stats->regions ) free(stats->regions);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1231 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1232
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1233 static int fetch_read(const bam1_t *bam_line, void *data)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1234 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1235 collect_stats((bam1_t*)bam_line,(stats_t*)data);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1236 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1237 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1238
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1239 void reset_regions(stats_t *stats)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1240 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1241 int i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1242 for (i=0; i<stats->nregions; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1243 stats->regions[i].cpos = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1244 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1245
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1246 int is_in_regions(bam1_t *bam_line, stats_t *stats)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1247 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1248 if ( !stats->regions ) return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1249
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1250 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1251 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1252
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1253 regions_t *reg = &stats->regions[bam_line->core.tid];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1254 if ( reg->cpos==reg->npos ) return 0; // done for this chr
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1255
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1256 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1257 // even small overlap is enough to include the read in the stats.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1258 int i = reg->cpos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1259 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1260 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1261 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1262 reg->cpos = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1263 stats->reg_from = reg->pos[i].from;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1264 stats->reg_to = reg->pos[i].to;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1265
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1266 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1267 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1268
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1269 void init_group_id(stats_t *stats, char *id)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1270 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1271 if ( !stats->sam->header->dict )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1272 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1273 void *iter = stats->sam->header->dict;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1274 const char *key, *val;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1275 int n = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1276 stats->rg_hash = kh_init(kh_rg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1277 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1278 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1279 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1280 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1281 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1282 if ( k != kh_end(stats->rg_hash) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1283 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1284 int ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1285 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1286 kh_value(stats->rg_hash, k) = val;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1287 n++;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1288 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1289 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1290 if ( !n )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1291 error("The sample or read group \"%s\" not present.\n", id);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1292 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1293
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1294
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1295 void error(const char *format, ...)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1296 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1297 if ( !format )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1298 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1299 printf("Version: %s\n", BAMCHECK_VERSION);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1300 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1301 printf("Usage: bamcheck [OPTIONS] file.bam\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1302 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1303 printf("Options:\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1304 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1305 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1306 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1307 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1308 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1309 printf(" -h, --help This help message\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1310 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1311 printf(" -I, --id <string> Include only listed read group or sample name\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1312 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1313 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1314 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1315 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1316 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1317 printf(" -s, --sam Input is SAM\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1318 printf("\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1319 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1320 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1321 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1322 va_list ap;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1323 va_start(ap, format);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1324 vfprintf(stderr, format, ap);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1325 va_end(ap);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1326 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1327 exit(-1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1328 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1329
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1330 int main(int argc, char *argv[])
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1331 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1332 char *targets = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1333 char *bam_fname = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1334 char *group_id = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1335 samfile_t *sam = NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1336 char in_mode[5];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1337
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1338 stats_t *stats = calloc(1,sizeof(stats_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1339 stats->ngc = 200;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1340 stats->nquals = 256;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1341 stats->nbases = 300;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1342 stats->nisize = 8000;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1343 stats->max_len = 30;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1344 stats->max_qual = 40;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1345 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1346 stats->gcd_bin_size = 20e3;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1347 stats->gcd_ref_size = 4.2e9;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1348 stats->rseq_pos = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1349 stats->tid = stats->gcd_pos = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1350 stats->igcd = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1351 stats->is_sorted = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1352 stats->cov_min = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1353 stats->cov_max = 1000;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1354 stats->cov_step = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1355 stats->argc = argc;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1356 stats->argv = argv;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1357 stats->filter_readlen = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1358 stats->nindels = stats->nbases;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1359
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1360 strcpy(in_mode, "rb");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1361
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1362 static struct option loptions[] =
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1363 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1364 {"help",0,0,'h'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1365 {"remove-dups",0,0,'d'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1366 {"sam",0,0,'s'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1367 {"ref-seq",1,0,'r'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1368 {"coverage",1,0,'c'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1369 {"read-length",1,0,'l'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1370 {"insert-size",1,0,'i'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1371 {"most-inserts",1,0,'m'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1372 {"trim-quality",1,0,'q'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1373 {"target-regions",0,0,'t'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1374 {"required-flag",1,0,'f'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1375 {"filtering-flag",0,0,'F'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1376 {"id",1,0,'I'},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1377 {"GC-depth",1,0,1},
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1378 {0,0,0,0}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1379 };
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1380 int opt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1381 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1382 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1383 switch (opt)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1384 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1385 case 'f': stats->flag_require=strtol(optarg,0,0); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1386 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1387 case 'd': stats->flag_filter|=BAM_FDUP; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1388 case 's': strcpy(in_mode, "r"); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1389 case 'r': stats->fai = fai_load(optarg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1390 if (stats->fai==0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1391 error("Could not load faidx: %s\n", optarg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1392 break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1393 case 1 : {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1394 float flen,fbin;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1395 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1396 error("Unable to parse --GC-depth %s\n", optarg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1397 stats->gcd_bin_size = fbin;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1398 stats->gcd_ref_size = flen;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1399 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1400 break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1401 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1402 error("Unable to parse -c %s\n", optarg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1403 break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1404 case 'l': stats->filter_readlen = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1405 case 'i': stats->nisize = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1406 case 'm': stats->isize_main_bulk = atof(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1407 case 'q': stats->trim_qual = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1408 case 't': targets = optarg; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1409 case 'I': group_id = optarg; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1410 case '?':
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1411 case 'h': error(NULL);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1412 default: error("Unknown argument: %s\n", optarg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1413 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1414 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1415 if ( optind<argc )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1416 bam_fname = argv[optind++];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1417
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1418 if ( !bam_fname )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1419 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1420 if ( isatty(fileno((FILE *)stdin)) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1421 error(NULL);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1422 bam_fname = "-";
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1423 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1424
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1425 // Init structures
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1426 // .. coverage bins and round buffer
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1427 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1428 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1429 stats->cov_step = stats->cov_max - stats->cov_min;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1430 if ( stats->cov_step <= 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1431 stats->cov_step = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1432 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1433 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1434 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1435 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1436 stats->cov_rbuf.size = stats->nbases*5;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1437 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1438 // .. bam
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1439 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1440 error("Failed to open: %s\n", bam_fname);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1441 stats->sam = sam;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1442 if ( group_id ) init_group_id(stats, group_id);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1443 bam1_t *bam_line = bam_init1();
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1444 // .. arrays
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1445 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1446 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1447 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1448 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1449 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1450 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1451 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1452 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1453 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1454 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1455 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1456 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1457 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1458 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1459 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1460 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1461 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1462 realloc_rseq_buffer(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1463 if ( targets )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1464 init_regions(stats, targets);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1465
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1466 // Collect statistics
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1467 if ( optind<argc )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1468 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1469 // Collect stats in selected regions only
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1470 bam_index_t *bam_idx = bam_index_load(bam_fname);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1471 if (bam_idx == 0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1472 error("Random alignment retrieval only works for indexed BAM files.\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1473
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1474 int i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1475 for (i=optind; i<argc; i++)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1476 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1477 int tid, beg, end;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1478 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1479 if ( tid < 0 ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1480 reset_regions(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1481 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1482 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1483 bam_index_destroy(bam_idx);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1484 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1485 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1486 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1487 // Stream through the entire BAM ignoring off-target regions if -t is given
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1488 while (samread(sam,bam_line) >= 0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1489 collect_stats(bam_line,stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1490 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1491 round_buffer_flush(stats,-1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1492
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1493 output_stats(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1494
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1495 bam_destroy1(bam_line);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1496 samclose(stats->sam);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1497 if (stats->fai) fai_destroy(stats->fai);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1498 free(stats->cov_rbuf.buffer); free(stats->cov);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1499 free(stats->quals_1st); free(stats->quals_2nd);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1500 free(stats->gc_1st); free(stats->gc_2nd);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1501 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1502 free(stats->gcd);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1503 free(stats->rseq_buf);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1504 free(stats->mpc_buf);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1505 free(stats->acgt_cycles);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1506 free(stats->read_lengths);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1507 free(stats->insertions);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1508 free(stats->deletions);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1509 free(stats->ins_cycles_1st);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1510 free(stats->ins_cycles_2nd);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1511 free(stats->del_cycles_1st);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1512 free(stats->del_cycles_2nd);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1513 destroy_regions(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1514 free(stats);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1515 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1516
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1517 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1518 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1519
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1520
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1521