annotate bwa-0.7.9a/bwamem.h @ 0:ce5a8082bbb8 draft

Uploaded
author xilinxu
date Thu, 14 Aug 2014 02:16:48 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
1 #ifndef BWAMEM_H_
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
2 #define BWAMEM_H_
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
3
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
4 #include "bwt.h"
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
5 #include "bntseq.h"
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
6 #include "bwa.h"
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
7
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
8 #define MEM_MAPQ_COEF 30.0
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
9 #define MEM_MAPQ_MAX 60
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
10
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
11 struct __smem_i;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
12 typedef struct __smem_i smem_i;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
13
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
14 #define MEM_F_PE 0x2
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
15 #define MEM_F_NOPAIRING 0x4
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
16 #define MEM_F_ALL 0x8
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
17 #define MEM_F_NO_MULTI 0x10
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
18 #define MEM_F_NO_RESCUE 0x20
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
19 #define MEM_F_SELF_OVLP 0x40
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
20 #define MEM_F_ALN_REG 0x80
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
21 #define MEM_F_SOFTCLIP 0x200
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
22
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
23 typedef struct {
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
24 int a, b; // match score and mismatch penalty
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
25 int o_del, e_del;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
26 int o_ins, e_ins;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
27 int pen_unpaired; // phred-scaled penalty for unpaired reads
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
28 int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
29 int w; // band width
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
30 int zdrop; // Z-dropoff
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
31
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
32 int T; // output score threshold; only affecting output
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
33 int flag; // see MEM_F_* macros
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
34 int min_seed_len; // minimum seed length
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
35 int min_chain_weight;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
36 int max_chain_extend;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
37 float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
38 int split_width; // split into a seed if its occurence is smaller than this value
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
39 int max_occ; // skip a seed if its occurence is larger than this value
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
40 int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
41 int n_threads; // number of threads
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
42 int chunk_size; // process chunk_size-bp sequences in a batch
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
43 float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
44 float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
45 float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
46 float mask_level_redun;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
47 float mapQ_coef_len;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
48 int mapQ_coef_fac;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
49 int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
50 int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
51 int max_hits; // if there are max_hits or fewer, output them all
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
52 int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
53 } mem_opt_t;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
54
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
55 typedef struct {
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
56 int64_t rb, re; // [rb,re): reference sequence in the alignment
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
57 int qb, qe; // [qb,qe): query sequence in the alignment
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
58 int rid; // reference seq ID
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
59 int score; // best local SW score
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
60 int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
61 int sub; // 2nd best SW score
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
62 int csub; // SW score of a tandem hit
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
63 int sub_n; // approximate number of suboptimal hits
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
64 int w; // actual band width used in extension
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
65 int seedcov; // length of regions coverged by seeds
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
66 int secondary; // index of the parent hit shadowing the current hit; <0 if primary
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
67 int seedlen0; // length of the starting seed
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
68 int n_comp; // number of sub-alignments chained together
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
69 float frac_rep;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
70 uint64_t hash;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
71 } mem_alnreg_t;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
72
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
73 typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
74
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
75 typedef struct {
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
76 int low, high; // lower and upper bounds within which a read pair is considered to be properly paired
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
77 int failed; // non-zero if the orientation is not supported by sufficient data
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
78 double avg, std; // mean and stddev of the insert size distribution
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
79 } mem_pestat_t;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
80
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
81 typedef struct { // This struct is only used for the convenience of API.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
82 int64_t pos; // forward strand 5'-end mapping position
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
83 int rid; // reference sequence index in bntseq_t; <0 for unmapped
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
84 int flag; // extra flag
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
85 uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
86 int n_cigar; // number of CIGAR operations
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
87 uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
88 char *XA; // alternative mappings
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
89
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
90 int score, sub;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
91 } mem_aln_t;
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
92
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
93 #ifdef __cplusplus
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
94 extern "C" {
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
95 #endif
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
96
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
97 smem_i *smem_itr_init(const bwt_t *bwt);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
98 void smem_itr_destroy(smem_i *itr);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
99 void smem_set_query(smem_i *itr, int len, const uint8_t *query);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
100 const bwtintv_v *smem_next(smem_i *itr);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
101
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
102 mem_opt_t *mem_opt_init(void);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
103 void mem_fill_scmat(int a, int b, int8_t mat[25]);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
104
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
105 /**
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
106 * Align a batch of sequences and generate the alignments in the SAM format
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
107 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
108 * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
109 * Note that $seqs[i].sam may consist of several SAM lines if the
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
110 * corresponding sequence has multiple primary hits.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
111 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
112 * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
113 * sequences must be interleaved: $n must be an even number and the 2i-th
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
114 * sequence and the (2i+1)-th sequence constitute a read pair. In this
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
115 * mode, there should be enough (typically >50) unique pairs for the
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
116 * routine to infer the orientation and insert size.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
117 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
118 * @param opt alignment parameters
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
119 * @param bwt FM-index of the reference sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
120 * @param bns Information of the reference
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
121 * @param pac 2-bit encoded reference
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
122 * @param n number of query sequences
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
123 * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
124 * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
125 * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
126 */
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
127 void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
128
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
129 /**
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
130 * Find the aligned regions for one query sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
131 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
132 * Note that this routine does not generate CIGAR. CIGAR should be
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
133 * generated later by mem_reg2aln() below.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
134 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
135 * @param opt alignment parameters
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
136 * @param bwt FM-index of the reference sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
137 * @param bns Information of the reference
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
138 * @param pac 2-bit encoded reference
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
139 * @param l_seq length of query sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
140 * @param seq query sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
141 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
142 * @return list of aligned regions.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
143 */
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
144 mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
145
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
146 /**
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
147 * Generate CIGAR and forward-strand position from alignment region
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
148 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
149 * @param opt alignment parameters
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
150 * @param bns Information of the reference
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
151 * @param pac 2-bit encoded reference
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
152 * @param l_seq length of query sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
153 * @param seq query sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
154 * @param ar one alignment region
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
155 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
156 * @return CIGAR, strand, mapping quality and forward-strand position
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
157 */
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
158 mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
159 mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
160
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
161 /**
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
162 * Infer the insert size distribution from interleaved alignment regions
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
163 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
164 * This function can be called after mem_align1(), as long as paired-end
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
165 * reads are properly interleaved.
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
166 *
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
167 * @param opt alignment parameters
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
168 * @param l_pac length of concatenated reference sequence
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
169 * @param n number of query sequences; must be an even number
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
170 * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
171 * @param pes inferred insert size distribution (output)
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
172 */
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
173 void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
174
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
175 #ifdef __cplusplus
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
176 }
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
177 #endif
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
178
ce5a8082bbb8 Uploaded
xilinxu
parents:
diff changeset
179 #endif