| 0 | 1 #ifndef BWAMEM_H_ | 
|  | 2 #define BWAMEM_H_ | 
|  | 3 | 
|  | 4 #include "bwt.h" | 
|  | 5 #include "bntseq.h" | 
|  | 6 #include "bwa.h" | 
|  | 7 | 
|  | 8 #define MEM_MAPQ_COEF 30.0 | 
|  | 9 #define MEM_MAPQ_MAX  60 | 
|  | 10 | 
|  | 11 struct __smem_i; | 
|  | 12 typedef struct __smem_i smem_i; | 
|  | 13 | 
|  | 14 #define MEM_F_PE        0x2 | 
|  | 15 #define MEM_F_NOPAIRING 0x4 | 
|  | 16 #define MEM_F_ALL       0x8 | 
|  | 17 #define MEM_F_NO_MULTI  0x10 | 
|  | 18 #define MEM_F_NO_RESCUE 0x20 | 
|  | 19 #define MEM_F_SELF_OVLP 0x40 | 
|  | 20 #define MEM_F_ALN_REG   0x80 | 
|  | 21 #define MEM_F_SOFTCLIP  0x200 | 
|  | 22 | 
|  | 23 typedef struct { | 
|  | 24 	int a, b;               // match score and mismatch penalty | 
|  | 25 	int o_del, e_del; | 
|  | 26 	int o_ins, e_ins; | 
|  | 27 	int pen_unpaired;       // phred-scaled penalty for unpaired reads | 
|  | 28 	int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. | 
|  | 29 	int w;                  // band width | 
|  | 30 	int zdrop;              // Z-dropoff | 
|  | 31 | 
|  | 32 	int T;                  // output score threshold; only affecting output | 
|  | 33 	int flag;               // see MEM_F_* macros | 
|  | 34 	int min_seed_len;       // minimum seed length | 
|  | 35 	int min_chain_weight; | 
|  | 36 	int max_chain_extend; | 
|  | 37 	float split_factor;     // split into a seed if MEM is longer than min_seed_len*split_factor | 
|  | 38 	int split_width;        // split into a seed if its occurence is smaller than this value | 
|  | 39 	int max_occ;            // skip a seed if its occurence is larger than this value | 
|  | 40 	int max_chain_gap;      // do not chain seed if it is max_chain_gap-bp away from the closest seed | 
|  | 41 	int n_threads;          // number of threads | 
|  | 42 	int chunk_size;         // process chunk_size-bp sequences in a batch | 
|  | 43 	float mask_level;       // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits | 
|  | 44 	float drop_ratio;       // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain | 
|  | 45 	float XA_drop_ratio;    // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag | 
|  | 46 	float mask_level_redun; | 
|  | 47 	float mapQ_coef_len; | 
|  | 48 	int mapQ_coef_fac; | 
|  | 49 	int max_ins;            // when estimating insert size distribution, skip pairs with insert longer than this value | 
|  | 50 	int max_matesw;         // perform maximally max_matesw rounds of mate-SW for each end | 
|  | 51 	int max_hits;           // if there are max_hits or fewer, output them all | 
|  | 52 	int8_t mat[25];         // scoring matrix; mat[0] == 0 if unset | 
|  | 53 } mem_opt_t; | 
|  | 54 | 
|  | 55 typedef struct { | 
|  | 56 	int64_t rb, re; // [rb,re): reference sequence in the alignment | 
|  | 57 	int qb, qe;     // [qb,qe): query sequence in the alignment | 
|  | 58 	int rid;        // reference seq ID | 
|  | 59 	int score;      // best local SW score | 
|  | 60 	int truesc;     // actual score corresponding to the aligned region; possibly smaller than $score | 
|  | 61 	int sub;        // 2nd best SW score | 
|  | 62 	int csub;       // SW score of a tandem hit | 
|  | 63 	int sub_n;      // approximate number of suboptimal hits | 
|  | 64 	int w;          // actual band width used in extension | 
|  | 65 	int seedcov;    // length of regions coverged by seeds | 
|  | 66 	int secondary;  // index of the parent hit shadowing the current hit; <0 if primary | 
|  | 67 	int seedlen0;   // length of the starting seed | 
|  | 68 	int n_comp;     // number of sub-alignments chained together | 
|  | 69 	float frac_rep; | 
|  | 70 	uint64_t hash; | 
|  | 71 } mem_alnreg_t; | 
|  | 72 | 
|  | 73 typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; | 
|  | 74 | 
|  | 75 typedef struct { | 
|  | 76 	int low, high;   // lower and upper bounds within which a read pair is considered to be properly paired | 
|  | 77 	int failed;      // non-zero if the orientation is not supported by sufficient data | 
|  | 78 	double avg, std; // mean and stddev of the insert size distribution | 
|  | 79 } mem_pestat_t; | 
|  | 80 | 
|  | 81 typedef struct { // This struct is only used for the convenience of API. | 
|  | 82 	int64_t pos;     // forward strand 5'-end mapping position | 
|  | 83 	int rid;         // reference sequence index in bntseq_t; <0 for unmapped | 
|  | 84 	int flag;        // extra flag | 
|  | 85 	uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance | 
|  | 86 	int n_cigar;     // number of CIGAR operations | 
|  | 87 	uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 | 
|  | 88 	char *XA;        // alternative mappings | 
|  | 89 | 
|  | 90 	int score, sub; | 
|  | 91 } mem_aln_t; | 
|  | 92 | 
|  | 93 #ifdef __cplusplus | 
|  | 94 extern "C" { | 
|  | 95 #endif | 
|  | 96 | 
|  | 97 	smem_i *smem_itr_init(const bwt_t *bwt); | 
|  | 98 	void smem_itr_destroy(smem_i *itr); | 
|  | 99 	void smem_set_query(smem_i *itr, int len, const uint8_t *query); | 
|  | 100 	const bwtintv_v *smem_next(smem_i *itr); | 
|  | 101 | 
|  | 102 	mem_opt_t *mem_opt_init(void); | 
|  | 103 	void mem_fill_scmat(int a, int b, int8_t mat[25]); | 
|  | 104 | 
|  | 105 	/** | 
|  | 106 	 * Align a batch of sequences and generate the alignments in the SAM format | 
|  | 107 	 * | 
|  | 108 	 * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. | 
|  | 109 	 * Note that $seqs[i].sam may consist of several SAM lines if the | 
|  | 110 	 * corresponding sequence has multiple primary hits. | 
|  | 111 	 * | 
|  | 112 	 * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query | 
|  | 113 	 * sequences must be interleaved: $n must be an even number and the 2i-th | 
|  | 114 	 * sequence and the (2i+1)-th sequence constitute a read pair. In this | 
|  | 115 	 * mode, there should be enough (typically >50) unique pairs for the | 
|  | 116 	 * routine to infer the orientation and insert size. | 
|  | 117 	 * | 
|  | 118 	 * @param opt    alignment parameters | 
|  | 119 	 * @param bwt    FM-index of the reference sequence | 
|  | 120 	 * @param bns    Information of the reference | 
|  | 121 	 * @param pac    2-bit encoded reference | 
|  | 122 	 * @param n      number of query sequences | 
|  | 123 	 * @param seqs   query sequences; $seqs[i].seq/sam to be modified after the call | 
|  | 124 	 * @param pes0   insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, | 
|  | 125 	 *               corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. | 
|  | 126 	 */ | 
|  | 127 	void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); | 
|  | 128 | 
|  | 129 	/** | 
|  | 130 	 * Find the aligned regions for one query sequence | 
|  | 131 	 * | 
|  | 132 	 * Note that this routine does not generate CIGAR. CIGAR should be | 
|  | 133 	 * generated later by mem_reg2aln() below. | 
|  | 134 	 * | 
|  | 135 	 * @param opt    alignment parameters | 
|  | 136 	 * @param bwt    FM-index of the reference sequence | 
|  | 137 	 * @param bns    Information of the reference | 
|  | 138 	 * @param pac    2-bit encoded reference | 
|  | 139 	 * @param l_seq  length of query sequence | 
|  | 140 	 * @param seq    query sequence | 
|  | 141 	 * | 
|  | 142 	 * @return       list of aligned regions. | 
|  | 143 	 */ | 
|  | 144 	mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); | 
|  | 145 | 
|  | 146 	/** | 
|  | 147 	 * Generate CIGAR and forward-strand position from alignment region | 
|  | 148 	 * | 
|  | 149 	 * @param opt    alignment parameters | 
|  | 150 	 * @param bns    Information of the reference | 
|  | 151 	 * @param pac    2-bit encoded reference | 
|  | 152 	 * @param l_seq  length of query sequence | 
|  | 153 	 * @param seq    query sequence | 
|  | 154 	 * @param ar     one alignment region | 
|  | 155 	 * | 
|  | 156 	 * @return       CIGAR, strand, mapping quality and forward-strand position | 
|  | 157 	 */ | 
|  | 158 	mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); | 
|  | 159 	mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); | 
|  | 160 | 
|  | 161 	/** | 
|  | 162 	 * Infer the insert size distribution from interleaved alignment regions | 
|  | 163 	 * | 
|  | 164 	 * This function can be called after mem_align1(), as long as paired-end | 
|  | 165 	 * reads are properly interleaved. | 
|  | 166 	 * | 
|  | 167 	 * @param opt    alignment parameters | 
|  | 168 	 * @param l_pac  length of concatenated reference sequence | 
|  | 169 	 * @param n      number of query sequences; must be an even number | 
|  | 170 	 * @param regs   region array of size $n; 2i-th and (2i+1)-th elements constitute a pair | 
|  | 171 	 * @param pes    inferred insert size distribution (output) | 
|  | 172 	 */ | 
|  | 173 	void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); | 
|  | 174 | 
|  | 175 #ifdef __cplusplus | 
|  | 176 } | 
|  | 177 #endif | 
|  | 178 | 
|  | 179 #endif |