comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwtaln.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #include <stdio.h>
2 #include <unistd.h>
3 #include <math.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <time.h>
7 #include <stdint.h>
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11 #include "bwtaln.h"
12 #include "bwtgap.h"
13 #include "utils.h"
14
15 #ifdef HAVE_PTHREAD
16 #define THREAD_BLOCK_SIZE 1024
17 #include <pthread.h>
18 static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
19 #endif
20
21 gap_opt_t *gap_init_opt()
22 {
23 gap_opt_t *o;
24 o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
25 /* IMPORTANT: s_mm*10 should be about the average base error
26 rate. Voilating this requirement will break pairing! */
27 o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
28 o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
29 o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
30 o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
31 o->seed_len = 32; o->max_seed_diff = 2;
32 o->fnr = 0.04;
33 o->n_threads = 1;
34 o->max_top2 = 30;
35 o->trim_qual = 0;
36 return o;
37 }
38
39 int bwa_cal_maxdiff(int l, double err, double thres)
40 {
41 double elambda = exp(-l * err);
42 double sum, y = 1.0;
43 int k, x = 1;
44 for (k = 1, sum = elambda; k < 1000; ++k) {
45 y *= l * err;
46 x *= k;
47 sum += elambda * y / x;
48 if (1.0 - sum < thres) return k;
49 }
50 return 2;
51 }
52
53 // width must be filled as zero
54 static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
55 {
56 bwtint_t k, l, ok, ol;
57 int i, bid;
58 bid = 0;
59 k = 0; l = rbwt->seq_len;
60 for (i = 0; i < len; ++i) {
61 ubyte_t c = str[i];
62 if (c < 4) {
63 bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
64 k = rbwt->L2[c] + ok + 1;
65 l = rbwt->L2[c] + ol;
66 }
67 if (k > l || c > 3) { // then restart
68 k = 0;
69 l = rbwt->seq_len;
70 ++bid;
71 }
72 width[i].w = l - k + 1;
73 width[i].bid = bid;
74 }
75 width[len].w = 0;
76 width[len].bid = ++bid;
77 return bid;
78 }
79
80 void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
81 {
82 int i, max_l = 0, max_len;
83 gap_stack_t *stack;
84 bwt_width_t *w[2], *seed_w[2];
85 const ubyte_t *seq[2];
86 gap_opt_t local_opt = *opt;
87
88 // initiate priority stack
89 for (i = max_len = 0; i != n_seqs; ++i)
90 if (seqs[i].len > max_len) max_len = seqs[i].len;
91 if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
92 if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
93 stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
94
95 seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
96 seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
97 w[0] = w[1] = 0;
98 for (i = 0; i != n_seqs; ++i) {
99 bwa_seq_t *p = seqs + i;
100 #ifdef HAVE_PTHREAD
101 if (opt->n_threads > 1) {
102 pthread_mutex_lock(&g_seq_lock);
103 if (p->tid < 0) { // unassigned
104 int j;
105 for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j)
106 seqs[j].tid = tid;
107 } else if (p->tid != tid) {
108 pthread_mutex_unlock(&g_seq_lock);
109 continue;
110 }
111 pthread_mutex_unlock(&g_seq_lock);
112 }
113 #endif
114 p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
115 seq[0] = p->seq; seq[1] = p->rseq;
116 if (max_l < p->len) {
117 max_l = p->len;
118 w[0] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t));
119 w[1] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t));
120 }
121 bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
122 bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
123 if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
124 local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
125 if (p->len > opt->seed_len) {
126 bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
127 bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
128 }
129 // core function
130 p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
131 // store the alignment
132 free(p->name); free(p->seq); free(p->rseq); free(p->qual);
133 p->name = 0; p->seq = p->rseq = p->qual = 0;
134 }
135 free(seed_w[0]); free(seed_w[1]);
136 free(w[0]); free(w[1]);
137 gap_destroy_stack(stack);
138 }
139
140 #ifdef HAVE_PTHREAD
141 typedef struct {
142 int tid;
143 bwt_t *bwt[2];
144 int n_seqs;
145 bwa_seq_t *seqs;
146 const gap_opt_t *opt;
147 } thread_aux_t;
148
149 static void *worker(void *data)
150 {
151 thread_aux_t *d = (thread_aux_t*)data;
152 bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
153 return 0;
154 }
155 #endif
156
157 void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
158 {
159 int i, n_seqs, tot_seqs = 0;
160 bwa_seq_t *seqs;
161 bwa_seqio_t *ks;
162 clock_t t;
163 bwt_t *bwt[2];
164
165 // initialization
166 ks = bwa_seq_open(fn_fa);
167
168 { // load BWT
169 char *str = (char*)calloc(strlen(prefix) + 10, 1);
170 strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
171 strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
172 free(str);
173 }
174
175 // core loop
176 fwrite(opt, sizeof(gap_opt_t), 1, stdout);
177 while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) {
178 tot_seqs += n_seqs;
179 t = clock();
180
181 fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
182
183 #ifdef HAVE_PTHREAD
184 if (opt->n_threads <= 1) { // no multi-threading at all
185 bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
186 } else {
187 pthread_t *tid;
188 pthread_attr_t attr;
189 thread_aux_t *data;
190 int j;
191 pthread_attr_init(&attr);
192 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
193 data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
194 tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
195 for (j = 0; j < opt->n_threads; ++j) {
196 data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
197 data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
198 pthread_create(&tid[j], &attr, worker, data + j);
199 }
200 for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
201 free(data); free(tid);
202 }
203 #else
204 bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
205 #endif
206
207 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
208
209 t = clock();
210 fprintf(stderr, "[bwa_aln_core] write to the disk... ");
211 for (i = 0; i < n_seqs; ++i) {
212 bwa_seq_t *p = seqs + i;
213 fwrite(&p->n_aln, 4, 1, stdout);
214 if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
215 }
216 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
217
218 bwa_free_read_seq(n_seqs, seqs);
219 fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
220 }
221
222 // destroy
223 bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
224 bwa_seq_close(ks);
225 }
226
227 int bwa_aln(int argc, char *argv[])
228 {
229 int c, opte = -1;
230 gap_opt_t *opt;
231
232 opt = gap_init_opt();
233 while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:")) >= 0) {
234 switch (c) {
235 case 'n':
236 if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
237 else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
238 break;
239 case 'o': opt->max_gapo = atoi(optarg); break;
240 case 'e': opte = atoi(optarg); break;
241 case 'M': opt->s_mm = atoi(optarg); break;
242 case 'O': opt->s_gapo = atoi(optarg); break;
243 case 'E': opt->s_gape = atoi(optarg); break;
244 case 'd': opt->max_del_occ = atoi(optarg); break;
245 case 'i': opt->indel_end_skip = atoi(optarg); break;
246 case 'l': opt->seed_len = atoi(optarg); break;
247 case 'k': opt->max_seed_diff = atoi(optarg); break;
248 case 'm': opt->max_entries = atoi(optarg); break;
249 case 't': opt->n_threads = atoi(optarg); break;
250 case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
251 case 'R': opt->max_top2 = atoi(optarg); break;
252 case 'q': opt->trim_qual = atoi(optarg); break;
253 case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
254 case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
255 case 'f': freopen(optarg, "wb", stdout); break;
256 default: return 1;
257 }
258 }
259 if (opte > 0) {
260 opt->max_gape = opte;
261 opt->mode &= ~BWA_MODE_GAPE;
262 }
263
264 if (optind + 2 > argc) {
265 fprintf(stderr, "\n");
266 fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
267 fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
268 BWA_AVG_ERR, opt->fnr);
269 fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
270 fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
271 fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
272 fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
273 fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
274 fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
275 fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
276 fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
277 fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
278 fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
279 fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
280 fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
281 fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
282 fprintf(stderr, " -c input sequences are in the color space\n");
283 fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
284 fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
285 fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
286 fprintf(stderr, "\n");
287 return 1;
288 }
289 if (opt->fnr > 0.0) {
290 int i, k;
291 for (i = 17, k = 0; i <= 250; ++i) {
292 int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
293 if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
294 k = l;
295 }
296 }
297 bwa_aln_core(argv[optind], argv[optind+1], opt);
298 free(opt);
299 return 0;
300 }
301
302 /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
303 __cigar_op and __cigar_len while keeping stdaln stand alone */
304 bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
305 {
306 uint32_t *cigar32;
307 bwa_cigar_t *cigar;
308 int i;
309 cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
310 cigar = (bwa_cigar_t*)cigar32;
311 for (i = 0; i < *n_cigar; ++i)
312 cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
313 return cigar;
314 }
315