Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwtaln.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #include <stdio.h> | |
2 #include <unistd.h> | |
3 #include <math.h> | |
4 #include <stdlib.h> | |
5 #include <string.h> | |
6 #include <time.h> | |
7 #include <stdint.h> | |
8 #ifdef HAVE_CONFIG_H | |
9 #include "config.h" | |
10 #endif | |
11 #include "bwtaln.h" | |
12 #include "bwtgap.h" | |
13 #include "utils.h" | |
14 | |
15 #ifdef HAVE_PTHREAD | |
16 #define THREAD_BLOCK_SIZE 1024 | |
17 #include <pthread.h> | |
18 static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER; | |
19 #endif | |
20 | |
21 gap_opt_t *gap_init_opt() | |
22 { | |
23 gap_opt_t *o; | |
24 o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); | |
25 /* IMPORTANT: s_mm*10 should be about the average base error | |
26 rate. Voilating this requirement will break pairing! */ | |
27 o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; | |
28 o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; | |
29 o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; | |
30 o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; | |
31 o->seed_len = 32; o->max_seed_diff = 2; | |
32 o->fnr = 0.04; | |
33 o->n_threads = 1; | |
34 o->max_top2 = 30; | |
35 o->trim_qual = 0; | |
36 return o; | |
37 } | |
38 | |
39 int bwa_cal_maxdiff(int l, double err, double thres) | |
40 { | |
41 double elambda = exp(-l * err); | |
42 double sum, y = 1.0; | |
43 int k, x = 1; | |
44 for (k = 1, sum = elambda; k < 1000; ++k) { | |
45 y *= l * err; | |
46 x *= k; | |
47 sum += elambda * y / x; | |
48 if (1.0 - sum < thres) return k; | |
49 } | |
50 return 2; | |
51 } | |
52 | |
53 // width must be filled as zero | |
54 static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width) | |
55 { | |
56 bwtint_t k, l, ok, ol; | |
57 int i, bid; | |
58 bid = 0; | |
59 k = 0; l = rbwt->seq_len; | |
60 for (i = 0; i < len; ++i) { | |
61 ubyte_t c = str[i]; | |
62 if (c < 4) { | |
63 bwt_2occ(rbwt, k - 1, l, c, &ok, &ol); | |
64 k = rbwt->L2[c] + ok + 1; | |
65 l = rbwt->L2[c] + ol; | |
66 } | |
67 if (k > l || c > 3) { // then restart | |
68 k = 0; | |
69 l = rbwt->seq_len; | |
70 ++bid; | |
71 } | |
72 width[i].w = l - k + 1; | |
73 width[i].bid = bid; | |
74 } | |
75 width[len].w = 0; | |
76 width[len].bid = ++bid; | |
77 return bid; | |
78 } | |
79 | |
80 void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) | |
81 { | |
82 int i, max_l = 0, max_len; | |
83 gap_stack_t *stack; | |
84 bwt_width_t *w[2], *seed_w[2]; | |
85 const ubyte_t *seq[2]; | |
86 gap_opt_t local_opt = *opt; | |
87 | |
88 // initiate priority stack | |
89 for (i = max_len = 0; i != n_seqs; ++i) | |
90 if (seqs[i].len > max_len) max_len = seqs[i].len; | |
91 if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); | |
92 if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; | |
93 stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); | |
94 | |
95 seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); | |
96 seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); | |
97 w[0] = w[1] = 0; | |
98 for (i = 0; i != n_seqs; ++i) { | |
99 bwa_seq_t *p = seqs + i; | |
100 #ifdef HAVE_PTHREAD | |
101 if (opt->n_threads > 1) { | |
102 pthread_mutex_lock(&g_seq_lock); | |
103 if (p->tid < 0) { // unassigned | |
104 int j; | |
105 for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j) | |
106 seqs[j].tid = tid; | |
107 } else if (p->tid != tid) { | |
108 pthread_mutex_unlock(&g_seq_lock); | |
109 continue; | |
110 } | |
111 pthread_mutex_unlock(&g_seq_lock); | |
112 } | |
113 #endif | |
114 p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; | |
115 seq[0] = p->seq; seq[1] = p->rseq; | |
116 if (max_l < p->len) { | |
117 max_l = p->len; | |
118 w[0] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t)); | |
119 w[1] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t)); | |
120 } | |
121 bwt_cal_width(bwt[0], p->len, seq[0], w[0]); | |
122 bwt_cal_width(bwt[1], p->len, seq[1], w[1]); | |
123 if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); | |
124 local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; | |
125 if (p->len > opt->seed_len) { | |
126 bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]); | |
127 bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]); | |
128 } | |
129 // core function | |
130 p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); | |
131 // store the alignment | |
132 free(p->name); free(p->seq); free(p->rseq); free(p->qual); | |
133 p->name = 0; p->seq = p->rseq = p->qual = 0; | |
134 } | |
135 free(seed_w[0]); free(seed_w[1]); | |
136 free(w[0]); free(w[1]); | |
137 gap_destroy_stack(stack); | |
138 } | |
139 | |
140 #ifdef HAVE_PTHREAD | |
141 typedef struct { | |
142 int tid; | |
143 bwt_t *bwt[2]; | |
144 int n_seqs; | |
145 bwa_seq_t *seqs; | |
146 const gap_opt_t *opt; | |
147 } thread_aux_t; | |
148 | |
149 static void *worker(void *data) | |
150 { | |
151 thread_aux_t *d = (thread_aux_t*)data; | |
152 bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); | |
153 return 0; | |
154 } | |
155 #endif | |
156 | |
157 void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) | |
158 { | |
159 int i, n_seqs, tot_seqs = 0; | |
160 bwa_seq_t *seqs; | |
161 bwa_seqio_t *ks; | |
162 clock_t t; | |
163 bwt_t *bwt[2]; | |
164 | |
165 // initialization | |
166 ks = bwa_seq_open(fn_fa); | |
167 | |
168 { // load BWT | |
169 char *str = (char*)calloc(strlen(prefix) + 10, 1); | |
170 strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); | |
171 strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); | |
172 free(str); | |
173 } | |
174 | |
175 // core loop | |
176 fwrite(opt, sizeof(gap_opt_t), 1, stdout); | |
177 while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) { | |
178 tot_seqs += n_seqs; | |
179 t = clock(); | |
180 | |
181 fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); | |
182 | |
183 #ifdef HAVE_PTHREAD | |
184 if (opt->n_threads <= 1) { // no multi-threading at all | |
185 bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); | |
186 } else { | |
187 pthread_t *tid; | |
188 pthread_attr_t attr; | |
189 thread_aux_t *data; | |
190 int j; | |
191 pthread_attr_init(&attr); | |
192 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); | |
193 data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); | |
194 tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); | |
195 for (j = 0; j < opt->n_threads; ++j) { | |
196 data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1]; | |
197 data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; | |
198 pthread_create(&tid[j], &attr, worker, data + j); | |
199 } | |
200 for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); | |
201 free(data); free(tid); | |
202 } | |
203 #else | |
204 bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); | |
205 #endif | |
206 | |
207 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); | |
208 | |
209 t = clock(); | |
210 fprintf(stderr, "[bwa_aln_core] write to the disk... "); | |
211 for (i = 0; i < n_seqs; ++i) { | |
212 bwa_seq_t *p = seqs + i; | |
213 fwrite(&p->n_aln, 4, 1, stdout); | |
214 if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); | |
215 } | |
216 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); | |
217 | |
218 bwa_free_read_seq(n_seqs, seqs); | |
219 fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); | |
220 } | |
221 | |
222 // destroy | |
223 bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); | |
224 bwa_seq_close(ks); | |
225 } | |
226 | |
227 int bwa_aln(int argc, char *argv[]) | |
228 { | |
229 int c, opte = -1; | |
230 gap_opt_t *opt; | |
231 | |
232 opt = gap_init_opt(); | |
233 while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:")) >= 0) { | |
234 switch (c) { | |
235 case 'n': | |
236 if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; | |
237 else opt->max_diff = atoi(optarg), opt->fnr = -1.0; | |
238 break; | |
239 case 'o': opt->max_gapo = atoi(optarg); break; | |
240 case 'e': opte = atoi(optarg); break; | |
241 case 'M': opt->s_mm = atoi(optarg); break; | |
242 case 'O': opt->s_gapo = atoi(optarg); break; | |
243 case 'E': opt->s_gape = atoi(optarg); break; | |
244 case 'd': opt->max_del_occ = atoi(optarg); break; | |
245 case 'i': opt->indel_end_skip = atoi(optarg); break; | |
246 case 'l': opt->seed_len = atoi(optarg); break; | |
247 case 'k': opt->max_seed_diff = atoi(optarg); break; | |
248 case 'm': opt->max_entries = atoi(optarg); break; | |
249 case 't': opt->n_threads = atoi(optarg); break; | |
250 case 'L': opt->mode |= BWA_MODE_LOGGAP; break; | |
251 case 'R': opt->max_top2 = atoi(optarg); break; | |
252 case 'q': opt->trim_qual = atoi(optarg); break; | |
253 case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; | |
254 case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; | |
255 case 'f': freopen(optarg, "wb", stdout); break; | |
256 default: return 1; | |
257 } | |
258 } | |
259 if (opte > 0) { | |
260 opt->max_gape = opte; | |
261 opt->mode &= ~BWA_MODE_GAPE; | |
262 } | |
263 | |
264 if (optind + 2 > argc) { | |
265 fprintf(stderr, "\n"); | |
266 fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n"); | |
267 fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", | |
268 BWA_AVG_ERR, opt->fnr); | |
269 fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); | |
270 fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); | |
271 fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); | |
272 fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); | |
273 fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); | |
274 fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); | |
275 fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); | |
276 fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); | |
277 fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); | |
278 fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); | |
279 fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); | |
280 fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); | |
281 fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); | |
282 fprintf(stderr, " -c input sequences are in the color space\n"); | |
283 fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); | |
284 fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); | |
285 fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); | |
286 fprintf(stderr, "\n"); | |
287 return 1; | |
288 } | |
289 if (opt->fnr > 0.0) { | |
290 int i, k; | |
291 for (i = 17, k = 0; i <= 250; ++i) { | |
292 int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); | |
293 if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); | |
294 k = l; | |
295 } | |
296 } | |
297 bwa_aln_core(argv[optind], argv[optind+1], opt); | |
298 free(opt); | |
299 return 0; | |
300 } | |
301 | |
302 /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, | |
303 __cigar_op and __cigar_len while keeping stdaln stand alone */ | |
304 bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) | |
305 { | |
306 uint32_t *cigar32; | |
307 bwa_cigar_t *cigar; | |
308 int i; | |
309 cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); | |
310 cigar = (bwa_cigar_t*)cigar32; | |
311 for (i = 0; i < *n_cigar; ++i) | |
312 cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); | |
313 return cigar; | |
314 } | |
315 |