annotate pyPRADA_1.2/tools/bwa-0.5.7-mh/bntseq.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 /* The MIT License
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 Copyright (c) 2008 Genome Research Ltd (GRL).
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 Permission is hereby granted, free of charge, to any person obtaining
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 a copy of this software and associated documentation files (the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 "Software"), to deal in the Software without restriction, including
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 without limitation the rights to use, copy, modify, merge, publish,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 distribute, sublicense, and/or sell copies of the Software, and to
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 permit persons to whom the Software is furnished to do so, subject to
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 the following conditions:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 The above copyright notice and this permission notice shall be
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 included in all copies or substantial portions of the Software.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 SOFTWARE.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 #include <stdio.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 #include <stdlib.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 #include <string.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 #include <zlib.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 #include "bntseq.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 #include "main.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 #include "utils.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 #include "kseq.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 KSEQ_INIT(gzFile, gzread)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 unsigned char nst_nt4_table[256] = {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 };
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 void bns_dump(const bntseq_t *bns, const char *prefix)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 char str[1024];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 FILE *fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 { // dump .ann
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 strcpy(str, prefix); strcat(str, ".ann");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 fp = xopen(str, "w");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 for (i = 0; i != bns->n_seqs; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 bntann1_t *p = bns->anns + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 fprintf(fp, "%d %s", p->gi, p->name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 else fprintf(fp, "\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 { // dump .amb
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 strcpy(str, prefix); strcat(str, ".amb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 fp = xopen(str, "w");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 for (i = 0; i != bns->n_holes; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 bntamb1_t *p = bns->ambs + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 char str[1024];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 FILE *fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 bntseq_t *bns;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 long long xx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 { // read .ann
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 fp = xopen(ann_filename, "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 bns->l_pac = xx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 for (i = 0; i < bns->n_seqs; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 bntann1_t *p = bns->anns + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 char *q = str;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 int c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 // read gi and sequence name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 fscanf(fp, "%u%s", &p->gi, str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 p->name = strdup(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 // read fasta comments
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 *q = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 else p->anno = strdup("");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 // read the rest
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 p->offset = xx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 { // read .amb
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 int64_t l_pac;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 int32_t n_seqs;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 fp = xopen(amb_filename, "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 l_pac = xx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 for (i = 0; i < bns->n_holes; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 bntamb1_t *p = bns->ambs + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 fscanf(fp, "%lld%d%s", &xx, &p->len, str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 p->offset = xx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 p->amb = str[0];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 { // open .pac
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 bns->fp_pac = xopen(pac_filename, "rb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 return bns;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 bntseq_t *bns_restore(const char *prefix)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 char ann_filename[1024], amb_filename[1024], pac_filename[1024];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 strcat(strcpy(ann_filename, prefix), ".ann");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 strcat(strcpy(amb_filename, prefix), ".amb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 strcat(strcpy(pac_filename, prefix), ".pac");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 return bns_restore_core(ann_filename, amb_filename, pac_filename);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 void bns_destroy(bntseq_t *bns)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 if (bns == 0) return;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 if (bns->fp_pac) fclose(bns->fp_pac);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 free(bns->ambs);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 for (i = 0; i < bns->n_seqs; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 free(bns->anns[i].name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 free(bns->anns[i].anno);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 free(bns->anns);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 free(bns);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 kseq_t *seq;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 char name[1024];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 bntseq_t *bns;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 bntamb1_t *q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 int l_buf;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 unsigned char buf[0x10000];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 int32_t m_seqs, m_holes, l, i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 FILE *fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 // initialization
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 seq = kseq_init(fp_fa);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 bns->seed = 11; // fixed seed for random generator
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 srand48(bns->seed);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 m_seqs = m_holes = 8;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 q = bns->ambs;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 l_buf = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 strcpy(name, prefix); strcat(name, ".pac");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 fp = xopen(name, "wb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 memset(buf, 0, 0x10000);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 // read sequences
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191 while ((l = kseq_read(seq)) >= 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 bntann1_t *p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193 int lasts;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 if (bns->n_seqs == m_seqs) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195 m_seqs <<= 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 p = bns->anns + bns->n_seqs;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 p->name = strdup((char*)seq->name.s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200 p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201 p->gi = 0; p->len = l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
202 p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
203 p->n_ambs = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
204 for (i = 0, lasts = 0; i < l; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
205 int c = nst_nt4_table[(int)seq->seq.s[i]];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
206 if (c >= 4) { // N
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
207 if (lasts == seq->seq.s[i]) { // contiguous N
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
208 ++q->len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
209 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
210 if (bns->n_holes == m_holes) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
211 m_holes <<= 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
212 bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
213 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
214 q = bns->ambs + bns->n_holes;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
215 q->len = 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
216 q->offset = p->offset + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
217 q->amb = seq->seq.s[i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
218 ++p->n_ambs;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
219 ++bns->n_holes;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
220 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
221 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
222 lasts = seq->seq.s[i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
223 { // fill buffer
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
224 if (c >= 4) c = lrand48()&0x3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
225 if (l_buf == 0x40000) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
226 fwrite(buf, 1, 0x10000, fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
227 memset(buf, 0, 0x10000);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
228 l_buf = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
229 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
230 buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
231 ++l_buf;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
232 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
233 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
234 ++bns->n_seqs;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
235 bns->l_pac += seq->seq.l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
236 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
237 xassert(bns->l_pac, "zero length sequence.");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
238 { // finalize .pac file
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
239 ubyte_t ct;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
240 fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
241 // the following codes make the pac file size always (l_pac/4+1+1)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
242 if (bns->l_pac % 4 == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
243 ct = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
244 fwrite(&ct, 1, 1, fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
245 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
246 ct = bns->l_pac % 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
247 fwrite(&ct, 1, 1, fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
248 // close .pac file
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
249 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
250 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
251 bns_dump(bns, prefix);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
252 bns_destroy(bns);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
253 kseq_destroy(seq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
254 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
255
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
256 int bwa_fa2pac(int argc, char *argv[])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
257 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
258 gzFile fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
259 if (argc < 2) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
260 fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
261 return 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
262 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
263 fp = xzopen(argv[1], "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
264 bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
265 gzclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
266 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
267 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
268
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
269 int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
270 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
271 int left, mid, right, nn;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
272 if (pac_coor >= bns->l_pac)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
273 err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
274 // binary search for the sequence ID. Note that this is a bit different from the following one...
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
275 left = 0; mid = 0; right = bns->n_seqs;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
276 while (left < right) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
277 mid = (left + right) >> 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
278 if (pac_coor >= bns->anns[mid].offset) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
279 if (mid == bns->n_seqs - 1) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
280 if (pac_coor < bns->anns[mid+1].offset) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
281 left = mid + 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
282 } else right = mid;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
283 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
284 *real_seq = mid;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
285 // binary search for holes
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
286 left = 0; right = bns->n_holes; nn = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
287 while (left < right) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
288 int64_t mid = (left + right) >> 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
289 if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
290 else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
291 else { // overlap
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
292 if (pac_coor >= bns->ambs[mid].offset) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
293 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
294 bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
295 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
296 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
297 bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
298 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
299 break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
300 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
301 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
302 return nn;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
303 }