comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/ace2sam.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 /* The MIT License
2
3 Copyright (c) 2011 Heng Li <lh3@live.co.uk>
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <zlib.h>
31 #include "kstring.h"
32 #include "kseq.h"
33 KSTREAM_INIT(gzFile, gzread, 16384)
34
35 #define N_TMPSTR 5
36 #define LINE_LEN 60
37
38 // append a CIGAR operation plus length
39 #define write_cigar(_c, _n, _m, _v) do { \
40 if (_n == _m) { \
41 _m = _m? _m<<1 : 4; \
42 _c = realloc(_c, _m * sizeof(unsigned)); \
43 } \
44 _c[_n++] = (_v); \
45 } while (0)
46
47 // a fatal error
48 static void fatal(const char *msg)
49 {
50 fprintf(stderr, "E %s\n", msg);
51 exit(1);
52 }
53 // remove pads
54 static void remove_pads(const kstring_t *src, kstring_t *dst)
55 {
56 int i, j;
57 dst->l = 0;
58 kputsn(src->s, src->l, dst);
59 for (i = j = 0; i < dst->l; ++i)
60 if (dst->s[i] != '*') dst->s[j++] = dst->s[i];
61 dst->s[j] = 0;
62 dst->l = j;
63 }
64
65 int main(int argc, char *argv[])
66 {
67 gzFile fp;
68 kstream_t *ks;
69 kstring_t s, t[N_TMPSTR];
70 int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
71 long m_cigar = 0, n_cigar = 0;
72 unsigned *af, *cigar = 0;
73
74 while ((c = getopt(argc, argv, "pc")) >= 0) {
75 switch (c) {
76 case 'p': is_padded = 1; break;
77 case 'c': write_cns = 1; break;
78 }
79 }
80 if (argc == optind) {
81 fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
82 fprintf(stderr, "Options: -p output padded SAM\n");
83 fprintf(stderr, " -c write the contig sequence in SAM\n\n");
84 fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
85 fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n");
86 fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
87 fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n");
88 return 1;
89 }
90
91 s.l = s.m = 0; s.s = 0;
92 af_n = af_max = af_i = 0; af = 0;
93 for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
94 fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
95 ks = ks_init(fp);
96 while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
97 if (strcmp(s.s, "CO") == 0) { // contig sequence
98 kstring_t *cns;
99 t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
100 af_n = af_i = 0; // reset the af array
101 ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
102 ks_getuntil(ks, '\n', &s, &dret); // read the whole line
103 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
104 remove_pads(&t[1], &t[2]); // construct the unpadded sequence
105 // compute the array for mapping padded positions to unpadded positions
106 p2u = realloc(p2u, t[1].m * sizeof(int));
107 for (i = k = 0; i < t[1].l; ++i) {
108 p2u[i] = k;
109 if (t[1].s[i] != '*') ++k;
110 }
111 // write out the SAM header and contig sequences
112 fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
113 cns = &t[is_padded?1:2];
114 fprintf(stderr, "S >%s\n", t[0].s);
115 for (i = 0; i < cns->l; i += LINE_LEN) {
116 fputs("S ", stderr);
117 for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
118 fputc(cns->s[i + k], stderr);
119 fputc('\n', stderr);
120 }
121
122 #define __padded2cigar(sp) do { \
123 int i, l_M = 0, l_D = 0; \
124 for (i = 0; i < sp.l; ++i) { \
125 if (sp.s[i] == '*') { \
126 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
127 ++l_D; l_M = 0; \
128 } else { \
129 if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
130 ++l_M; l_D = 0; \
131 } \
132 } \
133 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
134 else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
135 } while (0)
136
137 if (write_cns) { // write the consensus SAM line (dummy read)
138 n_cigar = 0;
139 if (is_padded) __padded2cigar(t[1]);
140 else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
141 kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
142 for (i = 0; i < n_cigar; ++i) {
143 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
144 }
145 kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
146 }
147 } else if (strcmp(s.s, "BQ") == 0) { // contig quality
148 if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'");
149 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line
150 if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
151 for (i = 0; i < t[2].l; ++i) { // read the consensus quality
152 int q;
153 if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
154 if (s.l) {
155 q = atoi(s.s) + 33;
156 if (q > 126) q = 126;
157 if (write_cns) kputc(q, &t[4]);
158 } else --i;
159 }
160 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
161 ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
162 if (write_cns) puts(t[4].s); t[4].l = 0;
163 } else if (strcmp(s.s, "AF") == 0) { // padded read position
164 int reversed, neg, pos;
165 if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
166 if (write_cns) {
167 if (t[4].l) puts(t[4].s);
168 t[4].l = 0;
169 }
170 ks_getuntil(ks, 0, &s, &dret); // read name
171 ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand
172 ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position
173 if (af_n == af_max) { // double the af array
174 af_max = af_max? af_max<<1 : 4;
175 af = realloc(af, af_max * sizeof(unsigned));
176 }
177 af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information
178 } else if (strcmp(s.s, "RD") == 0) { // read sequence
179 if (af_i >= af_n) fatal("more 'RD' records than 'AF'");
180 t[2].l = t[3].l = t[4].l = 0;
181 ks_getuntil(ks, 0, &t[4], &dret); // QNAME
182 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line
183 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence
184 } else if (strcmp(s.s, "QA") == 0) { // clipping
185 if (af_i >= af_n) fatal("more 'QA' records than 'AF'");
186 int beg, end, pos, op;
187 ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
188 ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
189 ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end
190 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
191 // compute 1-based POS
192 pos = af[af_i]>>2; // retrieve the position information
193 if (af[af_i]>>1&1) pos = -pos;
194 pos += beg; // now pos is the true padded position
195 // generate CIGAR
196 remove_pads(&t[2], &t[3]); // backup the unpadded read sequence
197 n_cigar = 0;
198 if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
199 if (is_padded) {
200 __padded2cigar(t[2]);
201 if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR
202 if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR
203 } else {
204 // generate flattened CIGAR string
205 for (i = beg, k = pos - 1; i < end; ++i, ++k)
206 t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6);
207 // generate the proper CIGAR
208 for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) {
209 if (op != t[2].s[i]) {
210 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
211 op = t[2].s[i]; k = 1;
212 } else ++k;
213 }
214 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
215 // remove unnecessary "P" and possibly merge adjacent operations
216 for (i = 2; i < n_cigar; ++i) {
217 if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) {
218 cigar[i-1] = 0;
219 if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations
220 cigar[i] += cigar[i-2], cigar[i-2] = 0;
221 }
222 }
223 for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations
224 if (cigar[i]) cigar[k++] = cigar[i];
225 n_cigar = k;
226 }
227 if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4);
228 // write the SAM line for the read
229 kputc('\t', &t[4]); // QNAME has already been written
230 kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG
231 kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME
232 kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS
233 kputs("\t60\t", &t[4]); // MAPQ
234 for (i = 0; i < n_cigar; ++i) { // CIGAR
235 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
236 }
237 kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
238 kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
239 kputs("\t*", &t[4]); // QUAL
240 puts(t[4].s); // print to stdout
241 ++af_i;
242 } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
243 }
244 ks_destroy(ks);
245 gzclose(fp);
246 free(af); free(s.s); free(cigar); free(p2u);
247 for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
248 return 0;
249 }