annotate PsiCLASS-1.0.2/samtools-0.1.19/misc/ace2sam.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 /* The MIT License
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 Copyright (c) 2011 Heng Li <lh3@live.co.uk>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 Permission is hereby granted, free of charge, to any person obtaining
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 a copy of this software and associated documentation files (the
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 "Software"), to deal in the Software without restriction, including
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 without limitation the rights to use, copy, modify, merge, publish,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9 distribute, sublicense, and/or sell copies of the Software, and to
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 permit persons to whom the Software is furnished to do so, subject to
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 the following conditions:
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 The above copyright notice and this permission notice shall be
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 included in all copies or substantial portions of the Software.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 SOFTWARE.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 */
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 #include <unistd.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 #include <zlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 #include "kstring.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 #include "kseq.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 KSTREAM_INIT(gzFile, gzread, 16384)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35 #define N_TMPSTR 5
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 #define LINE_LEN 60
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 // append a CIGAR operation plus length
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 #define write_cigar(_c, _n, _m, _v) do { \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 if (_n == _m) { \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 _m = _m? _m<<1 : 4; \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 _c = realloc(_c, _m * sizeof(unsigned)); \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 } \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 _c[_n++] = (_v); \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 } while (0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47 // a fatal error
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 static void fatal(const char *msg)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 fprintf(stderr, "E %s\n", msg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 exit(1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 // remove pads
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54 static void remove_pads(const kstring_t *src, kstring_t *dst)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 int i, j;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 dst->l = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 kputsn(src->s, src->l, dst);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 for (i = j = 0; i < dst->l; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 if (dst->s[i] != '*') dst->s[j++] = dst->s[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 dst->s[j] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 dst->l = j;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 int main(int argc, char *argv[])
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 gzFile fp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 kstream_t *ks;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 kstring_t s, t[N_TMPSTR];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 long m_cigar = 0, n_cigar = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 unsigned *af, *cigar = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 while ((c = getopt(argc, argv, "pc")) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 switch (c) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 case 'p': is_padded = 1; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 case 'c': write_cns = 1; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 if (argc == optind) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 fprintf(stderr, "Options: -p output padded SAM\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 fprintf(stderr, " -c write the contig sequence in SAM\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85 fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 s.l = s.m = 0; s.s = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 af_n = af_max = af_i = 0; af = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 ks = ks_init(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 if (strcmp(s.s, "CO") == 0) { // contig sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 kstring_t *cns;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 af_n = af_i = 0; // reset the af array
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 ks_getuntil(ks, '\n', &s, &dret); // read the whole line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 remove_pads(&t[1], &t[2]); // construct the unpadded sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 // compute the array for mapping padded positions to unpadded positions
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 p2u = realloc(p2u, t[1].m * sizeof(int));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 for (i = k = 0; i < t[1].l; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 p2u[i] = k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 if (t[1].s[i] != '*') ++k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 // write out the SAM header and contig sequences
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 cns = &t[is_padded?1:2];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 fprintf(stderr, "S >%s\n", t[0].s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 for (i = 0; i < cns->l; i += LINE_LEN) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 fputs("S ", stderr);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 fputc(cns->s[i + k], stderr);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 fputc('\n', stderr);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 #define __padded2cigar(sp) do { \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 int i, l_M = 0, l_D = 0; \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 for (i = 0; i < sp.l; ++i) { \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 if (sp.s[i] == '*') { \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 ++l_D; l_M = 0; \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 } else { \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 ++l_M; l_D = 0; \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 } \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 } \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 } while (0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 if (write_cns) { // write the consensus SAM line (dummy read)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 n_cigar = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 if (is_padded) __padded2cigar(t[1]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 for (i = 0; i < n_cigar; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 } else if (strcmp(s.s, "BQ") == 0) { // contig quality
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 for (i = 0; i < t[2].l; ++i) { // read the consensus quality
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152 int q;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 if (s.l) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155 q = atoi(s.s) + 33;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 if (q > 126) q = 126;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 if (write_cns) kputc(q, &t[4]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158 } else --i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 if (write_cns) puts(t[4].s); t[4].l = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 } else if (strcmp(s.s, "AF") == 0) { // padded read position
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 int reversed, neg, pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166 if (write_cns) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 if (t[4].l) puts(t[4].s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 t[4].l = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170 ks_getuntil(ks, 0, &s, &dret); // read name
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171 ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172 ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 if (af_n == af_max) { // double the af array
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174 af_max = af_max? af_max<<1 : 4;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175 af = realloc(af, af_max * sizeof(unsigned));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178 } else if (strcmp(s.s, "RD") == 0) { // read sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179 if (af_i >= af_n) fatal("more 'RD' records than 'AF'");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180 t[2].l = t[3].l = t[4].l = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181 ks_getuntil(ks, 0, &t[4], &dret); // QNAME
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184 } else if (strcmp(s.s, "QA") == 0) { // clipping
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185 if (af_i >= af_n) fatal("more 'QA' records than 'AF'");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186 int beg, end, pos, op;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187 ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189 ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 // compute 1-based POS
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192 pos = af[af_i]>>2; // retrieve the position information
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 if (af[af_i]>>1&1) pos = -pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194 pos += beg; // now pos is the true padded position
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195 // generate CIGAR
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196 remove_pads(&t[2], &t[3]); // backup the unpadded read sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 n_cigar = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198 if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199 if (is_padded) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200 __padded2cigar(t[2]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201 if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 } else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 // generate flattened CIGAR string
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205 for (i = beg, k = pos - 1; i < end; ++i, ++k)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206 t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 // generate the proper CIGAR
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209 if (op != t[2].s[i]) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 op = t[2].s[i]; k = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 } else ++k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 // remove unnecessary "P" and possibly merge adjacent operations
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 for (i = 2; i < n_cigar; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217 if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218 cigar[i-1] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
219 if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
220 cigar[i] += cigar[i-2], cigar[i-2] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
221 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
222 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
223 for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
224 if (cigar[i]) cigar[k++] = cigar[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
225 n_cigar = k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
226 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
227 if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
228 // write the SAM line for the read
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
229 kputc('\t', &t[4]); // QNAME has already been written
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
230 kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
231 kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232 kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
233 kputs("\t60\t", &t[4]); // MAPQ
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
234 for (i = 0; i < n_cigar; ++i) { // CIGAR
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
235 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
236 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
237 kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
238 kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
239 kputs("\t*", &t[4]); // QUAL
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
240 puts(t[4].s); // print to stdout
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
241 ++af_i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
242 } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
243 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
244 ks_destroy(ks);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
245 gzclose(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
246 free(af); free(s.s); free(cigar); free(p2u);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
247 for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
248 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
249 }