Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/ace2sam.c @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 /* The MIT License | |
| 2 | |
| 3 Copyright (c) 2011 Heng Li <lh3@live.co.uk> | |
| 4 | |
| 5 Permission is hereby granted, free of charge, to any person obtaining | |
| 6 a copy of this software and associated documentation files (the | |
| 7 "Software"), to deal in the Software without restriction, including | |
| 8 without limitation the rights to use, copy, modify, merge, publish, | |
| 9 distribute, sublicense, and/or sell copies of the Software, and to | |
| 10 permit persons to whom the Software is furnished to do so, subject to | |
| 11 the following conditions: | |
| 12 | |
| 13 The above copyright notice and this permission notice shall be | |
| 14 included in all copies or substantial portions of the Software. | |
| 15 | |
| 16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
| 17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| 18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
| 19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
| 20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
| 21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
| 22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 23 SOFTWARE. | |
| 24 */ | |
| 25 | |
| 26 #include <stdio.h> | |
| 27 #include <stdlib.h> | |
| 28 #include <string.h> | |
| 29 #include <unistd.h> | |
| 30 #include <zlib.h> | |
| 31 #include "kstring.h" | |
| 32 #include "kseq.h" | |
| 33 KSTREAM_INIT(gzFile, gzread, 16384) | |
| 34 | |
| 35 #define N_TMPSTR 5 | |
| 36 #define LINE_LEN 60 | |
| 37 | |
| 38 // append a CIGAR operation plus length | |
| 39 #define write_cigar(_c, _n, _m, _v) do { \ | |
| 40 if (_n == _m) { \ | |
| 41 _m = _m? _m<<1 : 4; \ | |
| 42 _c = realloc(_c, _m * sizeof(unsigned)); \ | |
| 43 } \ | |
| 44 _c[_n++] = (_v); \ | |
| 45 } while (0) | |
| 46 | |
| 47 // a fatal error | |
| 48 static void fatal(const char *msg) | |
| 49 { | |
| 50 fprintf(stderr, "E %s\n", msg); | |
| 51 exit(1); | |
| 52 } | |
| 53 // remove pads | |
| 54 static void remove_pads(const kstring_t *src, kstring_t *dst) | |
| 55 { | |
| 56 int i, j; | |
| 57 dst->l = 0; | |
| 58 kputsn(src->s, src->l, dst); | |
| 59 for (i = j = 0; i < dst->l; ++i) | |
| 60 if (dst->s[i] != '*') dst->s[j++] = dst->s[i]; | |
| 61 dst->s[j] = 0; | |
| 62 dst->l = j; | |
| 63 } | |
| 64 | |
| 65 int main(int argc, char *argv[]) | |
| 66 { | |
| 67 gzFile fp; | |
| 68 kstream_t *ks; | |
| 69 kstring_t s, t[N_TMPSTR]; | |
| 70 int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; | |
| 71 long m_cigar = 0, n_cigar = 0; | |
| 72 unsigned *af, *cigar = 0; | |
| 73 | |
| 74 while ((c = getopt(argc, argv, "pc")) >= 0) { | |
| 75 switch (c) { | |
| 76 case 'p': is_padded = 1; break; | |
| 77 case 'c': write_cns = 1; break; | |
| 78 } | |
| 79 } | |
| 80 if (argc == optind) { | |
| 81 fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n"); | |
| 82 fprintf(stderr, "Options: -p output padded SAM\n"); | |
| 83 fprintf(stderr, " -c write the contig sequence in SAM\n\n"); | |
| 84 fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); | |
| 85 fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); | |
| 86 fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); | |
| 87 fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); | |
| 88 return 1; | |
| 89 } | |
| 90 | |
| 91 s.l = s.m = 0; s.s = 0; | |
| 92 af_n = af_max = af_i = 0; af = 0; | |
| 93 for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; | |
| 94 fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); | |
| 95 ks = ks_init(fp); | |
| 96 while (ks_getuntil(ks, 0, &s, &dret) >= 0) { | |
| 97 if (strcmp(s.s, "CO") == 0) { // contig sequence | |
| 98 kstring_t *cns; | |
| 99 t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line | |
| 100 af_n = af_i = 0; // reset the af array | |
| 101 ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name | |
| 102 ks_getuntil(ks, '\n', &s, &dret); // read the whole line | |
| 103 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence | |
| 104 remove_pads(&t[1], &t[2]); // construct the unpadded sequence | |
| 105 // compute the array for mapping padded positions to unpadded positions | |
| 106 p2u = realloc(p2u, t[1].m * sizeof(int)); | |
| 107 for (i = k = 0; i < t[1].l; ++i) { | |
| 108 p2u[i] = k; | |
| 109 if (t[1].s[i] != '*') ++k; | |
| 110 } | |
| 111 // write out the SAM header and contig sequences | |
| 112 fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line | |
| 113 cns = &t[is_padded?1:2]; | |
| 114 fprintf(stderr, "S >%s\n", t[0].s); | |
| 115 for (i = 0; i < cns->l; i += LINE_LEN) { | |
| 116 fputs("S ", stderr); | |
| 117 for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) | |
| 118 fputc(cns->s[i + k], stderr); | |
| 119 fputc('\n', stderr); | |
| 120 } | |
| 121 | |
| 122 #define __padded2cigar(sp) do { \ | |
| 123 int i, l_M = 0, l_D = 0; \ | |
| 124 for (i = 0; i < sp.l; ++i) { \ | |
| 125 if (sp.s[i] == '*') { \ | |
| 126 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ | |
| 127 ++l_D; l_M = 0; \ | |
| 128 } else { \ | |
| 129 if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ | |
| 130 ++l_M; l_D = 0; \ | |
| 131 } \ | |
| 132 } \ | |
| 133 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ | |
| 134 else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ | |
| 135 } while (0) | |
| 136 | |
| 137 if (write_cns) { // write the consensus SAM line (dummy read) | |
| 138 n_cigar = 0; | |
| 139 if (is_padded) __padded2cigar(t[1]); | |
| 140 else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); | |
| 141 kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]); | |
| 142 for (i = 0; i < n_cigar; ++i) { | |
| 143 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); | |
| 144 } | |
| 145 kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); | |
| 146 } | |
| 147 } else if (strcmp(s.s, "BQ") == 0) { // contig quality | |
| 148 if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'"); | |
| 149 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line | |
| 150 if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*" | |
| 151 for (i = 0; i < t[2].l; ++i) { // read the consensus quality | |
| 152 int q; | |
| 153 if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n"); | |
| 154 if (s.l) { | |
| 155 q = atoi(s.s) + 33; | |
| 156 if (q > 126) q = 126; | |
| 157 if (write_cns) kputc(q, &t[4]); | |
| 158 } else --i; | |
| 159 } | |
| 160 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); | |
| 161 ks_getuntil(ks, '\n', &s, &dret); // skip the empty line | |
| 162 if (write_cns) puts(t[4].s); t[4].l = 0; | |
| 163 } else if (strcmp(s.s, "AF") == 0) { // padded read position | |
| 164 int reversed, neg, pos; | |
| 165 if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); | |
| 166 if (write_cns) { | |
| 167 if (t[4].l) puts(t[4].s); | |
| 168 t[4].l = 0; | |
| 169 } | |
| 170 ks_getuntil(ks, 0, &s, &dret); // read name | |
| 171 ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand | |
| 172 ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position | |
| 173 if (af_n == af_max) { // double the af array | |
| 174 af_max = af_max? af_max<<1 : 4; | |
| 175 af = realloc(af, af_max * sizeof(unsigned)); | |
| 176 } | |
| 177 af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information | |
| 178 } else if (strcmp(s.s, "RD") == 0) { // read sequence | |
| 179 if (af_i >= af_n) fatal("more 'RD' records than 'AF'"); | |
| 180 t[2].l = t[3].l = t[4].l = 0; | |
| 181 ks_getuntil(ks, 0, &t[4], &dret); // QNAME | |
| 182 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line | |
| 183 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence | |
| 184 } else if (strcmp(s.s, "QA") == 0) { // clipping | |
| 185 if (af_i >= af_n) fatal("more 'QA' records than 'AF'"); | |
| 186 int beg, end, pos, op; | |
| 187 ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping | |
| 188 ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start | |
| 189 ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end | |
| 190 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); | |
| 191 // compute 1-based POS | |
| 192 pos = af[af_i]>>2; // retrieve the position information | |
| 193 if (af[af_i]>>1&1) pos = -pos; | |
| 194 pos += beg; // now pos is the true padded position | |
| 195 // generate CIGAR | |
| 196 remove_pads(&t[2], &t[3]); // backup the unpadded read sequence | |
| 197 n_cigar = 0; | |
| 198 if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4); | |
| 199 if (is_padded) { | |
| 200 __padded2cigar(t[2]); | |
| 201 if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR | |
| 202 if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR | |
| 203 } else { | |
| 204 // generate flattened CIGAR string | |
| 205 for (i = beg, k = pos - 1; i < end; ++i, ++k) | |
| 206 t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6); | |
| 207 // generate the proper CIGAR | |
| 208 for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) { | |
| 209 if (op != t[2].s[i]) { | |
| 210 write_cigar(cigar, n_cigar, m_cigar, k<<4|op); | |
| 211 op = t[2].s[i]; k = 1; | |
| 212 } else ++k; | |
| 213 } | |
| 214 write_cigar(cigar, n_cigar, m_cigar, k<<4|op); | |
| 215 // remove unnecessary "P" and possibly merge adjacent operations | |
| 216 for (i = 2; i < n_cigar; ++i) { | |
| 217 if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) { | |
| 218 cigar[i-1] = 0; | |
| 219 if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations | |
| 220 cigar[i] += cigar[i-2], cigar[i-2] = 0; | |
| 221 } | |
| 222 } | |
| 223 for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations | |
| 224 if (cigar[i]) cigar[k++] = cigar[i]; | |
| 225 n_cigar = k; | |
| 226 } | |
| 227 if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4); | |
| 228 // write the SAM line for the read | |
| 229 kputc('\t', &t[4]); // QNAME has already been written | |
| 230 kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG | |
| 231 kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME | |
| 232 kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS | |
| 233 kputs("\t60\t", &t[4]); // MAPQ | |
| 234 for (i = 0; i < n_cigar; ++i) { // CIGAR | |
| 235 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); | |
| 236 } | |
| 237 kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN | |
| 238 kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ | |
| 239 kputs("\t*", &t[4]); // QUAL | |
| 240 puts(t[4].s); // print to stdout | |
| 241 ++af_i; | |
| 242 } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); | |
| 243 } | |
| 244 ks_destroy(ks); | |
| 245 gzclose(fp); | |
| 246 free(af); free(s.s); free(cigar); free(p2u); | |
| 247 for (i = 0; i < N_TMPSTR; ++i) free(t[i].s); | |
| 248 return 0; | |
| 249 } |
