Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/tools/bwa-0.5.7-mh/bwtmisc.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/tools/bwa-0.5.7-mh/bwtmisc.c Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,267 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li <lh3@sanger.ac.uk> */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "bntseq.h" +#include "utils.h" +#include "main.h" +#include "bwt.h" + +#ifdef _DIVBWT +#include "divsufsort.h" +#endif + +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) +{ + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + fseek(fp, -1, SEEK_END); + pac_len = ftell(fp); + fread(&c, 1, 1, fp); + fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + fread(buf2, 1, pac_size, fp); + fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) +{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * 4; // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += 4; + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +void bwa_pac_rev_core(const char *fn, const char *fn_rev) +{ + int64_t seq_len, i; + bwtint_t pac_len, j; + ubyte_t *bufin, *bufout, ct; + FILE *fp; + seq_len = bwa_seq_len(fn); + pac_len = (seq_len >> 2) + 1; + bufin = (ubyte_t*)calloc(pac_len, 1); + bufout = (ubyte_t*)calloc(pac_len, 1); + fp = xopen(fn, "rb"); + fread(bufin, 1, pac_len, fp); + fclose(fp); + for (i = seq_len - 1, j = 0; i >= 0; --i) { + int c = bufin[i>>2] >> ((~i&3)<<1) & 3; + bwtint_t j = seq_len - 1 - i; + bufout[j>>2] |= c << ((~j&3)<<1); + } + free(bufin); + fp = xopen(fn_rev, "wb"); + fwrite(bufout, 1, pac_len, fp); + ct = seq_len % 4; + fwrite(&ct, 1, 1, fp); + fclose(fp); + free(bufout); +} + +int bwa_pac_rev(int argc, char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n"); + return 1; + } + bwa_pac_rev_core(argv[1], argv[2]); + return 0; +} + +const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; + +/* this function is not memory efficient, but this will make life easier + Ideally we should also change .amb files as one 'N' in the nucleotide + sequence leads to two ambiguous colors. I may do this later... */ +uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) +{ + uint8_t *pac, *cspac; + bwtint_t i; + int c1, c2; + pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); + cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + rewind(bns->fp_pac); + c1 = pac[0]>>6; cspac[0] = c1<<6; + for (i = 1; i < bns->l_pac; ++i) { + c2 = pac[i>>2] >> (~i&3)*2 & 3; + cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2; + c1 = c2; + } + free(pac); + return cspac; +} + +int bwa_pac2cspac(int argc, char *argv[]) +{ + bntseq_t *bns; + uint8_t *cspac, ct; + char *str; + FILE *fp; + + if (argc < 3) { + fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n"); + return 1; + } + bns = bns_restore(argv[1]); + cspac = bwa_pac2cspac_core(bns); + bns_dump(bns, argv[2]); + // now write cspac + str = (char*)calloc(strlen(argv[2]) + 5, 1); + strcat(strcpy(str, argv[2]), ".pac"); + fp = xopen(str, "wb"); + fwrite(cspac, 1, bns->l_pac/4 + 1, fp); + ct = bns->l_pac % 4; + fwrite(&ct, 1, 1, fp); + fclose(fp); + bns_destroy(bns); + free(cspac); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +}