diff ezBAMQC/src/htslib/tabix.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ezBAMQC/src/htslib/tabix.c	Thu Mar 24 17:12:52 2016 -0400
@@ -0,0 +1,519 @@
+/*  tabix.c -- Generic indexer for TAB-delimited genome position files.
+
+    Copyright (C) 2009-2011 Broad Institute.
+    Copyright (C) 2010-2012, 2014 Genome Research Ltd.
+
+    Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include "htslib/tbx.h"
+#include "htslib/sam.h"
+#include "htslib/vcf.h"
+#include "htslib/kseq.h"
+#include "htslib/bgzf.h"
+#include "htslib/hts.h"
+#include "htslib/regidx.h"
+
+typedef struct
+{
+    char *regions_fname, *targets_fname;
+    int print_header, header_only;
+}
+args_t;
+
+static void error(const char *format, ...)
+{
+    va_list ap;
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+    exit(EXIT_FAILURE);
+}
+
+#define IS_GFF  (1<<0)
+#define IS_BED  (1<<1)
+#define IS_SAM  (1<<2)
+#define IS_VCF  (1<<3)
+#define IS_BCF  (1<<4)
+#define IS_BAM  (1<<5)
+#define IS_CRAM (1<<6)
+#define IS_TXT  (IS_GFF|IS_BED|IS_SAM|IS_VCF)
+
+int file_type(const char *fname)
+{
+    int l = strlen(fname);
+    int strcasecmp(const char *s1, const char *s2);
+    if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
+    else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
+    else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
+    else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
+    else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
+    else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
+    else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
+
+    htsFile *fp = hts_open(fname,"r");
+    enum htsExactFormat format = fp->format.format;
+    hts_close(fp);
+    if ( format == bcf ) return IS_BCF;
+    if ( format == bam ) return IS_BAM;
+    if ( format == cram ) return IS_CRAM;
+    if ( format == vcf ) return IS_VCF;
+
+    return 0;
+}
+
+static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
+{
+    kstring_t str = {0,0,0};
+    int iseq = 0, ireg = 0;
+    char **regs = NULL;
+    *nregs = argc;
+
+    if ( regions_fname )
+    {
+        // improve me: this is a too heavy machinery for parsing regions...
+
+        regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
+        if ( !idx ) error("Could not read %s\n", regions_fname);
+
+        (*nregs) += regidx_nregs(idx);
+        regs = (char**) malloc(sizeof(char*)*(*nregs));
+
+        int nseq;
+        char **seqs = regidx_seq_names(idx, &nseq);
+        for (iseq=0; iseq<nseq; iseq++)
+        {
+            regitr_t itr;
+            regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
+            while ( itr.i < itr.n )
+            {
+                str.l = 0;
+                ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
+                regs[ireg++] = strdup(str.s);
+                itr.i++;
+            }
+        }
+        regidx_destroy(idx);
+    }
+    free(str.s);
+
+    if ( !ireg )
+    {
+        if ( argc )
+            regs = (char**) malloc(sizeof(char*)*argc);
+        else
+        {
+            regs = (char**) malloc(sizeof(char*));
+            regs[0] = strdup(".");
+            *nregs = 1;
+        }
+    }
+
+    for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
+    return regs;
+}
+static int query_regions(args_t *args, char *fname, char **regs, int nregs)
+{
+    int i;
+    htsFile *fp = hts_open(fname,"r");
+    if ( !fp ) error("Could not read %s\n", fname);
+    enum htsExactFormat format = hts_get_format(fp)->format;
+
+    regidx_t *reg_idx = NULL;
+    if ( args->targets_fname )
+    {
+        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
+        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
+    }
+
+    if ( format == bcf )
+    {
+        htsFile *out = hts_open("-","w");
+        if ( !out ) error("Could not open stdout\n", fname);
+        hts_idx_t *idx = bcf_index_load(fname);
+        if ( !idx ) error("Could not load .csi index of %s\n", fname);
+        bcf_hdr_t *hdr = bcf_hdr_read(fp);
+        if ( !hdr ) error("Could not read the header: %s\n", fname);
+        if ( args->print_header )
+            bcf_hdr_write(out,hdr);
+        if ( !args->header_only )
+        {
+            bcf1_t *rec = bcf_init();
+            for (i=0; i<nregs; i++)
+            {
+                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
+                while ( bcf_itr_next(fp, itr, rec) >=0 )
+                {
+                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue; 
+                    bcf_write(out,hdr,rec);
+                }
+                tbx_itr_destroy(itr);
+            }
+            bcf_destroy(rec);
+        }
+        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
+        bcf_hdr_destroy(hdr);
+        hts_idx_destroy(idx);
+    }
+    else if ( format==vcf || format==sam || format==unknown_format )
+    {
+        tbx_t *tbx = tbx_index_load(fname);
+        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
+        kstring_t str = {0,0,0};
+        if ( args->print_header )
+        {
+            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
+            {
+                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
+                puts(str.s);
+            }
+        }
+        if ( !args->header_only )
+        {
+            int nseq;
+            const char **seq = NULL;
+            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
+            for (i=0; i<nregs; i++)
+            {
+                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
+                if ( !itr ) continue;
+                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
+                {
+                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
+                    puts(str.s);
+                }
+                tbx_itr_destroy(itr);
+            }
+            free(seq);
+        }
+        free(str.s);
+        tbx_destroy(tbx);
+    }
+    else if ( format==bam )
+        error("Please use \"samtools view\" for querying BAM files.\n");
+
+    if ( reg_idx ) regidx_destroy(reg_idx);
+    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
+
+    for (i=0; i<nregs; i++) free(regs[i]);
+    free(regs);
+    return 0;
+}
+static int query_chroms(char *fname)
+{
+    const char **seq;
+    int i, nseq, ftype = file_type(fname);
+    if ( ftype & IS_TXT || !ftype )
+    {
+        tbx_t *tbx = tbx_index_load(fname);
+        if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
+        seq = tbx_seqnames(tbx, &nseq);
+        for (i=0; i<nseq; i++)
+            printf("%s\n", seq[i]);
+        free(seq);
+        tbx_destroy(tbx);
+    }
+    else if ( ftype==IS_BCF )
+    {
+        htsFile *fp = hts_open(fname,"r");
+        if ( !fp ) error("Could not read %s\n", fname);
+        bcf_hdr_t *hdr = bcf_hdr_read(fp);
+        if ( !hdr ) error("Could not read the header: %s\n", fname);
+        hts_close(fp);
+        hts_idx_t *idx = bcf_index_load(fname);
+        if ( !idx ) error("Could not load .csi index of %s\n", fname);
+        seq = bcf_index_seqnames(idx, hdr, &nseq);
+        for (i=0; i<nseq; i++)
+            printf("%s\n", seq[i]);
+        free(seq);
+        bcf_hdr_destroy(hdr);
+        hts_idx_destroy(idx);
+    }
+    else if ( ftype==IS_BAM )   // todo: BAM
+        error("BAM: todo\n");
+    return 0;
+}
+
+int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
+{
+    if ( ftype & IS_TXT || !ftype )
+    {
+        BGZF *fp = bgzf_open(fname,"r");
+        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
+
+        char *buffer = fp->uncompressed_block;
+        int skip_until = 0;
+
+        // Skip the header: find out the position of the data block
+        if ( buffer[0]==conf->meta_char )
+        {
+            skip_until = 1;
+            while (1)
+            {
+                if ( buffer[skip_until]=='\n' )
+                {
+                    skip_until++;
+                    if ( skip_until>=fp->block_length )
+                    {
+                        if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
+                        skip_until = 0;
+                    }
+                    // The header has finished
+                    if ( buffer[skip_until]!=conf->meta_char ) break;
+                }
+                skip_until++;
+                if ( skip_until>=fp->block_length )
+                {
+                    if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
+                    skip_until = 0;
+                }
+            }
+        }
+
+        // Output the new header
+        FILE *hdr  = fopen(header,"r");
+        if ( !hdr ) error("%s: %s", header,strerror(errno));
+        int page_size = getpagesize();
+        char *buf = valloc(page_size);
+        BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
+        ssize_t nread;
+        while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
+        {
+            if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
+            if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
+        }
+        if ( fclose(hdr) ) error("close failed: %s\n", header);
+
+        // Output all remainig data read with the header block
+        if ( fp->block_length - skip_until > 0 )
+        {
+            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
+        }
+        if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
+
+        while (1)
+        {
+            nread = bgzf_raw_read(fp, buf, page_size);
+            if ( nread<=0 ) break;
+
+            int count = bgzf_raw_write(bgzf_out, buf, nread);
+            if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
+        }
+        if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
+        if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
+    }
+    else
+        error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
+    return 0;
+}
+
+static int usage(void)
+{
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Version: %s\n", hts_version());
+    fprintf(stderr, "Usage:   tabix [OPTIONS] [FILE] [REGION [...]]\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Indexing Options:\n");
+    fprintf(stderr, "   -0, --zero-based           coordinates are zero-based\n");
+    fprintf(stderr, "   -b, --begin INT            column number for region start [4]\n");
+    fprintf(stderr, "   -c, --comment CHAR         skip comment lines starting with CHAR [null]\n");
+    fprintf(stderr, "   -C, --csi                  generate CSI index for VCF (default is TBI)\n");
+    fprintf(stderr, "   -e, --end INT              column number for region end (if no end, set INT to -b) [5]\n");
+    fprintf(stderr, "   -f, --force                overwrite existing index without asking\n");
+    fprintf(stderr, "   -m, --min-shift INT        set minimal interval size for CSI indices to 2^INT [14]\n");
+    fprintf(stderr, "   -p, --preset STR           gff, bed, sam, vcf\n");
+    fprintf(stderr, "   -s, --sequence INT         column number for sequence names (suppressed by -p) [1]\n");
+    fprintf(stderr, "   -S, --skip-lines INT       skip first INT lines [0]\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Querying and other options:\n");
+    fprintf(stderr, "   -h, --print-header         print also the header lines\n");
+    fprintf(stderr, "   -H, --only-header          print only the header lines\n");
+    fprintf(stderr, "   -l, --list-chroms          list chromosome names\n");
+    fprintf(stderr, "   -r, --reheader FILE        replace the header with the content of FILE\n");
+    fprintf(stderr, "   -R, --regions FILE         restrict to regions listed in the file\n");
+    fprintf(stderr, "   -T, --targets FILE         similar to -R but streams rather than index-jumps\n");
+    fprintf(stderr, "\n");
+    return 1;
+}
+
+int main(int argc, char *argv[])
+{
+    int c, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
+    tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+    char *reheader = NULL;
+    args_t args;
+    memset(&args,0,sizeof(args_t));
+
+    static struct option loptions[] =
+    {
+        {"help",0,0,'h'},
+        {"regions",1,0,'R'},
+        {"targets",1,0,'T'},
+        {"csi",0,0,'C'},
+        {"zero-based",0,0,'0'},
+        {"print-header",0,0,'h'},
+        {"only-header",0,0,'H'},
+        {"begin",1,0,'b'},
+        {"comment",1,0,'c'},
+        {"end",1,0,'e'},
+        {"force",0,0,'f'},
+        {"preset",1,0,'p'},
+        {"sequence",1,0,'s'},
+        {"skip-lines",1,0,'S'},
+        {"list-chroms",0,0,'l'},
+        {"reheader",1,0,'r'},
+        {0,0,0,0}
+    };
+
+    while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
+    {
+        switch (c)
+        {
+            case 'R': args.regions_fname = optarg; break;
+            case 'T': args.targets_fname = optarg; break;
+            case 'C': do_csi = 1; break;
+            case 'r': reheader = optarg; break;
+            case 'h': args.print_header = 1; break;
+            case 'H': args.header_only = 1; break;
+            case 'l': list_chroms = 1; break;
+            case '0': conf.preset |= TBX_UCSC; break;
+            case 'b': conf.bc = atoi(optarg); break;
+            case 'e': conf.ec = atoi(optarg); break;
+            case 'c': conf.meta_char = *optarg; break;
+            case 'f': is_force = 1; break;
+            case 'm': min_shift = atoi(optarg); break;
+            case 'p':
+                      if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
+                      else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
+                      else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
+                      else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+                      else if (strcmp(optarg, "bcf") == 0) ;    // bcf is autodetected, preset is not needed
+                      else if (strcmp(optarg, "bam") == 0) ;    // same as bcf
+                      else error("The preset string not recognised: '%s'\n", optarg);
+                      break;
+            case 's': conf.sc = atoi(optarg); break;
+            case 'S': conf.line_skip = atoi(optarg); break;
+            default: return usage();
+        }
+    }
+
+    if ( optind==argc ) return usage();
+
+    if ( list_chroms )
+        return query_chroms(argv[optind]);
+
+    if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
+    {
+        int nregs = 0;
+        char **regs = NULL;
+        if ( !args.header_only )
+            regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
+        return query_regions(&args, argv[optind], regs, nregs);
+    }
+
+    char *fname = argv[optind];
+    int ftype = file_type(fname);
+    if ( !conf_ptr )    // no preset given
+    {
+        if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff;
+        else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed;
+        else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam;
+        else if ( ftype==IS_VCF )
+        {
+            conf_ptr = &tbx_conf_vcf;
+            if ( !min_shift && do_csi ) min_shift = 14;
+        }
+        else if ( ftype==IS_BCF )
+        {
+            if ( !min_shift ) min_shift = 14;
+        }
+        else if ( ftype==IS_BAM )
+        {
+            if ( !min_shift ) min_shift = 14;
+        }
+    }
+    if ( do_csi )
+    {
+        if ( !min_shift ) min_shift = 14;
+        min_shift *= do_csi;  // positive for CSIv2, negative for CSIv1
+    }
+    if ( min_shift!=0 && !do_csi ) do_csi = 1;
+
+    if ( reheader )
+        return reheader_file(fname, reheader, ftype, conf_ptr);
+
+    if ( conf_ptr )
+        conf = *conf_ptr;
+
+    char *suffix = ".tbi";
+    if ( do_csi ) suffix = ".csi";
+    else if ( ftype==IS_BAM ) suffix = ".bai";
+    else if ( ftype==IS_CRAM ) suffix = ".crai";
+
+    char *idx_fname = calloc(strlen(fname) + 5, 1);
+    strcat(strcpy(idx_fname, fname), suffix);
+
+    struct stat stat_tbi, stat_file;
+    if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
+    {
+        // Before complaining about existing index, check if the VCF file isn't
+        // newer. This is a common source of errors, people tend not to notice
+        // that tabix failed
+        stat(fname, &stat_file);
+        if ( stat_file.st_mtime <= stat_tbi.st_mtime )
+            error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
+    }
+    free(idx_fname);
+
+    if ( ftype==IS_CRAM )
+    {
+        if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
+        return 0;
+    }
+    else if ( do_csi )
+    {
+        if ( ftype==IS_BCF )
+        {
+            if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
+            return 0;
+        }
+        if ( ftype==IS_BAM )
+        {
+            if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
+            return 0;
+        }
+        if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
+        return 0;
+    }
+    else    // TBI index
+    {
+        if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
+        return 0;
+    }
+    return 0;
+}