annotate ezBAMQC/src/htslib/synced_bcf_reader.c @ 8:82bb8c455761

Uploaded
author cshl-bsr
date Wed, 30 Mar 2016 12:03:10 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /* synced_bcf_reader.c -- stream through multiple VCF files.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Copyright (C) 2012-2014 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5 Author: Petr Danecek <pd3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 DEALINGS IN THE SOFTWARE. */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25 #include <stdio.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 #include <unistd.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 #include <string.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #include <limits.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #include <errno.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30 #include <ctype.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #include <sys/stat.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 #include "htslib/synced_bcf_reader.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 #include "htslib/kseq.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 #include "htslib/khash_str2int.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 typedef struct
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 uint32_t start, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42 region1_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 typedef struct _region_t
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 region1_t *regs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 int nregs, mregs, creg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 region_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52 static bcf_sr_regions_t *_regions_init_string(const char *str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 char *bcf_sr_strerror(int errnum)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 switch (errnum)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 case open_failed:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 return strerror(errno); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 case not_bgzf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 return "not compressed with bgzip"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 case idx_load_failed:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 return "could not load index"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65 case file_type_error:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 return "unknown file type"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 case api_usage_error:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 return "API usage error"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 case header_error:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70 return "could not parse header"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 default: return "";
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 const char *tmp = filters, *prev = filters;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 int nout = 0, *out = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80 while ( 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 if ( *tmp==',' || !*tmp )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 out = (int*) realloc(out, (nout+1)*sizeof(int));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 if ( tmp-prev==1 && *prev=='.' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 out[nout] = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 str.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 kputsn(prev, tmp-prev, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 nout++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 if ( !*tmp ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 prev = tmp+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 tmp++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 if ( str.m ) free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 *nfilters = nout;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 return out;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 assert( !readers->regions );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 if ( readers->nreaders )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 fprintf(stderr,"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 if ( !readers->regions ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 readers->explicit_regs = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 readers->require_index = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 assert( !readers->targets );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 if ( targets[0]=='^' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 readers->targets_exclude = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 targets++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 if ( !readers->targets ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 readers->targets_als = alleles;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 htsFile* file_ptr = hts_open(fname, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 if ( ! file_ptr ) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 files->errnum = open_failed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 files->has_line[files->nreaders] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 bcf_sr_t *reader = &files->readers[files->nreaders++];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 memset(reader,0,sizeof(bcf_sr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 reader->file = file_ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 files->errnum = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 if ( files->require_index )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 if ( reader->file->format.format==vcf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 if ( reader->file->format.compression!=bgzf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 files->errnum = not_bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 reader->tbx_idx = tbx_index_load(fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 if ( !reader->tbx_idx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 files->errnum = idx_load_failed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 reader->header = bcf_hdr_read(reader->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 else if ( reader->file->format.format==bcf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 if ( reader->file->format.compression!=bgzf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 files->errnum = not_bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 reader->header = bcf_hdr_read(reader->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 reader->bcf_idx = bcf_index_load(fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 if ( !reader->bcf_idx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 files->errnum = idx_load_failed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188 files->errnum = file_type_error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194 if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 reader->header = bcf_hdr_read(reader->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 files->errnum = file_type_error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 files->streaming = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 if ( files->streaming && files->nreaders>1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207 files->errnum = api_usage_error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 if ( files->streaming && files->regions )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 files->errnum = api_usage_error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 if ( !reader->header )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 files->errnum = header_error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 reader->fname = fname;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 if ( files->apply_filters )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 // Update list of chromosomes
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 if ( !files->explicit_regs && !files->streaming )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 int n,i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 for (i=0; i<n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 if ( !files->regions )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 files->regions = _regions_init_string(names[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 _regions_add(files->regions, names[i], -1, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 free(names);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245 bcf_srs_t *bcf_sr_init(void)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 return files;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251 static void bcf_sr_destroy1(bcf_sr_t *reader)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 bcf_hdr_destroy(reader->header);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 hts_close(reader->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 if ( reader->itr ) tbx_itr_destroy(reader->itr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 for (j=0; j<reader->mbuffer; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 bcf_destroy1(reader->buffer[j]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 free(reader->buffer);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262 free(reader->samples);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 free(reader->filter_ids);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 void bcf_sr_destroy(bcf_srs_t *files)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269 bcf_sr_destroy1(&files->readers[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 free(files->has_line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271 free(files->readers);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272 for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 free(files->samples);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 if (files->targets) bcf_sr_regions_destroy(files->targets);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 if (files->regions) bcf_sr_regions_destroy(files->regions);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 if ( files->tmps.m ) free(files->tmps.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 free(files);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 void bcf_sr_remove_reader(bcf_srs_t *files, int i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 assert( !files->samples ); // not ready for this yet
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 bcf_sr_destroy1(&files->readers[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 if ( i+1 < files->nreaders )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286 memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289 files->nreaders--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 Removes duplicate records from the buffer. The meaning of "duplicate" is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 controlled by the $collapse variable, which can cause that from multiple
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 <indel|snp|any> lines only the first is considered and the rest is ignored.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 The removal is done by setting the redundant lines' positions to -1 and
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 moving these lines at the end of the buffer.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300 static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 int irec,jrec, has_snp=0, has_indel=0, has_any=0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 for (irec=1; irec<=reader->nbuffer; irec++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 bcf1_t *line = reader->buffer[irec];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306 if ( line->pos != reader->buffer[1]->pos ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307 if ( files->collapse&COLLAPSE_ANY )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 if ( !has_any ) has_any = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 else line->pos = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 int line_type = bcf_get_variant_types(line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315 if ( !has_snp ) has_snp = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 else line->pos = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 if ( !has_indel ) has_indel = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321 else line->pos = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 bcf1_t *tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325 irec = jrec = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 while ( irec<=reader->nbuffer && jrec<=reader->nbuffer )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 if ( jrec<=irec ) jrec = irec+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330 while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 if ( jrec<=reader->nbuffer )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 reader->nbuffer = irec - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
339 void debug_buffer(FILE *fp, bcf_sr_t *reader)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
340 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
341 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
342 for (j=0; j<=reader->nbuffer; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
343 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
344 bcf1_t *line = reader->buffer[j];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
345 fprintf(fp,"%s%s\t%s:%d\t%s ", reader->fname,j==0?"*":"",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:"");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
346 int k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
347 for (k=1; k<line->n_allele; k++) fprintf(fp," %s", line->d.allele[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
348 fprintf(fp,"\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
349 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
350 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
351
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
352 void debug_buffers(FILE *fp, bcf_srs_t *files)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
353 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
354 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
355 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
356 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
357 fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,i),files->readers[i].fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
358 debug_buffer(fp, &files->readers[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
359 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
360 fprintf(fp,"\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
361 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
362
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
363 static inline int has_filter(bcf_sr_t *reader, bcf1_t *line)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
364 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
365 int i, j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
366 if ( !line->d.n_flt )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
367 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
368 for (j=0; j<reader->nfilter_ids; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
369 if ( reader->filter_ids[j]<0 ) return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
370 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
371 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
372 for (i=0; i<line->d.n_flt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
373 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
374 for (j=0; j<reader->nfilter_ids; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
375 if ( line->d.flt[i]==reader->filter_ids[j] ) return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
376 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
377 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
378 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
379
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
380 static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
381 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
382 if ( end>=MAX_CSI_COOR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
383 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
384 fprintf(stderr,"The coordinate is out of csi index limit: %d\n", end+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
385 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
386 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
387 if ( reader->itr )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
388 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
389 hts_itr_destroy(reader->itr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
390 reader->itr = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
391 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
392 reader->nbuffer = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
393 if ( reader->tbx_idx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
394 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
395 int tid = tbx_name2id(reader->tbx_idx, seq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
396 if ( tid==-1 ) return -1; // the sequence not present in this file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
397 reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
398 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
399 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
400 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
401 int tid = bcf_hdr_name2id(reader->header, seq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
402 if ( tid==-1 ) return -1; // the sequence not present in this file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
403 reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
404 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
405 assert(reader->itr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
406 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
407 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
408
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
409 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
410 * _readers_next_region() - jumps to next region if necessary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
411 * Returns 0 on success or -1 when there are no more regions left
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
412 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
413 static int _readers_next_region(bcf_srs_t *files)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
414 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
415 // Need to open new chromosome? Check number of lines in all readers' buffers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
416 int i, eos = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
417 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
418 if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
419
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
420 if ( eos!=files->nreaders )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
421 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
422 // Some of the readers still has buffered lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
423 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
424 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
425
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
426 // No lines in the buffer, need to open new region or quit
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
427 if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
428
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
429 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
430 _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
431
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
432 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
433 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
434
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
435 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
436 * _reader_fill_buffer() - buffers all records with the same coordinate
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
437 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
438 static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
439 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
440 // Return if the buffer is full: the coordinate of the last buffered record differs
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
441 if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
442
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
443 // No iterator (sequence not present in this file) and not streaming
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
444 if ( !reader->itr && !files->streaming ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
445
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
446 // Fill the buffer with records starting at the same position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
447 int i, ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
448 while (1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
449 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
450 if ( reader->nbuffer+1 >= reader->mbuffer )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
451 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
452 // Increase buffer size
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
453 reader->mbuffer += 8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
454 reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
455 for (i=8; i>0; i--) // initialize
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
456 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
457 reader->buffer[reader->mbuffer-i] = bcf_init1();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
458 reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
459 reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
460 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
461 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
462 if ( files->streaming )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
463 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
464 if ( reader->file->format.format==vcf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
465 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
466 if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
467 int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
468 if ( ret<0 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
469 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
470 else if ( reader->file->format.format==bcf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
471 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
472 if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
473 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
474 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
475 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
476 fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
477 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
478 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
479 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
480 else if ( reader->tbx_idx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
481 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
482 if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
483 vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
484 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
485 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
486 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
487 if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
488 bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
489 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
490
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
491 // apply filter
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
492 if ( !reader->nfilter_ids )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
493 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
494 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
495 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
496 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
497 if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
498 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
499 reader->nbuffer++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
500
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
501 if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
502 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
503 if ( ret<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
504 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
505 // done for this region
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
506 tbx_itr_destroy(reader->itr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
507 reader->itr = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
508 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
509 if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
510 collapse_buffer(files, reader);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
511 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
512
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
513 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
514 * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
515 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
516 static void _reader_shift_buffer(bcf_sr_t *reader)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
517 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
518 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
519 for (i=2; i<=reader->nbuffer; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
520 if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
521 if ( i<=reader->nbuffer )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
522 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
523 // A record with a different position follows, swap it. Because of the reader's logic,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
524 // only one such line can be present.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
525 bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
526 reader->nbuffer = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
527 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
528 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
529 reader->nbuffer = 0; // no other line
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
530 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
531
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
532 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
533 * _reader_match_alleles() - from multiple buffered lines selects the one which
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
534 * corresponds best to the template line. The logic is controlled by COLLAPSE_*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
535 * Returns 0 on success or -1 when no good matching line is found.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
536 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
537 static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
538 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
539 int i, irec = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
540
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
541 // if no template given, use the first available record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
542 if ( !tmpl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
543 irec = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
544 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
545 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
546 int tmpl_type = bcf_get_variant_types(tmpl);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
547 for (i=1; i<=reader->nbuffer; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
548 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
549 bcf1_t *line = reader->buffer[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
550 if ( line->pos != reader->buffer[1]->pos ) break; // done with this reader
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
551
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
552 // Easiest case: matching by position only
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
553 if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
554
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
555 int line_type = bcf_get_variant_types(line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
556
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
557 // No matter what the alleles are, as long as they are both SNPs
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
558 if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
559 // ... or indels
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
560 if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
561
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
562 // More thorough checking: REFs must match
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
563 if ( tmpl->rlen != line->rlen ) continue; // different length
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
564 if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
565
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
566 int ial,jal;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
567 if ( files->collapse==COLLAPSE_NONE )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
568 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
569 // Exact match, all alleles must be identical
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
570 if ( tmpl->n_allele!=line->n_allele ) continue; // different number of alleles, skip
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
571
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
572 int nmatch = 1; // REF has been already checked
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
573 for (ial=1; ial<tmpl->n_allele; ial++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
574 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
575 for (jal=1; jal<line->n_allele; jal++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
576 if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
577 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
578 if ( nmatch==tmpl->n_allele ) { irec=i; break; } // found: exact match
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
579 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
580 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
581
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
582 if ( line->n_allele==1 && tmpl->n_allele==1 ) { irec=i; break; } // both sites are non-variant
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
583
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
584 // COLLAPSE_SOME: at least some ALTs must match
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
585 for (ial=1; ial<tmpl->n_allele; ial++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
586 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
587 for (jal=1; jal<line->n_allele; jal++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
588 if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
589 if ( irec>=1 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
590 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
591 if ( irec>=1 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
592 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
593 if ( irec==-1 ) return -1; // no matching line was found
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
594 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
595
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
596 // Set the selected line (irec) as active: set it to buffer[0], move the remaining lines forward
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
597 // and put the old bcf1_t record at the end.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
598 bcf1_t *tmp = reader->buffer[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
599 reader->buffer[0] = reader->buffer[irec];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
600 for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
601 reader->buffer[ reader->nbuffer ] = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
602 reader->nbuffer--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
603
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
604 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
605 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
606
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
607 int _reader_next_line(bcf_srs_t *files)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
608 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
609 int i, min_pos = INT_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
610
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
611 // Loop until next suitable line is found or all readers have finished
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
612 while ( 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
613 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
614 // Get all readers ready for the next region.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
615 if ( files->regions && _readers_next_region(files)<0 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
616
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
617 // Fill buffers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
618 const char *chr = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
619 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
620 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
621 _reader_fill_buffer(files, &files->readers[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
622
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
623 // Update the minimum coordinate
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
624 if ( !files->readers[i].nbuffer ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
625 if ( min_pos > files->readers[i].buffer[1]->pos )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
626 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
627 min_pos = files->readers[i].buffer[1]->pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
628 chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
629 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
630 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
631 if ( min_pos==INT_MAX )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
632 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
633 if ( !files->regions ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
634 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
635 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
636
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
637 // Skip this position if not present in targets
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
638 if ( files->targets )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
639 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
640 int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
641 if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
642 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
643 // Remove all lines with this position from the buffer
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
644 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
645 if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
646 _reader_shift_buffer(&files->readers[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
647 min_pos = INT_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
648 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
649 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
650 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
651
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
652 break; // done: min_pos is set
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
653 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
654
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
655 // There can be records with duplicate positions. Set the active line intelligently so that
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
656 // the alleles match.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
657 int nret = 0; // number of readers sharing the position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
658 bcf1_t *first = NULL; // record which will be used for allele matching
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
659 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
660 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
661 files->has_line[i] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
662
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
663 // Skip readers with no records at this position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
664 if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
665
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
666 // Until now buffer[0] of all reader was empty and the lines started at buffer[1].
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
667 // Now lines which are ready to be output will be moved to buffer[0].
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
668 if ( _reader_match_alleles(files, &files->readers[i], first) < 0 ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
669 if ( !first ) first = files->readers[i].buffer[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
670
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
671 nret++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
672 files->has_line[i] = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
673 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
674 return nret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
675 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
676
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
677 int bcf_sr_next_line(bcf_srs_t *files)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
678 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
679 if ( !files->targets_als )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
680 return _reader_next_line(files);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
681
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
682 while (1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
683 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
684 int i, ret = _reader_next_line(files);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
685 if ( !ret ) return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
686
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
687 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
688 if ( files->has_line[i] ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
689
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
690 if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
691
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
692 // Check if there are more duplicate lines in the buffers. If not, return this line as if it
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
693 // matched the targets, even if there is a type mismatch
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
694 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
695 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
696 if ( !files->has_line[i] ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
697 if ( files->readers[i].nbuffer==0 || files->readers[i].buffer[1]->pos!=files->readers[i].buffer[0]->pos ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
698 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
699 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
700 if ( i==files->nreaders ) return ret; // no more lines left, output even if target alleles are not of the same type
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
701 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
702 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
703
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
704 static void bcf_sr_seek_start(bcf_srs_t *readers)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
705 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
706 bcf_sr_regions_t *reg = readers->regions;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
707 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
708 for (i=0; i<reg->nseqs; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
709 reg->regs[i].creg = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
710 reg->iseq = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
711 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
712
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
713
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
714 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
715 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
716 if ( !seq && !pos )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
717 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
718 // seek to start
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
719 bcf_sr_seek_start(readers);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
720 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
721 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
722
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
723 bcf_sr_regions_overlap(readers->regions, seq, pos, pos);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
724 int i, nret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
725 for (i=0; i<readers->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
726 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
727 nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
728 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
729 return nret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
730 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
731
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
732 int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
733 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
734 int i, j, nsmpl, free_smpl = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
735 char **smpl = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
736
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
737 void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
738 if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
739 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
740 smpl = hts_readlist(fname, is_file, &nsmpl);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
741 if ( !smpl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
742 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
743 fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
744 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
745 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
746 if ( exclude )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
747 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
748 for (i=0; i<nsmpl; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
749 khash_str2int_inc(exclude, smpl[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
750 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
751 free_smpl = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
752 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
753 if ( !smpl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
754 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
755 smpl = files->readers[0].header->samples; // intersection of all samples
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
756 nsmpl = bcf_hdr_nsamples(files->readers[0].header);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
757 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
758
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
759 files->samples = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
760 files->n_smpl = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
761 for (i=0; i<nsmpl; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
762 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
763 if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
764
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
765 int n_isec = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
766 for (j=0; j<files->nreaders; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
767 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
768 if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
769 n_isec++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
770 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
771 if ( n_isec!=files->nreaders )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
772 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
773 fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
774 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
775 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
776
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
777 files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
778 files->samples[files->n_smpl++] = strdup(smpl[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
779 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
780
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
781 if ( exclude ) khash_str2int_destroy(exclude);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
782 if ( free_smpl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
783 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
784 for (i=0; i<nsmpl; i++) free(smpl[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
785 free(smpl);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
786 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
787
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
788 if ( !files->n_smpl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
789 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
790 if ( files->nreaders>1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
791 fprintf(stderr,"No samples in common.\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
792 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
793 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
794 for (i=0; i<files->nreaders; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
795 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
796 bcf_sr_t *reader = &files->readers[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
797 reader->samples = (int*) malloc(sizeof(int)*files->n_smpl);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
798 reader->n_smpl = files->n_smpl;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
799 for (j=0; j<files->n_smpl; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
800 reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
801 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
802 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
803 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
804
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
805 // Add a new region into a list sorted by start,end. On input the coordinates
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
806 // are 1-based, stored 0-based, inclusive.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
807 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
808 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
809 if ( start==-1 && end==-1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
810 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
811 start = 0; end = MAX_CSI_COOR-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
812 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
813 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
814 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
815 start--; end--; // store 0-based coordinates
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
816 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
817
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
818 if ( !reg->seq_hash )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
819 reg->seq_hash = khash_str2int_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
820
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
821 int iseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
822 if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
823 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
824 // the chromosome block does not exist
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
825 iseq = reg->nseqs++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
826 reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
827 reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
828 memset(&reg->regs[reg->nseqs-1],0,sizeof(region_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
829 reg->seq_names[iseq] = strdup(chr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
830 reg->regs[iseq].creg = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
831 khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
832 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
833
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
834 region_t *creg = &reg->regs[iseq];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
835
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
836 // the regions may not be sorted on input: binary search
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
837 int i, min = 0, max = creg->nregs - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
838 while ( min<=max )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
839 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
840 i = (max+min)/2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
841 if ( start < creg->regs[i].start ) max = i - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
842 else if ( start > creg->regs[i].start ) min = i + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
843 else break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
844 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
845 if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
846 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
847 // no such region, insert a new one just after max
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
848 hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
849 if ( ++max < creg->nregs )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
850 memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
851 creg->regs[max].start = start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
852 creg->regs[max].end = end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
853 creg->nregs++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
854 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
855 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
856
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
857 // File name or a list of genomic locations. If file name, NULL is returned.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
858 static bcf_sr_regions_t *_regions_init_string(const char *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
859 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
860 bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
861 reg->start = reg->end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
862 reg->prev_start = reg->prev_seq = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
863
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
864 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
865 const char *sp = str, *ep = str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
866 int from, to;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
867 while ( 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
868 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
869 while ( *ep && *ep!=',' && *ep!=':' ) ep++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
870 tmp.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
871 kputsn(sp,ep-sp,&tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
872 if ( *ep==':' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
873 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
874 sp = ep+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
875 from = strtol(sp,(char**)&ep,10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
876 if ( sp==ep )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
877 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
878 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
879 free(reg); free(tmp.s); return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
880 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
881 if ( !*ep || *ep==',' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
882 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
883 _regions_add(reg, tmp.s, from, from);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
884 sp = ep;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
885 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
886 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
887 if ( *ep!='-' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
888 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
889 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
890 free(reg); free(tmp.s); return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
891 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
892 ep++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
893 sp = ep;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
894 to = strtol(sp,(char**)&ep,10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
895 if ( *ep && *ep!=',' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
896 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
897 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
898 free(reg); free(tmp.s); return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
899 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
900 if ( sp==ep ) to = MAX_CSI_COOR-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
901 _regions_add(reg, tmp.s, from, to);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
902 if ( !*ep ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
903 sp = ep;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
904 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
905 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
906 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
907 if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
908 if ( !*ep ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
909 sp = ++ep;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
910 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
911 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
912 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
913 return reg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
914 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
915
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
916 // ichr,ifrom,ito are 0-based;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
917 // returns -1 on error, 0 if the line is a comment line, 1 on success
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
918 static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
919 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
920 *chr_end = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
921
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
922 if ( line[0]=='#' ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
923
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
924 int k,l; // index of the start and end column of the tab-delimited file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
925 if ( ifrom <= ito )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
926 k = ifrom, l = ito;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
927 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
928 l = ifrom, k = ito;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
929
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
930 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
931 char *se = line, *ss = NULL; // start and end
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
932 char *tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
933 for (i=0; i<=k && *se; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
934 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
935 ss = i==0 ? se++ : ++se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
936 while (*se && *se!='\t') se++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
937 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
938 if ( i<=k ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
939 if ( k==l )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
940 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
941 *from = *to = strtol(ss, &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
942 if ( tmp==ss ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
943 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
944 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
945 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
946 if ( k==ifrom )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
947 *from = strtol(ss, &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
948 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
949 *to = strtol(ss, &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
950 if ( ss==tmp ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
951
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
952 for (i=k; i<l && *se; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
953 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
954 ss = ++se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
955 while (*se && *se!='\t') se++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
956 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
957 if ( i<l ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
958 if ( k==ifrom )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
959 *to = strtol(ss, &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
960 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
961 *from = strtol(ss, &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
962 if ( ss==tmp ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
963 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
964
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
965 ss = se = line;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
966 for (i=0; i<=ichr && *se; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
967 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
968 if ( i>0 ) ss = ++se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
969 while (*se && *se!='\t') se++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
970 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
971 if ( i<=ichr ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
972 *chr_end = se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
973 *chr = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
974 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
975 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
976
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
977 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
978 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
979 bcf_sr_regions_t *reg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
980 if ( !is_file ) return _regions_init_string(regions);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
981
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
982 reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
983 reg->start = reg->end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
984 reg->prev_start = reg->prev_seq = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
985
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
986 reg->file = hts_open(regions, "rb");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
987 if ( !reg->file )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
988 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
989 fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
990 free(reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
991 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
992 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
993
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
994 reg->tbx = tbx_index_load(regions);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
995 if ( !reg->tbx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
996 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
997 int len = strlen(regions);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
998 int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
999 if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1000
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1001 if ( reg->file->format.format==vcf ) ito = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1002
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1003 // read the whole file, tabix index is not present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1004 while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1005 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1006 char *chr, *chr_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1007 int from, to, ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1008 ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1009 if ( ret < 0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1010 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1011 if ( ito<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1012 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1013 if ( ret<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1014 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1015 fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1016 hts_close(reg->file); reg->file = NULL; free(reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1017 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1018 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1019 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1020 if ( !ret ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1021 if ( is_bed ) from++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1022 *chr_end = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1023 _regions_add(reg, chr, from, to);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1024 *chr_end = '\t';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1025 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1026 hts_close(reg->file); reg->file = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1027 if ( !reg->nseqs ) { free(reg); return NULL; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1028 return reg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1029 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1030
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1031 reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1032 if ( !reg->seq_hash )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1033 reg->seq_hash = khash_str2int_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1034 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1035 for (i=0; i<reg->nseqs; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1036 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1037 khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1038 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1039 reg->fname = strdup(regions);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1040 reg->is_bin = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1041 return reg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1042 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1043
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1044 void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1045 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1046 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1047 free(reg->fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1048 if ( reg->itr ) tbx_itr_destroy(reg->itr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1049 if ( reg->tbx ) tbx_destroy(reg->tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1050 if ( reg->file ) hts_close(reg->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1051 if ( reg->als ) free(reg->als);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1052 if ( reg->als_str.s ) free(reg->als_str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1053 free(reg->line.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1054 if ( reg->regs )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1055 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1056 // free only in-memory names, tbx names are const
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1057 for (i=0; i<reg->nseqs; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1058 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1059 free(reg->seq_names[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1060 free(reg->regs[i].regs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1061 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1062 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1063 free(reg->regs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1064 free(reg->seq_names);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1065 khash_str2int_destroy(reg->seq_hash);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1066 free(reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1067 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1068
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1069 int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1070 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1071 reg->iseq = reg->start = reg->end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1072 if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1073
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1074 // using in-memory regions
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1075 if ( reg->regs )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1076 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1077 reg->regs[reg->iseq].creg = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1078 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1079 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1080
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1081 // reading regions from tabix
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1082 if ( reg->itr ) tbx_itr_destroy(reg->itr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1083 reg->itr = tbx_itr_querys(reg->tbx, seq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1084 if ( reg->itr ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1085
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1086 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1087 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1088
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1089 int bcf_sr_regions_next(bcf_sr_regions_t *reg)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1090 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1091 if ( reg->iseq<0 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1092 reg->start = reg->end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1093 reg->nals = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1094
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1095 // using in-memory regions
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1096 if ( reg->regs )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1097 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1098 while ( reg->iseq < reg->nseqs )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1099 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1100 reg->regs[reg->iseq].creg++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1101 if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1102 reg->iseq++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1103 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1104 if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1105 region1_t *creg = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1106 reg->start = creg->start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1107 reg->end = creg->end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1108 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1109 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1110
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1111 // reading from tabix
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1112 char *chr, *chr_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1113 int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1114 if ( reg->tbx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1115 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1116 ichr = reg->tbx->conf.sc-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1117 ifrom = reg->tbx->conf.bc-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1118 ito = reg->tbx->conf.ec-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1119 if ( ito<0 ) ito = ifrom;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1120 is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1121 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1122
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1123 int ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1124 while ( !ret )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1125 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1126 if ( reg->itr )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1127 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1128 // tabix index present, reading a chromosome block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1129 ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1130 if ( ret<0 ) { reg->iseq = -1; return -1; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1131 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1132 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1133 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1134 if ( reg->is_bin )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1135 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1136 // Waited for seek which never came. Reopen in text mode and stream
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1137 // through the regions, otherwise hts_getline would fail
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1138 hts_close(reg->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1139 reg->file = hts_open(reg->fname, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1140 if ( !reg->file )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1141 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1142 fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1143 reg->file = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1144 bcf_sr_regions_destroy(reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1145 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1146 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1147 reg->is_bin = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1150 // tabix index absent, reading the whole file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1151 ret = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1152 if ( ret<0 ) { reg->iseq = -1; return -1; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1153 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1154 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1155 if ( ret<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1156 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1157 fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1158 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1159 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1160 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1161 if ( is_bed ) from++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1162
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1163 *chr_end = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1164 if ( khash_str2int_get(reg->seq_hash, chr, &reg->iseq)<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1165 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1166 fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1167 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1168 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1169 *chr_end = '\t';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1170
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1171 reg->start = from - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1172 reg->end = to - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1173 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1174 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1175
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1176 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1177 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1178 int i = 0, max_len = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1179 if ( !reg->nals )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1180 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1181 char *ss = reg->line.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1182 while ( i<als_idx && *ss )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1183 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1184 if ( *ss=='\t' ) i++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1185 ss++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1186 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1187 char *se = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1188 reg->nals = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1189 while ( *se && *se!='\t' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1190 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1191 if ( *se==',' ) reg->nals++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1192 se++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1193 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1194 ks_resize(&reg->als_str, se-ss+1+reg->nals);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1195 reg->als_str.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1196 hts_expand(char*,reg->nals,reg->mals,reg->als);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1197 reg->nals = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1198
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1199 se = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1200 while ( *(++se) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1201 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1202 if ( *se=='\t' ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1203 if ( *se!=',' ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1204 reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1205 kputsn(ss,se-ss,&reg->als_str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1206 if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1207 reg->als_str.l++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1208 reg->nals++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1209 ss = ++se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1210 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1211 reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1212 kputsn(ss,se-ss,&reg->als_str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1213 if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1214 reg->nals++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1215 reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1216 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1217 int type = bcf_get_variant_types(rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1218 if ( reg->als_type & VCF_INDEL )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1219 return type & VCF_INDEL ? 1 : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1220 return !(type & VCF_INDEL) ? 1 : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1221 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1222
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1223 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1224 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1225 int iseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1226 if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1227
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1228 if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1229 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1230 // flush regions left on previous chromosome
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1231 if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1232 bcf_sr_regions_flush(reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1233
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1234 bcf_sr_regions_seek(reg, seq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1235 reg->start = reg->end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1236 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1237 if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1238 reg->prev_seq = reg->iseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1239 reg->prev_start = start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1240
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1241 while ( iseq==reg->iseq && reg->end < start )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1242 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1243 if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1244 if ( reg->iseq != iseq ) return -1; // does not overlap any regions
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1245 if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1246 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1247 if ( reg->start <= end ) return 0; // region overlap
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1248 return -1; // no overlap
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1249 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1250
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1251 void bcf_sr_regions_flush(bcf_sr_regions_t *reg)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1252 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1253 if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1254 while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1255 return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1256 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1257