0
|
1 /* synced_bcf_reader.c -- stream through multiple VCF files.
|
|
2
|
|
3 Copyright (C) 2012-2014 Genome Research Ltd.
|
|
4
|
|
5 Author: Petr Danecek <pd3@sanger.ac.uk>
|
|
6
|
|
7 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8 of this software and associated documentation files (the "Software"), to deal
|
|
9 in the Software without restriction, including without limitation the rights
|
|
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11 copies of the Software, and to permit persons to whom the Software is
|
|
12 furnished to do so, subject to the following conditions:
|
|
13
|
|
14 The above copyright notice and this permission notice shall be included in
|
|
15 all copies or substantial portions of the Software.
|
|
16
|
|
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
23 DEALINGS IN THE SOFTWARE. */
|
|
24
|
|
25 #include <stdio.h>
|
|
26 #include <unistd.h>
|
|
27 #include <string.h>
|
|
28 #include <limits.h>
|
|
29 #include <errno.h>
|
|
30 #include <ctype.h>
|
|
31 #include <sys/stat.h>
|
|
32 #include "htslib/synced_bcf_reader.h"
|
|
33 #include "htslib/kseq.h"
|
|
34 #include "htslib/khash_str2int.h"
|
|
35
|
|
36 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
|
|
37
|
|
38 typedef struct
|
|
39 {
|
|
40 uint32_t start, end;
|
|
41 }
|
|
42 region1_t;
|
|
43
|
|
44 typedef struct _region_t
|
|
45 {
|
|
46 region1_t *regs;
|
|
47 int nregs, mregs, creg;
|
|
48 }
|
|
49 region_t;
|
|
50
|
|
51 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end);
|
|
52 static bcf_sr_regions_t *_regions_init_string(const char *str);
|
|
53 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
|
|
54
|
|
55 char *bcf_sr_strerror(int errnum)
|
|
56 {
|
|
57 switch (errnum)
|
|
58 {
|
|
59 case open_failed:
|
|
60 return strerror(errno); break;
|
|
61 case not_bgzf:
|
|
62 return "not compressed with bgzip"; break;
|
|
63 case idx_load_failed:
|
|
64 return "could not load index"; break;
|
|
65 case file_type_error:
|
|
66 return "unknown file type"; break;
|
|
67 case api_usage_error:
|
|
68 return "API usage error"; break;
|
|
69 case header_error:
|
|
70 return "could not parse header"; break;
|
|
71 default: return "";
|
|
72 }
|
|
73 }
|
|
74
|
|
75 static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
|
|
76 {
|
|
77 kstring_t str = {0,0,0};
|
|
78 const char *tmp = filters, *prev = filters;
|
|
79 int nout = 0, *out = NULL;
|
|
80 while ( 1 )
|
|
81 {
|
|
82 if ( *tmp==',' || !*tmp )
|
|
83 {
|
|
84 out = (int*) realloc(out, (nout+1)*sizeof(int));
|
|
85 if ( tmp-prev==1 && *prev=='.' )
|
|
86 out[nout] = -1;
|
|
87 else
|
|
88 {
|
|
89 str.l = 0;
|
|
90 kputsn(prev, tmp-prev, &str);
|
|
91 out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s);
|
|
92 }
|
|
93 nout++;
|
|
94 if ( !*tmp ) break;
|
|
95 prev = tmp+1;
|
|
96 }
|
|
97 tmp++;
|
|
98 }
|
|
99 if ( str.m ) free(str.s);
|
|
100 *nfilters = nout;
|
|
101 return out;
|
|
102 }
|
|
103
|
|
104 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
|
|
105 {
|
|
106 assert( !readers->regions );
|
|
107 if ( readers->nreaders )
|
|
108 {
|
|
109 fprintf(stderr,"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__);
|
|
110 return -1;
|
|
111 }
|
|
112 readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2);
|
|
113 if ( !readers->regions ) return -1;
|
|
114 readers->explicit_regs = 1;
|
|
115 readers->require_index = 1;
|
|
116 return 0;
|
|
117 }
|
|
118 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles)
|
|
119 {
|
|
120 assert( !readers->targets );
|
|
121 if ( targets[0]=='^' )
|
|
122 {
|
|
123 readers->targets_exclude = 1;
|
|
124 targets++;
|
|
125 }
|
|
126 readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2);
|
|
127 if ( !readers->targets ) return -1;
|
|
128 readers->targets_als = alleles;
|
|
129 return 0;
|
|
130 }
|
|
131
|
|
132 int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
|
|
133 {
|
|
134 htsFile* file_ptr = hts_open(fname, "r");
|
|
135 if ( ! file_ptr ) {
|
|
136 files->errnum = open_failed;
|
|
137 return 0;
|
|
138 }
|
|
139
|
|
140 files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
|
|
141 files->has_line[files->nreaders] = 0;
|
|
142 files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
|
|
143 bcf_sr_t *reader = &files->readers[files->nreaders++];
|
|
144 memset(reader,0,sizeof(bcf_sr_t));
|
|
145
|
|
146 reader->file = file_ptr;
|
|
147
|
|
148 files->errnum = 0;
|
|
149
|
|
150 if ( files->require_index )
|
|
151 {
|
|
152 if ( reader->file->format.format==vcf )
|
|
153 {
|
|
154 if ( reader->file->format.compression!=bgzf )
|
|
155 {
|
|
156 files->errnum = not_bgzf;
|
|
157 return 0;
|
|
158 }
|
|
159
|
|
160 reader->tbx_idx = tbx_index_load(fname);
|
|
161 if ( !reader->tbx_idx )
|
|
162 {
|
|
163 files->errnum = idx_load_failed;
|
|
164 return 0;
|
|
165 }
|
|
166
|
|
167 reader->header = bcf_hdr_read(reader->file);
|
|
168 }
|
|
169 else if ( reader->file->format.format==bcf )
|
|
170 {
|
|
171 if ( reader->file->format.compression!=bgzf )
|
|
172 {
|
|
173 files->errnum = not_bgzf;
|
|
174 return 0;
|
|
175 }
|
|
176
|
|
177 reader->header = bcf_hdr_read(reader->file);
|
|
178
|
|
179 reader->bcf_idx = bcf_index_load(fname);
|
|
180 if ( !reader->bcf_idx )
|
|
181 {
|
|
182 files->errnum = idx_load_failed;
|
|
183 return 0;
|
|
184 }
|
|
185 }
|
|
186 else
|
|
187 {
|
|
188 files->errnum = file_type_error;
|
|
189 return 0;
|
|
190 }
|
|
191 }
|
|
192 else
|
|
193 {
|
|
194 if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
|
|
195 {
|
|
196 reader->header = bcf_hdr_read(reader->file);
|
|
197 }
|
|
198 else
|
|
199 {
|
|
200 files->errnum = file_type_error;
|
|
201 return 0;
|
|
202 }
|
|
203 files->streaming = 1;
|
|
204 }
|
|
205 if ( files->streaming && files->nreaders>1 )
|
|
206 {
|
|
207 files->errnum = api_usage_error;
|
|
208 fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
|
|
209 return 0;
|
|
210 }
|
|
211 if ( files->streaming && files->regions )
|
|
212 {
|
|
213 files->errnum = api_usage_error;
|
|
214 fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
|
|
215 return 0;
|
|
216 }
|
|
217 if ( !reader->header )
|
|
218 {
|
|
219 files->errnum = header_error;
|
|
220 return 0;
|
|
221 }
|
|
222
|
|
223 reader->fname = fname;
|
|
224 if ( files->apply_filters )
|
|
225 reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);
|
|
226
|
|
227 // Update list of chromosomes
|
|
228 if ( !files->explicit_regs && !files->streaming )
|
|
229 {
|
|
230 int n,i;
|
|
231 const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
|
|
232 for (i=0; i<n; i++)
|
|
233 {
|
|
234 if ( !files->regions )
|
|
235 files->regions = _regions_init_string(names[i]);
|
|
236 else
|
|
237 _regions_add(files->regions, names[i], -1, -1);
|
|
238 }
|
|
239 free(names);
|
|
240 }
|
|
241
|
|
242 return 1;
|
|
243 }
|
|
244
|
|
245 bcf_srs_t *bcf_sr_init(void)
|
|
246 {
|
|
247 bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t));
|
|
248 return files;
|
|
249 }
|
|
250
|
|
251 static void bcf_sr_destroy1(bcf_sr_t *reader)
|
|
252 {
|
|
253 if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx);
|
|
254 if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx);
|
|
255 bcf_hdr_destroy(reader->header);
|
|
256 hts_close(reader->file);
|
|
257 if ( reader->itr ) tbx_itr_destroy(reader->itr);
|
|
258 int j;
|
|
259 for (j=0; j<reader->mbuffer; j++)
|
|
260 bcf_destroy1(reader->buffer[j]);
|
|
261 free(reader->buffer);
|
|
262 free(reader->samples);
|
|
263 free(reader->filter_ids);
|
|
264 }
|
|
265 void bcf_sr_destroy(bcf_srs_t *files)
|
|
266 {
|
|
267 int i;
|
|
268 for (i=0; i<files->nreaders; i++)
|
|
269 bcf_sr_destroy1(&files->readers[i]);
|
|
270 free(files->has_line);
|
|
271 free(files->readers);
|
|
272 for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
|
|
273 free(files->samples);
|
|
274 if (files->targets) bcf_sr_regions_destroy(files->targets);
|
|
275 if (files->regions) bcf_sr_regions_destroy(files->regions);
|
|
276 if ( files->tmps.m ) free(files->tmps.s);
|
|
277 free(files);
|
|
278 }
|
|
279
|
|
280 void bcf_sr_remove_reader(bcf_srs_t *files, int i)
|
|
281 {
|
|
282 assert( !files->samples ); // not ready for this yet
|
|
283 bcf_sr_destroy1(&files->readers[i]);
|
|
284 if ( i+1 < files->nreaders )
|
|
285 {
|
|
286 memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t));
|
|
287 memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int));
|
|
288 }
|
|
289 files->nreaders--;
|
|
290 }
|
|
291
|
|
292
|
|
293 /*
|
|
294 Removes duplicate records from the buffer. The meaning of "duplicate" is
|
|
295 controlled by the $collapse variable, which can cause that from multiple
|
|
296 <indel|snp|any> lines only the first is considered and the rest is ignored.
|
|
297 The removal is done by setting the redundant lines' positions to -1 and
|
|
298 moving these lines at the end of the buffer.
|
|
299 */
|
|
300 static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader)
|
|
301 {
|
|
302 int irec,jrec, has_snp=0, has_indel=0, has_any=0;
|
|
303 for (irec=1; irec<=reader->nbuffer; irec++)
|
|
304 {
|
|
305 bcf1_t *line = reader->buffer[irec];
|
|
306 if ( line->pos != reader->buffer[1]->pos ) break;
|
|
307 if ( files->collapse&COLLAPSE_ANY )
|
|
308 {
|
|
309 if ( !has_any ) has_any = 1;
|
|
310 else line->pos = -1;
|
|
311 }
|
|
312 int line_type = bcf_get_variant_types(line);
|
|
313 if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) )
|
|
314 {
|
|
315 if ( !has_snp ) has_snp = 1;
|
|
316 else line->pos = -1;
|
|
317 }
|
|
318 if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL )
|
|
319 {
|
|
320 if ( !has_indel ) has_indel = 1;
|
|
321 else line->pos = -1;
|
|
322 }
|
|
323 }
|
|
324 bcf1_t *tmp;
|
|
325 irec = jrec = 1;
|
|
326 while ( irec<=reader->nbuffer && jrec<=reader->nbuffer )
|
|
327 {
|
|
328 if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; }
|
|
329 if ( jrec<=irec ) jrec = irec+1;
|
|
330 while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++;
|
|
331 if ( jrec<=reader->nbuffer )
|
|
332 {
|
|
333 tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp;
|
|
334 }
|
|
335 }
|
|
336 reader->nbuffer = irec - 1;
|
|
337 }
|
|
338
|
|
339 void debug_buffer(FILE *fp, bcf_sr_t *reader)
|
|
340 {
|
|
341 int j;
|
|
342 for (j=0; j<=reader->nbuffer; j++)
|
|
343 {
|
|
344 bcf1_t *line = reader->buffer[j];
|
|
345 fprintf(fp,"%s%s\t%s:%d\t%s ", reader->fname,j==0?"*":"",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:"");
|
|
346 int k;
|
|
347 for (k=1; k<line->n_allele; k++) fprintf(fp," %s", line->d.allele[k]);
|
|
348 fprintf(fp,"\n");
|
|
349 }
|
|
350 }
|
|
351
|
|
352 void debug_buffers(FILE *fp, bcf_srs_t *files)
|
|
353 {
|
|
354 int i;
|
|
355 for (i=0; i<files->nreaders; i++)
|
|
356 {
|
|
357 fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,i),files->readers[i].fname);
|
|
358 debug_buffer(fp, &files->readers[i]);
|
|
359 }
|
|
360 fprintf(fp,"\n");
|
|
361 }
|
|
362
|
|
363 static inline int has_filter(bcf_sr_t *reader, bcf1_t *line)
|
|
364 {
|
|
365 int i, j;
|
|
366 if ( !line->d.n_flt )
|
|
367 {
|
|
368 for (j=0; j<reader->nfilter_ids; j++)
|
|
369 if ( reader->filter_ids[j]<0 ) return 1;
|
|
370 return 0;
|
|
371 }
|
|
372 for (i=0; i<line->d.n_flt; i++)
|
|
373 {
|
|
374 for (j=0; j<reader->nfilter_ids; j++)
|
|
375 if ( line->d.flt[i]==reader->filter_ids[j] ) return 1;
|
|
376 }
|
|
377 return 0;
|
|
378 }
|
|
379
|
|
380 static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
|
|
381 {
|
|
382 if ( end>=MAX_CSI_COOR )
|
|
383 {
|
|
384 fprintf(stderr,"The coordinate is out of csi index limit: %d\n", end+1);
|
|
385 exit(1);
|
|
386 }
|
|
387 if ( reader->itr )
|
|
388 {
|
|
389 hts_itr_destroy(reader->itr);
|
|
390 reader->itr = NULL;
|
|
391 }
|
|
392 reader->nbuffer = 0;
|
|
393 if ( reader->tbx_idx )
|
|
394 {
|
|
395 int tid = tbx_name2id(reader->tbx_idx, seq);
|
|
396 if ( tid==-1 ) return -1; // the sequence not present in this file
|
|
397 reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1);
|
|
398 }
|
|
399 else
|
|
400 {
|
|
401 int tid = bcf_hdr_name2id(reader->header, seq);
|
|
402 if ( tid==-1 ) return -1; // the sequence not present in this file
|
|
403 reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1);
|
|
404 }
|
|
405 assert(reader->itr);
|
|
406 return 0;
|
|
407 }
|
|
408
|
|
409 /*
|
|
410 * _readers_next_region() - jumps to next region if necessary
|
|
411 * Returns 0 on success or -1 when there are no more regions left
|
|
412 */
|
|
413 static int _readers_next_region(bcf_srs_t *files)
|
|
414 {
|
|
415 // Need to open new chromosome? Check number of lines in all readers' buffers
|
|
416 int i, eos = 0;
|
|
417 for (i=0; i<files->nreaders; i++)
|
|
418 if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
|
|
419
|
|
420 if ( eos!=files->nreaders )
|
|
421 {
|
|
422 // Some of the readers still has buffered lines
|
|
423 return 0;
|
|
424 }
|
|
425
|
|
426 // No lines in the buffer, need to open new region or quit
|
|
427 if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
|
|
428
|
|
429 for (i=0; i<files->nreaders; i++)
|
|
430 _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
|
|
431
|
|
432 return 0;
|
|
433 }
|
|
434
|
|
435 /*
|
|
436 * _reader_fill_buffer() - buffers all records with the same coordinate
|
|
437 */
|
|
438 static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
|
|
439 {
|
|
440 // Return if the buffer is full: the coordinate of the last buffered record differs
|
|
441 if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;
|
|
442
|
|
443 // No iterator (sequence not present in this file) and not streaming
|
|
444 if ( !reader->itr && !files->streaming ) return;
|
|
445
|
|
446 // Fill the buffer with records starting at the same position
|
|
447 int i, ret = 0;
|
|
448 while (1)
|
|
449 {
|
|
450 if ( reader->nbuffer+1 >= reader->mbuffer )
|
|
451 {
|
|
452 // Increase buffer size
|
|
453 reader->mbuffer += 8;
|
|
454 reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
|
|
455 for (i=8; i>0; i--) // initialize
|
|
456 {
|
|
457 reader->buffer[reader->mbuffer-i] = bcf_init1();
|
|
458 reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
|
|
459 reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
|
|
460 }
|
|
461 }
|
|
462 if ( files->streaming )
|
|
463 {
|
|
464 if ( reader->file->format.format==vcf )
|
|
465 {
|
|
466 if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines
|
|
467 int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
|
|
468 if ( ret<0 ) break;
|
|
469 }
|
|
470 else if ( reader->file->format.format==bcf )
|
|
471 {
|
|
472 if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
|
|
473 }
|
|
474 else
|
|
475 {
|
|
476 fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
|
|
477 exit(1);
|
|
478 }
|
|
479 }
|
|
480 else if ( reader->tbx_idx )
|
|
481 {
|
|
482 if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines
|
|
483 vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
|
|
484 }
|
|
485 else
|
|
486 {
|
|
487 if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
|
|
488 bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
|
|
489 }
|
|
490
|
|
491 // apply filter
|
|
492 if ( !reader->nfilter_ids )
|
|
493 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
|
|
494 else
|
|
495 {
|
|
496 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
|
|
497 if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
|
|
498 }
|
|
499 reader->nbuffer++;
|
|
500
|
|
501 if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full
|
|
502 }
|
|
503 if ( ret<0 )
|
|
504 {
|
|
505 // done for this region
|
|
506 tbx_itr_destroy(reader->itr);
|
|
507 reader->itr = NULL;
|
|
508 }
|
|
509 if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
|
|
510 collapse_buffer(files, reader);
|
|
511 }
|
|
512
|
|
513 /*
|
|
514 * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position
|
|
515 */
|
|
516 static void _reader_shift_buffer(bcf_sr_t *reader)
|
|
517 {
|
|
518 int i;
|
|
519 for (i=2; i<=reader->nbuffer; i++)
|
|
520 if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break;
|
|
521 if ( i<=reader->nbuffer )
|
|
522 {
|
|
523 // A record with a different position follows, swap it. Because of the reader's logic,
|
|
524 // only one such line can be present.
|
|
525 bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp;
|
|
526 reader->nbuffer = 1;
|
|
527 }
|
|
528 else
|
|
529 reader->nbuffer = 0; // no other line
|
|
530 }
|
|
531
|
|
532 /*
|
|
533 * _reader_match_alleles() - from multiple buffered lines selects the one which
|
|
534 * corresponds best to the template line. The logic is controlled by COLLAPSE_*
|
|
535 * Returns 0 on success or -1 when no good matching line is found.
|
|
536 */
|
|
537 static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl)
|
|
538 {
|
|
539 int i, irec = -1;
|
|
540
|
|
541 // if no template given, use the first available record
|
|
542 if ( !tmpl )
|
|
543 irec = 1;
|
|
544 else
|
|
545 {
|
|
546 int tmpl_type = bcf_get_variant_types(tmpl);
|
|
547 for (i=1; i<=reader->nbuffer; i++)
|
|
548 {
|
|
549 bcf1_t *line = reader->buffer[i];
|
|
550 if ( line->pos != reader->buffer[1]->pos ) break; // done with this reader
|
|
551
|
|
552 // Easiest case: matching by position only
|
|
553 if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; }
|
|
554
|
|
555 int line_type = bcf_get_variant_types(line);
|
|
556
|
|
557 // No matter what the alleles are, as long as they are both SNPs
|
|
558 if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; }
|
|
559 // ... or indels
|
|
560 if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; }
|
|
561
|
|
562 // More thorough checking: REFs must match
|
|
563 if ( tmpl->rlen != line->rlen ) continue; // different length
|
|
564 if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match
|
|
565
|
|
566 int ial,jal;
|
|
567 if ( files->collapse==COLLAPSE_NONE )
|
|
568 {
|
|
569 // Exact match, all alleles must be identical
|
|
570 if ( tmpl->n_allele!=line->n_allele ) continue; // different number of alleles, skip
|
|
571
|
|
572 int nmatch = 1; // REF has been already checked
|
|
573 for (ial=1; ial<tmpl->n_allele; ial++)
|
|
574 {
|
|
575 for (jal=1; jal<line->n_allele; jal++)
|
|
576 if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
|
|
577 }
|
|
578 if ( nmatch==tmpl->n_allele ) { irec=i; break; } // found: exact match
|
|
579 continue;
|
|
580 }
|
|
581
|
|
582 if ( line->n_allele==1 && tmpl->n_allele==1 ) { irec=i; break; } // both sites are non-variant
|
|
583
|
|
584 // COLLAPSE_SOME: at least some ALTs must match
|
|
585 for (ial=1; ial<tmpl->n_allele; ial++)
|
|
586 {
|
|
587 for (jal=1; jal<line->n_allele; jal++)
|
|
588 if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; }
|
|
589 if ( irec>=1 ) break;
|
|
590 }
|
|
591 if ( irec>=1 ) break;
|
|
592 }
|
|
593 if ( irec==-1 ) return -1; // no matching line was found
|
|
594 }
|
|
595
|
|
596 // Set the selected line (irec) as active: set it to buffer[0], move the remaining lines forward
|
|
597 // and put the old bcf1_t record at the end.
|
|
598 bcf1_t *tmp = reader->buffer[0];
|
|
599 reader->buffer[0] = reader->buffer[irec];
|
|
600 for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i];
|
|
601 reader->buffer[ reader->nbuffer ] = tmp;
|
|
602 reader->nbuffer--;
|
|
603
|
|
604 return 0;
|
|
605 }
|
|
606
|
|
607 int _reader_next_line(bcf_srs_t *files)
|
|
608 {
|
|
609 int i, min_pos = INT_MAX;
|
|
610
|
|
611 // Loop until next suitable line is found or all readers have finished
|
|
612 while ( 1 )
|
|
613 {
|
|
614 // Get all readers ready for the next region.
|
|
615 if ( files->regions && _readers_next_region(files)<0 ) break;
|
|
616
|
|
617 // Fill buffers
|
|
618 const char *chr = NULL;
|
|
619 for (i=0; i<files->nreaders; i++)
|
|
620 {
|
|
621 _reader_fill_buffer(files, &files->readers[i]);
|
|
622
|
|
623 // Update the minimum coordinate
|
|
624 if ( !files->readers[i].nbuffer ) continue;
|
|
625 if ( min_pos > files->readers[i].buffer[1]->pos )
|
|
626 {
|
|
627 min_pos = files->readers[i].buffer[1]->pos;
|
|
628 chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]);
|
|
629 }
|
|
630 }
|
|
631 if ( min_pos==INT_MAX )
|
|
632 {
|
|
633 if ( !files->regions ) break;
|
|
634 continue;
|
|
635 }
|
|
636
|
|
637 // Skip this position if not present in targets
|
|
638 if ( files->targets )
|
|
639 {
|
|
640 int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos);
|
|
641 if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) )
|
|
642 {
|
|
643 // Remove all lines with this position from the buffer
|
|
644 for (i=0; i<files->nreaders; i++)
|
|
645 if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos )
|
|
646 _reader_shift_buffer(&files->readers[i]);
|
|
647 min_pos = INT_MAX;
|
|
648 continue;
|
|
649 }
|
|
650 }
|
|
651
|
|
652 break; // done: min_pos is set
|
|
653 }
|
|
654
|
|
655 // There can be records with duplicate positions. Set the active line intelligently so that
|
|
656 // the alleles match.
|
|
657 int nret = 0; // number of readers sharing the position
|
|
658 bcf1_t *first = NULL; // record which will be used for allele matching
|
|
659 for (i=0; i<files->nreaders; i++)
|
|
660 {
|
|
661 files->has_line[i] = 0;
|
|
662
|
|
663 // Skip readers with no records at this position
|
|
664 if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue;
|
|
665
|
|
666 // Until now buffer[0] of all reader was empty and the lines started at buffer[1].
|
|
667 // Now lines which are ready to be output will be moved to buffer[0].
|
|
668 if ( _reader_match_alleles(files, &files->readers[i], first) < 0 ) continue;
|
|
669 if ( !first ) first = files->readers[i].buffer[0];
|
|
670
|
|
671 nret++;
|
|
672 files->has_line[i] = 1;
|
|
673 }
|
|
674 return nret;
|
|
675 }
|
|
676
|
|
677 int bcf_sr_next_line(bcf_srs_t *files)
|
|
678 {
|
|
679 if ( !files->targets_als )
|
|
680 return _reader_next_line(files);
|
|
681
|
|
682 while (1)
|
|
683 {
|
|
684 int i, ret = _reader_next_line(files);
|
|
685 if ( !ret ) return ret;
|
|
686
|
|
687 for (i=0; i<files->nreaders; i++)
|
|
688 if ( files->has_line[i] ) break;
|
|
689
|
|
690 if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret;
|
|
691
|
|
692 // Check if there are more duplicate lines in the buffers. If not, return this line as if it
|
|
693 // matched the targets, even if there is a type mismatch
|
|
694 for (i=0; i<files->nreaders; i++)
|
|
695 {
|
|
696 if ( !files->has_line[i] ) continue;
|
|
697 if ( files->readers[i].nbuffer==0 || files->readers[i].buffer[1]->pos!=files->readers[i].buffer[0]->pos ) continue;
|
|
698 break;
|
|
699 }
|
|
700 if ( i==files->nreaders ) return ret; // no more lines left, output even if target alleles are not of the same type
|
|
701 }
|
|
702 }
|
|
703
|
|
704 static void bcf_sr_seek_start(bcf_srs_t *readers)
|
|
705 {
|
|
706 bcf_sr_regions_t *reg = readers->regions;
|
|
707 int i;
|
|
708 for (i=0; i<reg->nseqs; i++)
|
|
709 reg->regs[i].creg = -1;
|
|
710 reg->iseq = 0;
|
|
711 }
|
|
712
|
|
713
|
|
714 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
|
|
715 {
|
|
716 if ( !seq && !pos )
|
|
717 {
|
|
718 // seek to start
|
|
719 bcf_sr_seek_start(readers);
|
|
720 return 0;
|
|
721 }
|
|
722
|
|
723 bcf_sr_regions_overlap(readers->regions, seq, pos, pos);
|
|
724 int i, nret = 0;
|
|
725 for (i=0; i<readers->nreaders; i++)
|
|
726 {
|
|
727 nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1);
|
|
728 }
|
|
729 return nret;
|
|
730 }
|
|
731
|
|
732 int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
|
|
733 {
|
|
734 int i, j, nsmpl, free_smpl = 0;
|
|
735 char **smpl = NULL;
|
|
736
|
|
737 void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
|
|
738 if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
|
|
739 {
|
|
740 smpl = hts_readlist(fname, is_file, &nsmpl);
|
|
741 if ( !smpl )
|
|
742 {
|
|
743 fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
|
|
744 return 0;
|
|
745 }
|
|
746 if ( exclude )
|
|
747 {
|
|
748 for (i=0; i<nsmpl; i++)
|
|
749 khash_str2int_inc(exclude, smpl[i]);
|
|
750 }
|
|
751 free_smpl = 1;
|
|
752 }
|
|
753 if ( !smpl )
|
|
754 {
|
|
755 smpl = files->readers[0].header->samples; // intersection of all samples
|
|
756 nsmpl = bcf_hdr_nsamples(files->readers[0].header);
|
|
757 }
|
|
758
|
|
759 files->samples = NULL;
|
|
760 files->n_smpl = 0;
|
|
761 for (i=0; i<nsmpl; i++)
|
|
762 {
|
|
763 if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;
|
|
764
|
|
765 int n_isec = 0;
|
|
766 for (j=0; j<files->nreaders; j++)
|
|
767 {
|
|
768 if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
|
|
769 n_isec++;
|
|
770 }
|
|
771 if ( n_isec!=files->nreaders )
|
|
772 {
|
|
773 fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
|
|
774 continue;
|
|
775 }
|
|
776
|
|
777 files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
|
|
778 files->samples[files->n_smpl++] = strdup(smpl[i]);
|
|
779 }
|
|
780
|
|
781 if ( exclude ) khash_str2int_destroy(exclude);
|
|
782 if ( free_smpl )
|
|
783 {
|
|
784 for (i=0; i<nsmpl; i++) free(smpl[i]);
|
|
785 free(smpl);
|
|
786 }
|
|
787
|
|
788 if ( !files->n_smpl )
|
|
789 {
|
|
790 if ( files->nreaders>1 )
|
|
791 fprintf(stderr,"No samples in common.\n");
|
|
792 return 0;
|
|
793 }
|
|
794 for (i=0; i<files->nreaders; i++)
|
|
795 {
|
|
796 bcf_sr_t *reader = &files->readers[i];
|
|
797 reader->samples = (int*) malloc(sizeof(int)*files->n_smpl);
|
|
798 reader->n_smpl = files->n_smpl;
|
|
799 for (j=0; j<files->n_smpl; j++)
|
|
800 reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
|
|
801 }
|
|
802 return 1;
|
|
803 }
|
|
804
|
|
805 // Add a new region into a list sorted by start,end. On input the coordinates
|
|
806 // are 1-based, stored 0-based, inclusive.
|
|
807 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
|
|
808 {
|
|
809 if ( start==-1 && end==-1 )
|
|
810 {
|
|
811 start = 0; end = MAX_CSI_COOR-1;
|
|
812 }
|
|
813 else
|
|
814 {
|
|
815 start--; end--; // store 0-based coordinates
|
|
816 }
|
|
817
|
|
818 if ( !reg->seq_hash )
|
|
819 reg->seq_hash = khash_str2int_init();
|
|
820
|
|
821 int iseq;
|
|
822 if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 )
|
|
823 {
|
|
824 // the chromosome block does not exist
|
|
825 iseq = reg->nseqs++;
|
|
826 reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs);
|
|
827 reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs);
|
|
828 memset(®->regs[reg->nseqs-1],0,sizeof(region_t));
|
|
829 reg->seq_names[iseq] = strdup(chr);
|
|
830 reg->regs[iseq].creg = -1;
|
|
831 khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq);
|
|
832 }
|
|
833
|
|
834 region_t *creg = ®->regs[iseq];
|
|
835
|
|
836 // the regions may not be sorted on input: binary search
|
|
837 int i, min = 0, max = creg->nregs - 1;
|
|
838 while ( min<=max )
|
|
839 {
|
|
840 i = (max+min)/2;
|
|
841 if ( start < creg->regs[i].start ) max = i - 1;
|
|
842 else if ( start > creg->regs[i].start ) min = i + 1;
|
|
843 else break;
|
|
844 }
|
|
845 if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end )
|
|
846 {
|
|
847 // no such region, insert a new one just after max
|
|
848 hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs);
|
|
849 if ( ++max < creg->nregs )
|
|
850 memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t));
|
|
851 creg->regs[max].start = start;
|
|
852 creg->regs[max].end = end;
|
|
853 creg->nregs++;
|
|
854 }
|
|
855 }
|
|
856
|
|
857 // File name or a list of genomic locations. If file name, NULL is returned.
|
|
858 static bcf_sr_regions_t *_regions_init_string(const char *str)
|
|
859 {
|
|
860 bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
|
|
861 reg->start = reg->end = -1;
|
|
862 reg->prev_start = reg->prev_seq = -1;
|
|
863
|
|
864 kstring_t tmp = {0,0,0};
|
|
865 const char *sp = str, *ep = str;
|
|
866 int from, to;
|
|
867 while ( 1 )
|
|
868 {
|
|
869 while ( *ep && *ep!=',' && *ep!=':' ) ep++;
|
|
870 tmp.l = 0;
|
|
871 kputsn(sp,ep-sp,&tmp);
|
|
872 if ( *ep==':' )
|
|
873 {
|
|
874 sp = ep+1;
|
|
875 from = strtol(sp,(char**)&ep,10);
|
|
876 if ( sp==ep )
|
|
877 {
|
|
878 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
|
|
879 free(reg); free(tmp.s); return NULL;
|
|
880 }
|
|
881 if ( !*ep || *ep==',' )
|
|
882 {
|
|
883 _regions_add(reg, tmp.s, from, from);
|
|
884 sp = ep;
|
|
885 continue;
|
|
886 }
|
|
887 if ( *ep!='-' )
|
|
888 {
|
|
889 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
|
|
890 free(reg); free(tmp.s); return NULL;
|
|
891 }
|
|
892 ep++;
|
|
893 sp = ep;
|
|
894 to = strtol(sp,(char**)&ep,10);
|
|
895 if ( *ep && *ep!=',' )
|
|
896 {
|
|
897 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
|
|
898 free(reg); free(tmp.s); return NULL;
|
|
899 }
|
|
900 if ( sp==ep ) to = MAX_CSI_COOR-1;
|
|
901 _regions_add(reg, tmp.s, from, to);
|
|
902 if ( !*ep ) break;
|
|
903 sp = ep;
|
|
904 }
|
|
905 else
|
|
906 {
|
|
907 if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
|
|
908 if ( !*ep ) break;
|
|
909 sp = ++ep;
|
|
910 }
|
|
911 }
|
|
912 free(tmp.s);
|
|
913 return reg;
|
|
914 }
|
|
915
|
|
916 // ichr,ifrom,ito are 0-based;
|
|
917 // returns -1 on error, 0 if the line is a comment line, 1 on success
|
|
918 static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to)
|
|
919 {
|
|
920 *chr_end = NULL;
|
|
921
|
|
922 if ( line[0]=='#' ) return 0;
|
|
923
|
|
924 int k,l; // index of the start and end column of the tab-delimited file
|
|
925 if ( ifrom <= ito )
|
|
926 k = ifrom, l = ito;
|
|
927 else
|
|
928 l = ifrom, k = ito;
|
|
929
|
|
930 int i;
|
|
931 char *se = line, *ss = NULL; // start and end
|
|
932 char *tmp;
|
|
933 for (i=0; i<=k && *se; i++)
|
|
934 {
|
|
935 ss = i==0 ? se++ : ++se;
|
|
936 while (*se && *se!='\t') se++;
|
|
937 }
|
|
938 if ( i<=k ) return -1;
|
|
939 if ( k==l )
|
|
940 {
|
|
941 *from = *to = strtol(ss, &tmp, 10);
|
|
942 if ( tmp==ss ) return -1;
|
|
943 }
|
|
944 else
|
|
945 {
|
|
946 if ( k==ifrom )
|
|
947 *from = strtol(ss, &tmp, 10);
|
|
948 else
|
|
949 *to = strtol(ss, &tmp, 10);
|
|
950 if ( ss==tmp ) return -1;
|
|
951
|
|
952 for (i=k; i<l && *se; i++)
|
|
953 {
|
|
954 ss = ++se;
|
|
955 while (*se && *se!='\t') se++;
|
|
956 }
|
|
957 if ( i<l ) return -1;
|
|
958 if ( k==ifrom )
|
|
959 *to = strtol(ss, &tmp, 10);
|
|
960 else
|
|
961 *from = strtol(ss, &tmp, 10);
|
|
962 if ( ss==tmp ) return -1;
|
|
963 }
|
|
964
|
|
965 ss = se = line;
|
|
966 for (i=0; i<=ichr && *se; i++)
|
|
967 {
|
|
968 if ( i>0 ) ss = ++se;
|
|
969 while (*se && *se!='\t') se++;
|
|
970 }
|
|
971 if ( i<=ichr ) return -1;
|
|
972 *chr_end = se;
|
|
973 *chr = ss;
|
|
974 return 1;
|
|
975 }
|
|
976
|
|
977 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
|
|
978 {
|
|
979 bcf_sr_regions_t *reg;
|
|
980 if ( !is_file ) return _regions_init_string(regions);
|
|
981
|
|
982 reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
|
|
983 reg->start = reg->end = -1;
|
|
984 reg->prev_start = reg->prev_seq = -1;
|
|
985
|
|
986 reg->file = hts_open(regions, "rb");
|
|
987 if ( !reg->file )
|
|
988 {
|
|
989 fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
|
|
990 free(reg);
|
|
991 return NULL;
|
|
992 }
|
|
993
|
|
994 reg->tbx = tbx_index_load(regions);
|
|
995 if ( !reg->tbx )
|
|
996 {
|
|
997 int len = strlen(regions);
|
|
998 int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1;
|
|
999 if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
|
|
1000
|
|
1001 if ( reg->file->format.format==vcf ) ito = 1;
|
|
1002
|
|
1003 // read the whole file, tabix index is not present
|
|
1004 while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 )
|
|
1005 {
|
|
1006 char *chr, *chr_end;
|
|
1007 int from, to, ret;
|
|
1008 ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
|
|
1009 if ( ret < 0 )
|
|
1010 {
|
|
1011 if ( ito<0 )
|
|
1012 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
|
|
1013 if ( ret<0 )
|
|
1014 {
|
|
1015 fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
|
|
1016 hts_close(reg->file); reg->file = NULL; free(reg);
|
|
1017 return NULL;
|
|
1018 }
|
|
1019 }
|
|
1020 if ( !ret ) continue;
|
|
1021 if ( is_bed ) from++;
|
|
1022 *chr_end = 0;
|
|
1023 _regions_add(reg, chr, from, to);
|
|
1024 *chr_end = '\t';
|
|
1025 }
|
|
1026 hts_close(reg->file); reg->file = NULL;
|
|
1027 if ( !reg->nseqs ) { free(reg); return NULL; }
|
|
1028 return reg;
|
|
1029 }
|
|
1030
|
|
1031 reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs);
|
|
1032 if ( !reg->seq_hash )
|
|
1033 reg->seq_hash = khash_str2int_init();
|
|
1034 int i;
|
|
1035 for (i=0; i<reg->nseqs; i++)
|
|
1036 {
|
|
1037 khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
|
|
1038 }
|
|
1039 reg->fname = strdup(regions);
|
|
1040 reg->is_bin = 1;
|
|
1041 return reg;
|
|
1042 }
|
|
1043
|
|
1044 void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
|
|
1045 {
|
|
1046 int i;
|
|
1047 free(reg->fname);
|
|
1048 if ( reg->itr ) tbx_itr_destroy(reg->itr);
|
|
1049 if ( reg->tbx ) tbx_destroy(reg->tbx);
|
|
1050 if ( reg->file ) hts_close(reg->file);
|
|
1051 if ( reg->als ) free(reg->als);
|
|
1052 if ( reg->als_str.s ) free(reg->als_str.s);
|
|
1053 free(reg->line.s);
|
|
1054 if ( reg->regs )
|
|
1055 {
|
|
1056 // free only in-memory names, tbx names are const
|
|
1057 for (i=0; i<reg->nseqs; i++)
|
|
1058 {
|
|
1059 free(reg->seq_names[i]);
|
|
1060 free(reg->regs[i].regs);
|
|
1061 }
|
|
1062 }
|
|
1063 free(reg->regs);
|
|
1064 free(reg->seq_names);
|
|
1065 khash_str2int_destroy(reg->seq_hash);
|
|
1066 free(reg);
|
|
1067 }
|
|
1068
|
|
1069 int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
|
|
1070 {
|
|
1071 reg->iseq = reg->start = reg->end = -1;
|
|
1072 if ( khash_str2int_get(reg->seq_hash, seq, ®->iseq) < 0 ) return -1; // sequence seq not in regions
|
|
1073
|
|
1074 // using in-memory regions
|
|
1075 if ( reg->regs )
|
|
1076 {
|
|
1077 reg->regs[reg->iseq].creg = -1;
|
|
1078 return 0;
|
|
1079 }
|
|
1080
|
|
1081 // reading regions from tabix
|
|
1082 if ( reg->itr ) tbx_itr_destroy(reg->itr);
|
|
1083 reg->itr = tbx_itr_querys(reg->tbx, seq);
|
|
1084 if ( reg->itr ) return 0;
|
|
1085
|
|
1086 return -1;
|
|
1087 }
|
|
1088
|
|
1089 int bcf_sr_regions_next(bcf_sr_regions_t *reg)
|
|
1090 {
|
|
1091 if ( reg->iseq<0 ) return -1;
|
|
1092 reg->start = reg->end = -1;
|
|
1093 reg->nals = 0;
|
|
1094
|
|
1095 // using in-memory regions
|
|
1096 if ( reg->regs )
|
|
1097 {
|
|
1098 while ( reg->iseq < reg->nseqs )
|
|
1099 {
|
|
1100 reg->regs[reg->iseq].creg++;
|
|
1101 if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
|
|
1102 reg->iseq++;
|
|
1103 }
|
|
1104 if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
|
|
1105 region1_t *creg = ®->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
|
|
1106 reg->start = creg->start;
|
|
1107 reg->end = creg->end;
|
|
1108 return 0;
|
|
1109 }
|
|
1110
|
|
1111 // reading from tabix
|
|
1112 char *chr, *chr_end;
|
|
1113 int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
|
|
1114 if ( reg->tbx )
|
|
1115 {
|
|
1116 ichr = reg->tbx->conf.sc-1;
|
|
1117 ifrom = reg->tbx->conf.bc-1;
|
|
1118 ito = reg->tbx->conf.ec-1;
|
|
1119 if ( ito<0 ) ito = ifrom;
|
|
1120 is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
|
|
1121 }
|
|
1122
|
|
1123 int ret = 0;
|
|
1124 while ( !ret )
|
|
1125 {
|
|
1126 if ( reg->itr )
|
|
1127 {
|
|
1128 // tabix index present, reading a chromosome block
|
|
1129 ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, ®->line);
|
|
1130 if ( ret<0 ) { reg->iseq = -1; return -1; }
|
|
1131 }
|
|
1132 else
|
|
1133 {
|
|
1134 if ( reg->is_bin )
|
|
1135 {
|
|
1136 // Waited for seek which never came. Reopen in text mode and stream
|
|
1137 // through the regions, otherwise hts_getline would fail
|
|
1138 hts_close(reg->file);
|
|
1139 reg->file = hts_open(reg->fname, "r");
|
|
1140 if ( !reg->file )
|
|
1141 {
|
|
1142 fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname);
|
|
1143 reg->file = NULL;
|
|
1144 bcf_sr_regions_destroy(reg);
|
|
1145 return -1;
|
|
1146 }
|
|
1147 reg->is_bin = 0;
|
|
1148 }
|
|
1149
|
|
1150 // tabix index absent, reading the whole file
|
|
1151 ret = hts_getline(reg->file, KS_SEP_LINE, ®->line);
|
|
1152 if ( ret<0 ) { reg->iseq = -1; return -1; }
|
|
1153 }
|
|
1154 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
|
|
1155 if ( ret<0 )
|
|
1156 {
|
|
1157 fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1);
|
|
1158 return -1;
|
|
1159 }
|
|
1160 }
|
|
1161 if ( is_bed ) from++;
|
|
1162
|
|
1163 *chr_end = 0;
|
|
1164 if ( khash_str2int_get(reg->seq_hash, chr, ®->iseq)<0 )
|
|
1165 {
|
|
1166 fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s);
|
|
1167 exit(1);
|
|
1168 }
|
|
1169 *chr_end = '\t';
|
|
1170
|
|
1171 reg->start = from - 1;
|
|
1172 reg->end = to - 1;
|
|
1173 return 0;
|
|
1174 }
|
|
1175
|
|
1176 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
|
|
1177 {
|
|
1178 int i = 0, max_len = 0;
|
|
1179 if ( !reg->nals )
|
|
1180 {
|
|
1181 char *ss = reg->line.s;
|
|
1182 while ( i<als_idx && *ss )
|
|
1183 {
|
|
1184 if ( *ss=='\t' ) i++;
|
|
1185 ss++;
|
|
1186 }
|
|
1187 char *se = ss;
|
|
1188 reg->nals = 1;
|
|
1189 while ( *se && *se!='\t' )
|
|
1190 {
|
|
1191 if ( *se==',' ) reg->nals++;
|
|
1192 se++;
|
|
1193 }
|
|
1194 ks_resize(®->als_str, se-ss+1+reg->nals);
|
|
1195 reg->als_str.l = 0;
|
|
1196 hts_expand(char*,reg->nals,reg->mals,reg->als);
|
|
1197 reg->nals = 0;
|
|
1198
|
|
1199 se = ss;
|
|
1200 while ( *(++se) )
|
|
1201 {
|
|
1202 if ( *se=='\t' ) break;
|
|
1203 if ( *se!=',' ) continue;
|
|
1204 reg->als[reg->nals] = ®->als_str.s[reg->als_str.l];
|
|
1205 kputsn(ss,se-ss,®->als_str);
|
|
1206 if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals];
|
|
1207 reg->als_str.l++;
|
|
1208 reg->nals++;
|
|
1209 ss = ++se;
|
|
1210 }
|
|
1211 reg->als[reg->nals] = ®->als_str.s[reg->als_str.l];
|
|
1212 kputsn(ss,se-ss,®->als_str);
|
|
1213 if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals];
|
|
1214 reg->nals++;
|
|
1215 reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types
|
|
1216 }
|
|
1217 int type = bcf_get_variant_types(rec);
|
|
1218 if ( reg->als_type & VCF_INDEL )
|
|
1219 return type & VCF_INDEL ? 1 : 0;
|
|
1220 return !(type & VCF_INDEL) ? 1 : 0;
|
|
1221 }
|
|
1222
|
|
1223 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
|
|
1224 {
|
|
1225 int iseq;
|
|
1226 if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence
|
|
1227
|
|
1228 if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek
|
|
1229 {
|
|
1230 // flush regions left on previous chromosome
|
|
1231 if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 )
|
|
1232 bcf_sr_regions_flush(reg);
|
|
1233
|
|
1234 bcf_sr_regions_seek(reg, seq);
|
|
1235 reg->start = reg->end = -1;
|
|
1236 }
|
|
1237 if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
|
|
1238 reg->prev_seq = reg->iseq;
|
|
1239 reg->prev_start = start;
|
|
1240
|
|
1241 while ( iseq==reg->iseq && reg->end < start )
|
|
1242 {
|
|
1243 if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left
|
|
1244 if ( reg->iseq != iseq ) return -1; // does not overlap any regions
|
|
1245 if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data);
|
|
1246 }
|
|
1247 if ( reg->start <= end ) return 0; // region overlap
|
|
1248 return -1; // no overlap
|
|
1249 }
|
|
1250
|
|
1251 void bcf_sr_regions_flush(bcf_sr_regions_t *reg)
|
|
1252 {
|
|
1253 if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return;
|
|
1254 while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data);
|
|
1255 return;
|
|
1256 }
|
|
1257
|