Mercurial > repos > ulfschaefer > vcfs2fasta
comparison vcfs2fasta.py @ 22:96f393ad7fc6 draft default tip
Uploaded
| author | ulfschaefer |
|---|---|
| date | Wed, 23 Dec 2015 04:50:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 21:b09ffe50c378 | 22:96f393ad7fc6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ''' | |
| 3 Merge SNP data from multiple VCF files into a single fasta file. | |
| 4 | |
| 5 Created on 5 Oct 2015 | |
| 6 | |
| 7 @author: alex | |
| 8 ''' | |
| 9 import argparse | |
| 10 from collections import OrderedDict | |
| 11 from collections import defaultdict | |
| 12 import glob | |
| 13 import itertools | |
| 14 import logging | |
| 15 import os | |
| 16 | |
| 17 from Bio import SeqIO | |
| 18 from bintrees import FastRBTree | |
| 19 | |
| 20 # Try importing the matplotlib and numpy for stats. | |
| 21 try: | |
| 22 from matplotlib import pyplot as plt | |
| 23 import numpy | |
| 24 can_stats = True | |
| 25 except ImportError: | |
| 26 can_stats = False | |
| 27 | |
| 28 import vcf | |
| 29 | |
| 30 from phe.variant_filters import IUPAC_CODES | |
| 31 | |
| 32 | |
| 33 def plot_stats(pos_stats, total_samples, plots_dir="plots", discarded={}): | |
| 34 if not os.path.exists(plots_dir): | |
| 35 os.makedirs(plots_dir) | |
| 36 | |
| 37 for contig in pos_stats: | |
| 38 plt.style.use('ggplot') | |
| 39 | |
| 40 x = numpy.array([pos for pos in pos_stats[contig] if pos not in discarded.get(contig, [])]) | |
| 41 y = numpy.array([ float(pos_stats[contig][pos]["mut"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, []) ]) | |
| 42 | |
| 43 f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True) | |
| 44 f.set_size_inches(12, 15) | |
| 45 ax1.plot(x, y, 'ro') | |
| 46 ax1.set_title("Fraction of samples with SNPs") | |
| 47 plt.ylim(0, 1.1) | |
| 48 | |
| 49 y = numpy.array([ float(pos_stats[contig][pos]["N"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])]) | |
| 50 ax2.plot(x, y, 'bo') | |
| 51 ax2.set_title("Fraction of samples with Ns") | |
| 52 | |
| 53 y = numpy.array([ float(pos_stats[contig][pos]["mix"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])]) | |
| 54 ax3.plot(x, y, 'go') | |
| 55 ax3.set_title("Fraction of samples with mixed bases") | |
| 56 | |
| 57 y = numpy.array([ float(pos_stats[contig][pos]["gap"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])]) | |
| 58 ax4.plot(x, y, 'yo') | |
| 59 ax4.set_title("Fraction of samples with uncallable genotype (gap)") | |
| 60 | |
| 61 contig = contig.replace("/", "-") | |
| 62 plt.savefig(os.path.join(plots_dir, "%s.png" % contig), dpi=100) | |
| 63 | |
| 64 def get_mixture(record, threshold): | |
| 65 mixtures = {} | |
| 66 try: | |
| 67 if len(record.samples[0].data.AD) > 1: | |
| 68 | |
| 69 total_depth = sum(record.samples[0].data.AD) | |
| 70 # Go over all combinations of touples. | |
| 71 for comb in itertools.combinations(range(0, len(record.samples[0].data.AD)), 2): | |
| 72 i = comb[0] | |
| 73 j = comb[1] | |
| 74 | |
| 75 alleles = list() | |
| 76 | |
| 77 if 0 in comb: | |
| 78 alleles.append(str(record.REF)) | |
| 79 | |
| 80 if i != 0: | |
| 81 alleles.append(str(record.ALT[i - 1])) | |
| 82 mixture = record.samples[0].data.AD[i] | |
| 83 if j != 0: | |
| 84 alleles.append(str(record.ALT[j - 1])) | |
| 85 mixture = record.samples[0].data.AD[j] | |
| 86 | |
| 87 ratio = float(mixture) / total_depth | |
| 88 if ratio == 1.0: | |
| 89 logging.debug("This is only designed for mixtures! %s %s %s %s", record, ratio, record.samples[0].data.AD, record.FILTER) | |
| 90 | |
| 91 if ratio not in mixtures: | |
| 92 mixtures[ratio] = [] | |
| 93 mixtures[ratio].append(alleles.pop()) | |
| 94 | |
| 95 elif ratio >= threshold: | |
| 96 try: | |
| 97 code = IUPAC_CODES[frozenset(alleles)] | |
| 98 if ratio not in mixtures: | |
| 99 mixtures[ratio] = [] | |
| 100 mixtures[ratio].append(code) | |
| 101 except KeyError: | |
| 102 logging.warn("Could not retrieve IUPAC code for %s from %s", alleles, record) | |
| 103 except AttributeError: | |
| 104 mixtures = {} | |
| 105 | |
| 106 return mixtures | |
| 107 | |
| 108 def print_stats(stats, pos_stats, total_vars): | |
| 109 for contig in stats: | |
| 110 for sample, info in stats[contig].items(): | |
| 111 print "%s,%i,%i" % (sample, len(info.get("n_pos", [])), total_vars) | |
| 112 | |
| 113 for contig in stats: | |
| 114 for pos, info in pos_stats[contig].iteritems(): | |
| 115 print "%s,%i,%i,%i,%i" % (contig, pos, info.get("N", "NA"), info.get("-", "NA"), info.get("mut", "NA")) | |
| 116 | |
| 117 | |
| 118 def get_args(): | |
| 119 args = argparse.ArgumentParser(description="Combine multiple VCFs into a single FASTA file.") | |
| 120 | |
| 121 group = args.add_mutually_exclusive_group(required=True) | |
| 122 group.add_argument("--directory", "-d", help="Path to the directory with .vcf files.") | |
| 123 group.add_argument("--input", "-i", type=str, nargs='+', help="List of VCF files to process.") | |
| 124 | |
| 125 args.add_argument("--out", "-o", required=True, help="Path to the output FASTA file.") | |
| 126 | |
| 127 args.add_argument("--with-mixtures", type=float, help="Specify this option with a threshold to output mixtures above this threshold.") | |
| 128 | |
| 129 args.add_argument("--column-Ns", type=float, help="Keeps columns with fraction of Ns above specified threshold.") | |
| 130 | |
| 131 args.add_argument("--sample-Ns", type=float, help="Keeps samples with fraction of Ns above specified threshold.") | |
| 132 | |
| 133 args.add_argument("--reference", type=str, help="If path to reference specified (FASTA), then whole genome will be written.") | |
| 134 | |
| 135 group = args.add_mutually_exclusive_group() | |
| 136 | |
| 137 group.add_argument("--include") | |
| 138 group.add_argument("--exclude") | |
| 139 | |
| 140 args.add_argument("--with-stats", help="If a path is specified, then position of the outputed SNPs is stored in this file. Requires mumpy and matplotlib.") | |
| 141 args.add_argument("--plots-dir", default="plots", help="Where to write summary plots on SNPs extracted. Requires mumpy and matplotlib.") | |
| 142 | |
| 143 args.add_argument("--debug", action="store_true", help="More verbose logging (default: turned off).") | |
| 144 args.add_argument("--local", action="store_true", help="Re-read the VCF instead of storing it in memory.") | |
| 145 | |
| 146 return args.parse_args() | |
| 147 | |
| 148 def main(): | |
| 149 """ | |
| 150 Process VCF files and merge them into a single fasta file. | |
| 151 """ | |
| 152 args = get_args() | |
| 153 | |
| 154 logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) | |
| 155 | |
| 156 contigs = list() | |
| 157 | |
| 158 sample_stats = dict() | |
| 159 | |
| 160 # All positions available for analysis. | |
| 161 avail_pos = dict() | |
| 162 # Stats about each position in each chromosome. | |
| 163 pos_stats = dict() | |
| 164 indel_summary = defaultdict(int) | |
| 165 # Cached version of the data. | |
| 166 vcf_data = dict() | |
| 167 mixtures = dict() | |
| 168 | |
| 169 empty_tree = FastRBTree() | |
| 170 | |
| 171 exclude = False | |
| 172 include = False | |
| 173 | |
| 174 if args.reference: | |
| 175 ref_seq = OrderedDict() | |
| 176 with open(args.reference) as fp: | |
| 177 for record in SeqIO.parse(fp, "fasta"): | |
| 178 ref_seq[record.id] = str(record.seq) | |
| 179 | |
| 180 args.reference = ref_seq | |
| 181 | |
| 182 if args.exclude or args.include: | |
| 183 pos = {} | |
| 184 chr_pos = [] | |
| 185 bed_file = args.include if args.include is not None else args.exclude | |
| 186 | |
| 187 with open(bed_file) as fp: | |
| 188 for line in fp: | |
| 189 data = line.strip().split("\t") | |
| 190 | |
| 191 chr_pos += [ (i, False,) for i in xrange(int(data[1]), int(data[2]) + 1)] | |
| 192 | |
| 193 if data[0] not in pos: | |
| 194 pos[data[0]] = [] | |
| 195 | |
| 196 pos[data[0]] += chr_pos | |
| 197 | |
| 198 | |
| 199 pos = {chrom: FastRBTree(l) for chrom, l in pos.items()} | |
| 200 | |
| 201 if args.include: | |
| 202 include = pos | |
| 203 else: | |
| 204 exclude = pos | |
| 205 | |
| 206 | |
| 207 if args.directory is not None and args.input is None: | |
| 208 args.input = glob.glob(os.path.join(args.directory, "*.filtered.vcf")) | |
| 209 | |
| 210 # First pass to get the references and the positions to be analysed. | |
| 211 for vcf_in in args.input: | |
| 212 sample_name, _ = os.path.splitext(os.path.basename(vcf_in)) | |
| 213 vcf_data[vcf_in] = list() | |
| 214 reader = vcf.Reader(filename=vcf_in) | |
| 215 | |
| 216 for record in reader: | |
| 217 if include and include.get(record.CHROM, empty_tree).get(record.POS, True) or exclude and not exclude.get(record.CHROM, empty_tree).get(record.POS, True): | |
| 218 continue | |
| 219 | |
| 220 if not args.local: | |
| 221 vcf_data[vcf_in].append(record) | |
| 222 | |
| 223 if record.CHROM not in contigs: | |
| 224 contigs.append(record.CHROM) | |
| 225 avail_pos[record.CHROM] = FastRBTree() | |
| 226 mixtures[record.CHROM] = {} | |
| 227 sample_stats[record.CHROM] = {} | |
| 228 | |
| 229 if sample_name not in mixtures[record.CHROM]: | |
| 230 mixtures[record.CHROM][sample_name] = FastRBTree() | |
| 231 | |
| 232 if sample_name not in sample_stats[record.CHROM]: | |
| 233 sample_stats[record.CHROM][sample_name] = {} | |
| 234 | |
| 235 if not record.FILTER: | |
| 236 if record.is_snp: | |
| 237 if record.POS in avail_pos[record.CHROM] and avail_pos[record.CHROM][record.POS] != record.REF: | |
| 238 logging.critical("SOMETHING IS REALLY WRONG because reference for the same position is DIFFERENT! %s in %s", record.POS, vcf_in) | |
| 239 return 2 | |
| 240 | |
| 241 if record.CHROM not in pos_stats: | |
| 242 pos_stats[record.CHROM] = {} | |
| 243 | |
| 244 avail_pos[record.CHROM].insert(record.POS, str(record.REF)) | |
| 245 pos_stats[record.CHROM][record.POS] = {"N":0, "-": 0, "mut": 0, "mix": 0, "gap": 0} | |
| 246 | |
| 247 elif args.with_mixtures and record.is_snp: | |
| 248 mix = get_mixture(record, args.with_mixtures) | |
| 249 | |
| 250 for ratio, code in mix.items(): | |
| 251 for c in code: | |
| 252 avail_pos[record.CHROM].insert(record.POS, str(record.REF)) | |
| 253 if record.CHROM not in pos_stats: | |
| 254 pos_stats[record.CHROM] = {} | |
| 255 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0} | |
| 256 | |
| 257 if sample_name not in mixtures[record.CHROM]: | |
| 258 mixtures[record.CHROM][sample_name] = FastRBTree() | |
| 259 | |
| 260 mixtures[record.CHROM][sample_name].insert(record.POS, c) | |
| 261 elif not record.is_deletion and not record.is_indel: | |
| 262 if record.CHROM not in pos_stats: | |
| 263 pos_stats[record.CHROM] = {} | |
| 264 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0} | |
| 265 avail_pos[record.CHROM].insert(record.POS, str(record.REF)) | |
| 266 else: | |
| 267 logging.debug("Discarding %s from %s as DEL and/or INDEL", record.POS, vcf_in) | |
| 268 indel_summary[vcf_in] += 1 | |
| 269 try: | |
| 270 vcf_data[vcf_in].remove(record) | |
| 271 except ValueError: | |
| 272 pass | |
| 273 | |
| 274 | |
| 275 all_data = { contig: {} for contig in contigs} | |
| 276 samples = [] | |
| 277 | |
| 278 for vcf_in in args.input: | |
| 279 | |
| 280 sample_seq = "" | |
| 281 sample_name, _ = os.path.splitext(os.path.basename(vcf_in)) | |
| 282 samples.append(sample_name) | |
| 283 | |
| 284 # Initialise the data for this sample to be REF positions. | |
| 285 for contig in contigs: | |
| 286 all_data[contig][sample_name] = { pos: avail_pos[contig][pos] for pos in avail_pos[contig] } | |
| 287 | |
| 288 # Re-read data from VCF if local is specified, otherwise get it from memory. | |
| 289 iterator = vcf.Reader(filename=vcf_in) if args.local else vcf_data[vcf_in] | |
| 290 for record in iterator: | |
| 291 # Array of filters that have been applied. | |
| 292 filters = [] | |
| 293 | |
| 294 # If position is our available position. | |
| 295 if avail_pos.get(record.CHROM, empty_tree).get(record.POS, False): | |
| 296 if not record.FILTER: | |
| 297 if record.is_snp: | |
| 298 if len(record.ALT) > 1: | |
| 299 logging.info("POS %s passed filters but has multiple alleles. Inserting N") | |
| 300 all_data[record.CHROM][sample_name][record.POS] = "N" | |
| 301 else: | |
| 302 all_data[record.CHROM][sample_name][record.POS] = record.ALT[0].sequence | |
| 303 pos_stats[record.CHROM][record.POS]["mut"] += 1 | |
| 304 else: | |
| 305 | |
| 306 # Currently we are only using first filter to call consensus. | |
| 307 extended_code = mixtures[record.CHROM][sample_name].get(record.POS, "N") | |
| 308 | |
| 309 # extended_code = PHEFilterBase.call_concensus(record) | |
| 310 | |
| 311 # Calculate the stats | |
| 312 if extended_code == "N": | |
| 313 pos_stats[record.CHROM][record.POS]["N"] += 1 | |
| 314 | |
| 315 if "n_pos" not in sample_stats[record.CHROM][sample_name]: | |
| 316 sample_stats[record.CHROM][sample_name]["n_pos"] = [] | |
| 317 sample_stats[record.CHROM][sample_name]["n_pos"].append(record.POS) | |
| 318 | |
| 319 elif extended_code == "-": | |
| 320 pos_stats[record.CHROM][record.POS]["-"] += 1 | |
| 321 else: | |
| 322 pos_stats[record.CHROM][record.POS]["mix"] += 1 | |
| 323 # print "Good mixture %s: %i (%s)" % (sample_name, record.POS, extended_code) | |
| 324 # Record if there was uncallable genoty/gap in the data. | |
| 325 if record.samples[0].data.GT == "./.": | |
| 326 pos_stats[record.CHROM][record.POS]["gap"] += 1 | |
| 327 | |
| 328 # Save the extended code of the SNP. | |
| 329 all_data[record.CHROM][sample_name][record.POS] = extended_code | |
| 330 del vcf_data[vcf_in] | |
| 331 | |
| 332 # Output the data to the fasta file. | |
| 333 # The data is already aligned so simply output it. | |
| 334 discarded = {} | |
| 335 | |
| 336 if args.reference: | |
| 337 # These should be in the same order as the order in reference. | |
| 338 contigs = args.reference.keys() | |
| 339 | |
| 340 if args.sample_Ns: | |
| 341 delete_samples = [] | |
| 342 for contig in contigs: | |
| 343 for sample in samples: | |
| 344 | |
| 345 # Skip if the contig not in sample_stats | |
| 346 if contig not in sample_stats: | |
| 347 continue | |
| 348 | |
| 349 sample_n_ratio = float(len(sample_stats[contig][sample]["n_pos"])) / len(avail_pos[contig]) | |
| 350 if sample_n_ratio > args.sample_Ns: | |
| 351 for pos in sample_stats[contig][sample]["n_pos"]: | |
| 352 pos_stats[contig][pos]["N"] -= 1 | |
| 353 | |
| 354 logging.info("Removing %s due to high Ns in sample: %s", sample , sample_n_ratio) | |
| 355 | |
| 356 delete_samples.append(sample) | |
| 357 | |
| 358 samples = [sample for sample in samples if sample not in delete_samples] | |
| 359 snp_positions = [] | |
| 360 with open(args.out, "w") as fp: | |
| 361 | |
| 362 for sample in samples: | |
| 363 sample_seq = "" | |
| 364 for contig in contigs: | |
| 365 if contig in avail_pos: | |
| 366 if args.reference: | |
| 367 positions = xrange(1, len(args.reference[contig]) + 1) | |
| 368 else: | |
| 369 positions = avail_pos[contig].keys() | |
| 370 for pos in positions: | |
| 371 if pos in avail_pos[contig]: | |
| 372 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \ | |
| 373 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns: | |
| 374 sample_seq += all_data[contig][sample][pos] | |
| 375 else: | |
| 376 if contig not in discarded: | |
| 377 discarded[contig] = [] | |
| 378 discarded[contig].append(pos) | |
| 379 elif args.reference: | |
| 380 sample_seq += args.reference[contig][pos - 1] | |
| 381 elif args.reference: | |
| 382 sample_seq += args.reference[contig] | |
| 383 | |
| 384 fp.write(">%s\n%s\n" % (sample, sample_seq)) | |
| 385 # Do the same for reference data. | |
| 386 ref_snps = "" | |
| 387 | |
| 388 for contig in contigs: | |
| 389 if contig in avail_pos: | |
| 390 if args.reference: | |
| 391 positions = xrange(1, len(args.reference[contig]) + 1) | |
| 392 else: | |
| 393 positions = avail_pos[contig].keys() | |
| 394 for pos in positions: | |
| 395 if pos in avail_pos[contig]: | |
| 396 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \ | |
| 397 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns: | |
| 398 | |
| 399 ref_snps += str(avail_pos[contig][pos]) | |
| 400 snp_positions.append((contig, pos,)) | |
| 401 elif args.reference: | |
| 402 ref_snps += args.reference[contig][pos - 1] | |
| 403 elif args.reference: | |
| 404 ref_snps += args.reference[contig] | |
| 405 | |
| 406 fp.write(">reference\n%s\n" % ref_snps) | |
| 407 | |
| 408 if can_stats and args.with_stats: | |
| 409 with open(args.with_stats, "wb") as fp: | |
| 410 fp.write("contig\tposition\tmutations\tn_frac\n") | |
| 411 for values in snp_positions: | |
| 412 fp.write("%s\t%s\t%s\t%s\n" % (values[0], | |
| 413 values[1], | |
| 414 float(pos_stats[values[0]][values[1]]["mut"]) / len(args.input), | |
| 415 float(pos_stats[values[0]][values[1]]["N"]) / len(args.input))) | |
| 416 plot_stats(pos_stats, len(samples), discarded=discarded, plots_dir=os.path.abspath(args.plots_dir)) | |
| 417 # print_stats(sample_stats, pos_stats, total_vars=len(avail_pos[contig])) | |
| 418 | |
| 419 total_discarded = 0 | |
| 420 for _, i in discarded.items(): | |
| 421 total_discarded += len(i) | |
| 422 logging.info("Discarded total of %i poor quality columns", float(total_discarded) / len(args.input)) | |
| 423 logging.info("Samples with indels:") | |
| 424 for sample, count in indel_summary.iteritems(): | |
| 425 logging.info("%s\t%s", sample, count) | |
| 426 return 0 | |
| 427 | |
| 428 if __name__ == '__main__': | |
| 429 import time | |
| 430 | |
| 431 # with PyCallGraph(output=graphviz): | |
| 432 # T0 = time.time() | |
| 433 r = main() | |
| 434 # T1 = time.time() | |
| 435 | |
| 436 # print "Time taken: %i" % (T1 - T0) | |
| 437 exit(r) |
