Mercurial > repos > thondeboer > neat_genreads
diff genReads.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genReads.py Tue May 15 02:39:53 2018 -0400 @@ -0,0 +1,794 @@ +#!/usr/bin/env python +# encoding: utf-8 +""" //////////////////////////////////////////////////////////////////////////////// + /// /// + /// genReads.py /// + /// VERSION 2.0: HARDER, BETTER, FASTER, STRONGER! /// +/////// ////// + /// Variant and read simulator for benchmarking NGS workflows /// + /// /// + /// Written by: Zach Stephens /// +/////// For: DEPEND Research Group, UIUC /////// + /// Date: May 29, 2015 /// + /// Contact: zstephe2@illinois.edu /// + /// /// +/////////////////////////////////////////////////////////////////////////////// """ + +import os +import sys +import copy +import random +import re +import time +import bisect +import cPickle as pickle +import numpy as np +#import matplotlib.pyplot as mpl +import argparse + +# absolute path to this script +SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-1]) +sys.path.append(SIM_PATH+'/py/') + +from inputChecking import requiredField, checkFileOpen, checkDir, isInRange +from refFunc import indexRef, readRef, getAllRefRegions, partitionRefRegions, ALLOWED_NUCL +from vcfFunc import parseVCF +from OutputFileWriter import OutputFileWriter +from probability import DiscreteDistribution, mean_ind_of_weighted_list +from SequenceContainer import SequenceContainer, ReadContainer, parseInputMutationModel + +# if coverage val for a given window/position is below this value, consider it effectively zero. +LOW_COV_THRESH = 50 + +"""////////////////////////////////////////////////// +//////////// PARSE INPUT ARGUMENTS //////////// +//////////////////////////////////////////////////""" + + +parser = argparse.ArgumentParser(description='NEAT-genReads V2.0') +parser.add_argument('-r', type=str, required=True, metavar='<str>', help="* ref.fa") +parser.add_argument('-R', type=int, required=True, metavar='<int>', help="* read length") +parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output prefix") +parser.add_argument('-c', type=float, required=False, metavar='<float>', default=10., help="average coverage") +parser.add_argument('-e', type=str, required=False, metavar='<str>', default=None, help="sequencing error model") +parser.add_argument('-E', type=float, required=False, metavar='<float>', default=-1, help="rescale avg sequencing error rate to this") +parser.add_argument('-p', type=int, required=False, metavar='<int>', default=2, help="ploidy") +parser.add_argument('-t', type=str, required=False, metavar='<str>', default=None, help="bed file containing targeted regions") +parser.add_argument('-to',type=float, required=False, metavar='<float>', default=0.00, help="off-target coverage scalar") +parser.add_argument('-m', type=str, required=False, metavar='<str>', default=None, help="mutation model pickle file") +parser.add_argument('-M', type=float, required=False, metavar='<float>', default=-1, help="rescale avg mutation rate to this") +parser.add_argument('-Mb',type=str, required=False, metavar='<str>', default=None, help="bed file containing positional mut rates") +parser.add_argument('-N', type=int, required=False, metavar='<int>', default=-1, help="below this qual, replace base-calls with 'N's") +#parser.add_argument('-s', type=str, required=False, metavar='<str>', default=None, help="input sample model") +parser.add_argument('-v', type=str, required=False, metavar='<str>', default=None, help="input VCF file") + +parser.add_argument('--pe', nargs=2, type=int, required=False, metavar=('<int>','<int>'), default=(None,None), help='paired-end fragment length mean and std') +parser.add_argument('--pe-model', type=str, required=False, metavar='<str>', default=None, help='empirical fragment length distribution') +#parser.add_argument('--cancer', required=False, action='store_true', default=False, help='produce tumor/normal datasets') +#parser.add_argument('-cm', type=str, required=False, metavar='<str>', default=None, help="cancer mutation model directory") +#parser.add_argument('-cp', type=float, required=False, metavar='<float>', default=0.8, help="tumor sample purity") +parser.add_argument('--gc-model', type=str, required=False, metavar='<str>', default=None, help='empirical GC coverage bias distribution') +parser.add_argument('--job', nargs=2, type=int, required=False, metavar=('<int>','<int>'), default=(0,0), help='jobs IDs for generating reads in parallel') +parser.add_argument('--nnr', required=False, action='store_true', default=False, help='save non-N ref regions (for parallel jobs)') +parser.add_argument('--bam', required=False, action='store_true', default=False, help='output golden BAM file') +parser.add_argument('--vcf', required=False, action='store_true', default=False, help='output golden VCF file') +parser.add_argument('--rng', type=int, required=False, metavar='<int>', default=-1, help='rng seed value; identical RNG value should produce identical runs of the program, so things like read locations, variant positions, error positions, etc, should all be the same.') +parser.add_argument('--gz', required=False, action='store_true', default=False, help='gzip output FQ and VCF') +parser.add_argument('--no-fastq', required=False, action='store_true', default=False, help='bypass fastq generation') +args = parser.parse_args() + +# required args +(REFERENCE, READLEN, OUT_PREFIX) = (args.r, args.R, args.o) +# various dataset parameters +(COVERAGE, PLOIDS, INPUT_BED, SE_MODEL, SE_RATE, MUT_MODEL, MUT_RATE, MUT_BED, INPUT_VCF) = (args.c, args.p, args.t, args.e, args.E, args.m, args.M, args.Mb, args.v) +# cancer params (disabled currently) +#(CANCER, CANCER_MODEL, CANCER_PURITY) = (args.cancer, args.cm, args.cp) +(CANCER, CANCER_MODEL, CANCER_PURITY) = (False, None, 0.8) +(OFFTARGET_SCALAR) = (args.to) +# important flags +(SAVE_BAM, SAVE_VCF, GZIPPED_OUT, NO_FASTQ) = (args.bam, args.vcf, args.gz, args.no_fastq) + +ONLY_VCF = (NO_FASTQ and SAVE_VCF and not(SAVE_BAM)) +if ONLY_VCF: + print 'Only producing VCF output, that should speed things up a bit...' + +# sequencing model parameters +(FRAGMENT_SIZE, FRAGMENT_STD) = args.pe +FRAGLEN_MODEL = args.pe_model +GC_BIAS_MODEL = args.gc_model +N_MAX_QUAL = args.N + +# if user specified no fastq, no bam, no vcf, then inform them of their wasteful ways and exit +if NO_FASTQ == True and SAVE_BAM == False and SAVE_VCF == False: + print '\nError: No files will be written when --no-fastq is specified without --vcf or --bam.' + exit(1) + +# if user specified mean/std, use artificial fragment length distribution, otherwise use +# the empirical model specified. If neither, then we're doing single-end reads. +PAIRED_END = False +PAIRED_END_ARTIFICIAL = False +if FRAGMENT_SIZE != None and FRAGMENT_STD != None: + PAIRED_END = True + PAIRED_END_ARTIFICIAL = True +elif FRAGLEN_MODEL != None: + PAIRED_END = True + PAIRED_END_ARTIFICIAL = False + +(MYJOB, NJOBS) = args.job +if MYJOB == 0: + MYJOB = 1 + NJOBS = 1 +SAVE_NON_N = args.nnr + +RNG_SEED = args.rng +if RNG_SEED == -1: + RNG_SEED = random.randint(1,99999999) +random.seed(RNG_SEED) + + +"""************************************************ +**** INPUT ERROR CHECKING +************************************************""" + + +checkFileOpen(REFERENCE,'ERROR: could not open reference',required=True) +checkFileOpen(INPUT_VCF,'ERROR: could not open input VCF',required=False) +checkFileOpen(INPUT_BED,'ERROR: could not open input BED',required=False) +requiredField(OUT_PREFIX,'ERROR: no output prefix provided') +if (FRAGMENT_SIZE == None and FRAGMENT_STD != None) or (FRAGMENT_SIZE != None and FRAGMENT_STD == None): + print '\nError: --pe argument takes 2 space-separated arguments.\n' + exit(1) + + +"""************************************************ +**** LOAD INPUT MODELS +************************************************""" + + +# mutation models +# +MUT_MODEL = parseInputMutationModel(MUT_MODEL,1) +if CANCER: + CANCER_MODEL = parseInputMutationModel(CANCER_MODEL,2) +if MUT_RATE < 0.: + MUT_RATE = None + +# sequencing error model +# +if SE_RATE < 0.: + SE_RATE = None +if SE_MODEL == None: + print 'Using default sequencing error model.' + SE_MODEL = SIM_PATH+'/models/errorModel_toy.p' + SE_CLASS = ReadContainer(READLEN,SE_MODEL,SE_RATE) +else: + # probably need to do some sanity checking + SE_CLASS = ReadContainer(READLEN,SE_MODEL,SE_RATE) + +# GC-bias model +# +if GC_BIAS_MODEL == None: + print 'Using default gc-bias model.' + GC_BIAS_MODEL = SIM_PATH+'/models/gcBias_toy.p' + [GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb')) + GC_WINDOW_SIZE = GC_SCALE_COUNT[-1] +else: + [GC_SCALE_COUNT, GC_SCALE_VAL] = pickle.load(open(GC_BIAS_MODEL,'rb')) + GC_WINDOW_SIZE = GC_SCALE_COUNT[-1] + +# fragment length distribution +# +if PAIRED_END and not(PAIRED_END_ARTIFICIAL): + print 'Using empirical fragment length distribution.' + [potential_vals, potential_prob] = pickle.load(open(FRAGLEN_MODEL,'rb')) + + FRAGLEN_VALS = [] + FRAGLEN_PROB = [] + for i in xrange(len(potential_vals)): + if potential_vals[i] > READLEN: + FRAGLEN_VALS.append(potential_vals[i]) + FRAGLEN_PROB.append(potential_prob[i]) + # should probably add some validation and sanity-checking code here... + FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS) + FRAGMENT_SIZE = FRAGLEN_VALS[mean_ind_of_weighted_list(FRAGLEN_PROB)] + +# Indicate not writing FASTQ reads +# +if NO_FASTQ: + print 'Bypassing FASTQ generation...' + +"""************************************************ +**** HARD-CODED CONSTANTS +************************************************""" + + +# target window size for read sampling. how many times bigger than read/frag length +WINDOW_TARGET_SCALE = 100 +# sub-window size for read sampling windows. this is basically the finest resolution +# that can be obtained for targeted region boundaries and GC% bias +SMALL_WINDOW = 20 +# is the mutation model constant throughout the simulation? If so we can save a lot of time +CONSTANT_MUT_MODEL = True + + +"""************************************************ +**** DEFAULT MODELS +************************************************""" + + +# fragment length distribution: normal distribution that goes out to +- 6 standard deviations +if PAIRED_END and PAIRED_END_ARTIFICIAL: + print 'Using artificial fragment length distribution. mean='+str(FRAGMENT_SIZE)+', std='+str(FRAGMENT_STD) + if FRAGMENT_STD == 0: + FRAGLEN_DISTRIBUTION = DiscreteDistribution([1],[FRAGMENT_SIZE],degenerateVal=FRAGMENT_SIZE) + else: + potential_vals = range(FRAGMENT_SIZE-6*FRAGMENT_STD,FRAGMENT_SIZE+6*FRAGMENT_STD+1) + FRAGLEN_VALS = [] + for i in xrange(len(potential_vals)): + if potential_vals[i] > READLEN: + FRAGLEN_VALS.append(potential_vals[i]) + FRAGLEN_PROB = [np.exp(-(((n-float(FRAGMENT_SIZE))**2)/(2*(FRAGMENT_STD**2)))) for n in FRAGLEN_VALS] + FRAGLEN_DISTRIBUTION = DiscreteDistribution(FRAGLEN_PROB,FRAGLEN_VALS) + + +"""************************************************ +**** MORE INPUT ERROR CHECKING +************************************************""" + + +isInRange(READLEN, 10,1000000, 'Error: -R must be between 10 and 1,000,000') +isInRange(COVERAGE, 0,1000000, 'Error: -c must be between 0 and 1,000,000') +isInRange(PLOIDS, 1,100, 'Error: -p must be between 1 and 100') +isInRange(OFFTARGET_SCALAR, 0,1, 'Error: -to must be between 0 and 1') +if MUT_RATE != -1 and MUT_RATE != None: + isInRange(MUT_RATE, 0,0.3, 'Error: -M must be between 0 and 0.3') +if SE_RATE != -1 and SE_RATE != None: + isInRange(SE_RATE, 0,0.3, 'Error: -E must be between 0 and 0.3') +if NJOBS != 1: + isInRange(NJOBS, 1,1000, 'Error: --job must be between 1 and 1,000') + isInRange(MYJOB, 1,1000, 'Error: --job must be between 1 and 1,000') + isInRange(MYJOB, 1,NJOBS, 'Error: job id must be less than or equal to number of jobs') +if N_MAX_QUAL != -1: + isInRange(N_MAX_QUAL, 1,40, 'Error: -N must be between 1 and 40') + + +"""************************************************ +**** MAIN() +************************************************""" + + +def main(): + + # index reference + refIndex = indexRef(REFERENCE) + if PAIRED_END: + N_HANDLING = ('random',FRAGMENT_SIZE) + else: + N_HANDLING = ('ignore',READLEN) + indices_by_refName = {refIndex[n][0]:n for n in xrange(len(refIndex))} + + # parse input variants, if present + inputVariants = [] + if INPUT_VCF != None: + if CANCER: + (sampNames, inputVariants) = parseVCF(INPUT_VCF,tumorNormal=True,ploidy=PLOIDS) + tumorInd = sampNames.index('TUMOR') + normalInd = sampNames.index('NORMAL') + else: + (sampNames, inputVariants) = parseVCF(INPUT_VCF,ploidy=PLOIDS) + for k in sorted(inputVariants.keys()): + inputVariants[k].sort() + + # parse input targeted regions, if present + inputRegions = {} + refList = [n[0] for n in refIndex] + if INPUT_BED != None: + with open(INPUT_BED,'r') as f: + for line in f: + [myChr,pos1,pos2] = line.strip().split('\t')[:3] + if myChr not in inputRegions: + inputRegions[myChr] = [-1] + inputRegions[myChr].extend([int(pos1),int(pos2)]) + # some validation + nInBedOnly = 0 + nInRefOnly = 0 + for k in refList: + if k not in inputRegions: + nInRefOnly += 1 + for k in inputRegions.keys(): + if not k in refList: + nInBedOnly += 1 + del inputRegions[k] + if nInRefOnly > 0: + print 'Warning: Reference contains sequences not found in targeted regions BED file.' + if nInBedOnly > 0: + print 'Warning: Targeted regions BED file contains sequence names not found in reference (regions ignored).' + + + # parse input mutation rate rescaling regions, if present + mutRateRegions = {} + mutRateValues = {} + if MUT_BED != None: + with open(MUT_BED,'r') as f: + for line in f: + [myChr,pos1,pos2,metaData] = line.strip().split('\t')[:4] + mutStr = re.findall(r"MUT_RATE=.*?(?=;)",metaData+';') + (pos1,pos2) = (int(pos1),int(pos2)) + if len(mutStr) and (pos2-pos1) > 1: + # mutRate = #_mutations / length_of_region, let's bound it by a reasonable amount + mutRate = max([0.0,min([float(mutStr[0][9:]),0.3])]) + if myChr not in mutRateRegions: + mutRateRegions[myChr] = [-1] + mutRateValues[myChr] = [0.0] + mutRateRegions[myChr].extend([pos1,pos2]) + mutRateValues.extend([mutRate*(pos2-pos1)]*2) + + # initialize output files (part I) + bamHeader = None + if SAVE_BAM: + bamHeader = [copy.deepcopy(refIndex)] + vcfHeader = None + if SAVE_VCF: + vcfHeader = [REFERENCE] + + # If processing jobs in parallel, precompute the independent regions that can be process separately + if NJOBS > 1: + parallelRegionList = getAllRefRegions(REFERENCE,refIndex,N_HANDLING,saveOutput=SAVE_NON_N) + (myRefs, myRegions) = partitionRefRegions(parallelRegionList,refIndex,MYJOB,NJOBS) + if not len(myRegions): + print 'This job id has no regions to process, exiting...' + exit(1) + for i in xrange(len(refIndex)-1,-1,-1): # delete reference not used in our job + if not refIndex[i][0] in myRefs: + del refIndex[i] + # if value of NJOBS is too high, let's change it to the maximum possible, to avoid output filename confusion + corrected_nJobs = min([NJOBS,sum([len(n) for n in parallelRegionList.values()])]) + else: + corrected_nJobs = 1 + + # initialize output files (part II) + if CANCER: + OFW = OutputFileWriter(OUT_PREFIX+'_normal',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,noFASTQ=NO_FASTQ) + OFW_CANCER = OutputFileWriter(OUT_PREFIX+'_tumor',paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ) + else: + OFW = OutputFileWriter(OUT_PREFIX,paired=PAIRED_END,BAM_header=bamHeader,VCF_header=vcfHeader,gzipped=GZIPPED_OUT,jobTuple=(MYJOB,corrected_nJobs),noFASTQ=NO_FASTQ) + OUT_PREFIX_NAME = OUT_PREFIX.split('/')[-1] + + + """************************************************ + **** LET'S GET THIS PARTY STARTED... + ************************************************""" + + + readNameCount = 1 # keep track of the number of reads we've sampled, for read-names + unmapped_records = [] + + for RI in xrange(len(refIndex)): + + # read in reference sequence and notate blocks of Ns + (refSequence,N_regions) = readRef(REFERENCE,refIndex[RI],N_HANDLING) + + # if we're processing jobs in parallel only take the regions relevant for the current job + if NJOBS > 1: + for i in xrange(len(N_regions['non_N'])-1,-1,-1): + if not (refIndex[RI][0],N_regions['non_N'][i][0],N_regions['non_N'][i][1]) in myRegions: + del N_regions['non_N'][i] + + # count total bp we'll be spanning so we can get an idea of how far along we are (for printing progress indicators) + total_bp_span = sum([n[1]-n[0] for n in N_regions['non_N']]) + currentProgress = 0 + currentPercent = 0 + + # prune invalid input variants, e.g variants that: + # - try to delete or alter any N characters + # - don't match the reference base at their specified position + # - any alt allele contains anything other than allowed characters + validVariants = [] + nSkipped = [0,0,0] + if refIndex[RI][0] in inputVariants: + for n in inputVariants[refIndex[RI][0]]: + span = (n[0],n[0]+len(n[1])) + rseq = str(refSequence[span[0]-1:span[1]-1]) # -1 because going from VCF coords to array coords + anyBadChr = any((nn not in ALLOWED_NUCL) for nn in [item for sublist in n[2] for item in sublist]) + if rseq != n[1]: + nSkipped[0] += 1 + continue + elif 'N' in rseq: + nSkipped[1] += 1 + continue + elif anyBadChr: + nSkipped[2] += 1 + continue + #if bisect.bisect(N_regions['big'],span[0])%2 or bisect.bisect(N_regions['big'],span[1])%2: + # continue + validVariants.append(n) + print 'found',len(validVariants),'valid variants for '+refIndex[RI][0]+' in input VCF...' + if any(nSkipped): + print sum(nSkipped),'variants skipped...' + print ' - ['+str(nSkipped[0])+'] ref allele does not match reference' + print ' - ['+str(nSkipped[1])+'] attempting to insert into N-region' + print ' - ['+str(nSkipped[2])+'] alt allele contains non-ACGT characters' + + + # add large random structural variants + # + # TBD!!! + + + # determine sampling windows based on read length, large N regions, and structural mutations. + # in order to obtain uniform coverage, windows should overlap by: + # - READLEN, if single-end reads + # - FRAGMENT_SIZE (mean), if paired-end reads + # ploidy is fixed per large sampling window, + # coverage distributions due to GC% and targeted regions are specified within these windows + samplingWindows = [] + ALL_VARIANTS_OUT = {} + sequences = None + if PAIRED_END: + targSize = WINDOW_TARGET_SCALE*FRAGMENT_SIZE + overlap = FRAGMENT_SIZE + overlap_minWindowSize = max(FRAGLEN_DISTRIBUTION.values) + 10 + else: + targSize = WINDOW_TARGET_SCALE*READLEN + overlap = READLEN + overlap_minWindowSize = READLEN + 10 + + print '--------------------------------' + if ONLY_VCF: + print 'generating vcf...' + else: + print 'sampling reads...' + tt = time.time() + + for i in xrange(len(N_regions['non_N'])): + (pi,pf) = N_regions['non_N'][i] + nTargWindows = max([1,(pf-pi)/targSize]) + bpd = int((pf-pi)/float(nTargWindows)) + #bpd += GC_WINDOW_SIZE - bpd%GC_WINDOW_SIZE + + #print len(refSequence), (pi,pf), nTargWindows + #print structuralVars + + # if for some reason our region is too small to process, skip it! (sorry) + if nTargWindows == 1 and (pf-pi) < overlap_minWindowSize: + #print 'Does this ever happen?' + continue + + start = pi + end = min([start+bpd,pf]) + #print '------------------RAWR:', (pi,pf), nTargWindows, bpd + varsFromPrevOverlap = [] + varsCancerFromPrevOverlap = [] + vindFromPrev = 0 + isLastTime = False + havePrinted100 = False + + while True: + + # which inserted variants are in this window? + varsInWindow = [] + updated = False + for j in xrange(vindFromPrev,len(validVariants)): + vPos = validVariants[j][0] + if vPos > start and vPos < end: # update: changed >= to >, so variant cannot be inserted in first position + varsInWindow.append(tuple([vPos-1]+list(validVariants[j][1:]))) # vcf --> array coords + if vPos >= end-overlap-1 and updated == False: + updated = True + vindFromPrev = j + if vPos >= end: + break + + # determine which structural variants will affect our sampling window positions + structuralVars = [] + for n in varsInWindow: + bufferNeeded = max([max([abs(len(n[1])-len(alt_allele)),1]) for alt_allele in n[2]]) # change: added abs() so that insertions are also buffered. + structuralVars.append((n[0]-1,bufferNeeded)) # -1 because going from VCF coords to array coords + + # adjust end-position of window based on inserted structural mutations + buffer_added = 0 + keepGoing = True + while keepGoing: + keepGoing = False + for n in structuralVars: + # adding "overlap" here to prevent SVs from being introduced in overlap regions + # (which can cause problems if random mutations from the previous window land on top of them) + delta = (end-1) - (n[0] + n[1]) - 2 - overlap + if delta < 0: + #print 'DELTA:', delta, 'END:', end, '-->', + buffer_added = -delta + end += buffer_added + ####print end + keepGoing = True + break + next_start = end-overlap + next_end = min([next_start+bpd,pf]) + if next_end-next_start < bpd: + end = next_end + isLastTime = True + + # print progress indicator + ####print 'PROCESSING WINDOW:',(start,end), [buffer_added], 'next:', (next_start,next_end) + currentProgress += end-start + newPercent = int((currentProgress*100)/float(total_bp_span)) + if newPercent > currentPercent: + if newPercent <= 99 or (newPercent == 100 and not havePrinted100): + sys.stdout.write(str(newPercent)+'% ') + sys.stdout.flush() + currentPercent = newPercent + if currentPercent == 100: + havePrinted100 = True + + skip_this_window = False + + # compute coverage modifiers + coverage_avg = None + coverage_dat = [GC_WINDOW_SIZE,GC_SCALE_VAL,[]] + if INPUT_BED == None: + coverage_dat[2] = [1.0]*(end-start) + else: + if refIndex[RI][0] not in inputRegions: + coverage_dat[2] = [OFFTARGET_SCALAR]*(end-start) + else: + for j in xrange(start,end): + if not(bisect.bisect(inputRegions[refIndex[RI][0]],j)%2): + coverage_dat[2].append(1.0) + else: + coverage_dat[2].append(OFFTARGET_SCALAR) + + #print len(coverage_dat[2]), sum(coverage_dat[2]) + if sum(coverage_dat[2]) < LOW_COV_THRESH: + coverage_avg = 0.0 + skip_this_window = True + + # check for small window sizes + if (end-start) < overlap_minWindowSize: + skip_this_window = True + + if skip_this_window: + # skip window, save cpu time + start = next_start + end = next_end + if isLastTime: + break + if end >= pf: + isLastTime = True + continue + + ##### skip windows if we can + ####skip_this_window = False + ####if INPUT_BED != None and OFFTARGET_SCALAR < 1.0e-12: + #### if refIndex[RI][0] in inputRegions: + #### skip_this_window = True + #### else: + #### skip_this_window = True + ####if skip_this_window: + #### # prepare indices of next window + #### start = next_start + #### end = next_end + #### if isLastTime: + #### break + #### if end >= pf: + #### isLastTime = True + #### continue + + + ##### if computing only VCF, we can skip this... + ####if ONLY_VCF: + #### coverage_dat = None + #### coverage_avg = None + ####else: + #### # pre-compute gc-bias and targeted sequencing coverage modifiers + #### nSubWindows = (end-start)/GC_WINDOW_SIZE + #### coverage_dat = (GC_WINDOW_SIZE,[]) + #### for j in xrange(nSubWindows): + #### rInd = start + j*GC_WINDOW_SIZE + #### if INPUT_BED == None: + #### tCov = True + #### elif refIndex[RI][0] in inputRegions: + #### tCov = not(bisect.bisect(inputRegions[refIndex[RI][0]],rInd)%2) or not(bisect.bisect(inputRegions[refIndex[RI][0]],rInd+GC_WINDOW_SIZE)%2) + #### else: + #### tCov = False + #### if tCov: + #### tScl = 1.0 + #### else: + #### tScl = OFFTARGET_SCALAR + #### gc_v = refSequence[rInd:rInd+GC_WINDOW_SIZE].count('G') + refSequence[rInd:rInd+GC_WINDOW_SIZE].count('C') + #### gScl = GC_SCALE_VAL[gc_v] + #### coverage_dat[1].append(1.0*tScl*gScl) + #### coverage_avg = np.mean(coverage_dat[1]) + #### + ##### pre-compute mutation rate tracks + ##### PROVIDED MUTATION RATES OVERRIDE AVERAGE VALUE + + # construct sequence data that we will sample reads from + if sequences == None: + sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE,onlyVCF=ONLY_VCF) + else: + sequences.update(start,refSequence[start:end],PLOIDS,overlap,READLEN,[MUT_MODEL]*PLOIDS,MUT_RATE) + + # some inserted variant debugging... + ####print '\n',start,end,end-overlap,'\n',varsFromPrevOverlap,'\n',varsInWindow,'\n' + + # insert variants + sequences.insert_mutations(varsFromPrevOverlap + varsInWindow) + all_inserted_variants = sequences.random_mutations() + #print all_inserted_variants + + # init coverage + if sum(coverage_dat[2]) >= LOW_COV_THRESH: + if PAIRED_END: + coverage_avg = sequences.init_coverage(tuple(coverage_dat),fragDist=FRAGLEN_DISTRIBUTION) + else: + coverage_avg = sequences.init_coverage(tuple(coverage_dat)) + + # unused cancer stuff + if CANCER: + tumor_sequences = SequenceContainer(start,refSequence[start:end],PLOIDS,overlap,READLEN,[CANCER_MODEL]*PLOIDS,MUT_RATE,coverage_dat) + tumor_sequences.insert_mutations(varsCancerFromPrevOverlap + all_inserted_variants) + all_cancer_variants = tumor_sequences.random_mutations() + + # which variants do we need to keep for next time (because of window overlap)? + varsFromPrevOverlap = [] + varsCancerFromPrevOverlap = [] + for n in all_inserted_variants: + if n[0] >= end-overlap-1: + varsFromPrevOverlap.append(n) + if CANCER: + for n in all_cancer_variants: + if n[0] >= end-overlap-1: + varsCancerFromPrevOverlap.append(n) + + # if we're only producing VCF, no need to go through the hassle of generating reads + if ONLY_VCF: + pass + else: + if PAIRED_END: + readsToSample = int(((end-start)*float(COVERAGE)*coverage_avg)/(2*READLEN))+1 + else: + readsToSample = int(((end-start)*float(COVERAGE)*coverage_avg)/(READLEN))+1 + + # if coverage is so low such that no reads are to be sampled, skip region + # (i.e., remove buffer of +1 reads we add to every window) + if readsToSample == 1 and sum(coverage_dat[2]) < LOW_COV_THRESH: + readsToSample = 0 + + # sample reads + ASDF2_TT = time.time() + for i in xrange(readsToSample): + + isUnmapped = [] + if PAIRED_END: + myFraglen = FRAGLEN_DISTRIBUTION.sample() + myReadData = sequences.sample_read(SE_CLASS,myFraglen) + if myReadData == None: # skip if we failed to find a valid position to sample read + continue + if myReadData[0][0] == None: + isUnmapped.append(True) + else: + isUnmapped.append(False) + myReadData[0][0] += start # adjust mapping position based on window start + if myReadData[1][0] == None: + isUnmapped.append(True) + else: + isUnmapped.append(False) + myReadData[1][0] += start + else: + myReadData = sequences.sample_read(SE_CLASS) + if myReadData == None: # skip if we failed to find a valid position to sample read + continue + if myReadData[0][0] == None: # unmapped read (lives in large insertion) + isUnmapped = [True] + else: + isUnmapped = [False] + myReadData[0][0] += start # adjust mapping position based on window start + + if NJOBS > 1: + myReadName = OUT_PREFIX_NAME+'-j'+str(MYJOB)+'-'+refIndex[RI][0]+'-r'+str(readNameCount) + else: + myReadName = OUT_PREFIX_NAME+'-'+refIndex[RI][0]+'-'+str(readNameCount) + readNameCount += len(myReadData) + + # if desired, replace all low-quality bases with Ns + if N_MAX_QUAL > -1: + for j in xrange(len(myReadData)): + myReadString = [n for n in myReadData[j][2]] + for k in xrange(len(myReadData[j][3])): + adjusted_qual = ord(myReadData[j][3][k])-SE_CLASS.offQ + if adjusted_qual <= N_MAX_QUAL: + myReadString[k] = 'N' + myReadData[j][2] = ''.join(myReadString) + + # if read (or read + mate for PE) are unmapped, put them at end of bam file + if all(isUnmapped): + if PAIRED_END: + unmapped_records.append((myReadName+'/1',myReadData[0],109)) + unmapped_records.append((myReadName+'/2',myReadData[1],157)) + else: + unmapped_records.append((myReadName+'/1',myReadData[0],4)) + + # write read data out to FASTQ and BAM files, bypass FASTQ if option specified + myRefIndex = indices_by_refName[refIndex[RI][0]] + if len(myReadData) == 1: + if NO_FASTQ != True: + OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3]) + if SAVE_BAM: + if isUnmapped[0] == False: + OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=0) + elif len(myReadData) == 2: + if NO_FASTQ != True: + OFW.writeFASTQRecord(myReadName,myReadData[0][2],myReadData[0][3],read2=myReadData[1][2],qual2=myReadData[1][3]) + if SAVE_BAM: + if isUnmapped[0] == False and isUnmapped[1] == False: + OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=99, matePos=myReadData[1][0]) + OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=147, matePos=myReadData[0][0]) + elif isUnmapped[0] == False and isUnmapped[1] == True: + OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[0][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=105, matePos=myReadData[0][0]) + OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[0][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=149, matePos=myReadData[0][0], alnMapQual=0) + elif isUnmapped[0] == True and isUnmapped[1] == False: + OFW.writeBAMRecord(myRefIndex, myReadName+'/1', myReadData[1][0], myReadData[0][1], myReadData[0][2], myReadData[0][3], samFlag=101, matePos=myReadData[1][0], alnMapQual=0) + OFW.writeBAMRecord(myRefIndex, myReadName+'/2', myReadData[1][0], myReadData[1][1], myReadData[1][2], myReadData[1][3], samFlag=153, matePos=myReadData[1][0]) + else: + print '\nError: Unexpected number of reads generated...\n' + exit(1) + #print 'READS:',time.time()-ASDF2_TT + + if not isLastTime: + OFW.flushBuffers(bamMax=next_start) + else: + OFW.flushBuffers(bamMax=end+1) + + # tally up all the variants that got successfully introduced + for n in all_inserted_variants: + ALL_VARIANTS_OUT[n] = True + + # prepare indices of next window + start = next_start + end = next_end + if isLastTime: + break + if end >= pf: + isLastTime = True + + if currentPercent != 100 and not havePrinted100: + print '100%' + else: + print '' + if ONLY_VCF: + print 'VCF generation completed in', + else: + print 'Read sampling completed in', + print int(time.time()-tt),'(sec)' + + # write all output variants for this reference + if SAVE_VCF: + print 'Writing output VCF...' + for k in sorted(ALL_VARIANTS_OUT.keys()): + currentRef = refIndex[RI][0] + myID = '.' + myQual = '.' + myFilt = 'PASS' + # k[0] + 1 because we're going back to 1-based vcf coords + OFW.writeVCFRecord(currentRef, str(int(k[0])+1), myID, k[1], k[2], myQual, myFilt, k[4]) + + #break + + # write unmapped reads to bam file + if SAVE_BAM and len(unmapped_records): + print 'writing unmapped reads to bam file...' + for umr in unmapped_records: + if PAIRED_END: + OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], matePos=0, alnMapQual=0) + else: + OFW.writeBAMRecord(-1, umr[0], 0, umr[1][1], umr[1][2], umr[1][3], samFlag=umr[2], alnMapQual=0) + + # close output files + OFW.closeFiles() + if CANCER: + OFW_CANCER.closeFiles() + + +if __name__ == '__main__': + main() + + +
