sharplabtool: tools/ngs_simulation/ngs

comparison tools/ngs_simulation/ngs_simulation.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:9071e359b9a3
+#!/usr/bin/env python
+"""
+Runs Ben's simulation.
+usage: %prog [options]
+-i, --input=i: Input genome (FASTA format)
+-g, --genome=g: If built-in, the genome being used
+-l, --read_len=l: Read length
+-c, --avg_coverage=c: Average coverage
+-e, --error_rate=e: Error rate (0-1)
+-n, --num_sims=n: Number of simulations to run
+-p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1)
+-d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1)
+-p, --output_png=p: Plot output
+-s, --summary_out=s: Whether or not to output a file with summary of all simulations
+-m, --output_summary=m: File name for output summary of all simulations
+-f, --new_file_path=f: Directory for summary output files
+"""
+# removed output of all simulation results on request (not working)
+#   -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
+#   -o, --output=o: Base name for summary output for each run
+from rpy import *
+import os
+import random, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+def stop_err( msg ):
+sys.stderr.write( '%s\n' % msg )
+sys.exit()
+def __main__():
+#Parse Command Line
+options, args = doc_optparse.parse( __doc__ )
+# validate parameters
+error = ''
+try:
+read_len = int( options.read_len )
+if read_len <= 0:
+raise Exception, ' greater than 0'
+except TypeError, e:
+error = ': %s' % str( e )
+if error:
+stop_err( 'Make sure your number of reads is an integer value%s' % error )
+error = ''
+try:
+avg_coverage = int( options.avg_coverage )
+if avg_coverage <= 0:
+raise Exception, ' greater than 0'
+except Exception, e:
+error = ': %s' % str( e )
+if error:
+stop_err( 'Make sure your average coverage is an integer value%s' % error )
+error = ''
+try:
+error_rate = float( options.error_rate )
+if error_rate >= 1.0:
+error_rate = 10 ** ( -error_rate / 10.0 )
+elif error_rate < 0:
+raise Exception, ' between 0 and 1'
+except Exception, e:
+error = ': %s' % str( e )
+if error:
+stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error )
+try:
+num_sims = int( options.num_sims )
+except TypeError, e:
+stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
+if len( options.polymorphism ) > 0:
+polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
+else:
+stop_err( 'Select at least one polymorphism value to use' )
+if len( options.detection_thresh ) > 0:
+detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
+else:
+stop_err( 'Select at least one detection threshold to use' )
+# mutation dictionaries
+hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary
+mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary
+# read fasta file to seq string
+all_lines = open( options.input, 'rb' ).readlines()
+seq = ''
+for line in all_lines:
+line = line.rstrip()
+if line.startswith('>'):
+pass
+else:
+seq += line.upper()
+seq_len = len( seq )
+# output file name template
+# removed output of all simulation results on request (not working)
+#    if options.sim_results == "true":
+#        out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' )
+#    else:
+#        out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
+out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
+print 'out_name_template:', out_name_template
+# set up output files
+outputs = {}
+i = 1
+for p in polymorphisms:
+outputs[ p ] = {}
+for d in detection_threshes:
+outputs[ p ][ d ] = out_name_template % i
+i += 1
+# run sims
+for polymorphism in polymorphisms:
+for detection_thresh in detection_threshes:
+output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' )
+output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len )
+sim_count = 0
+while sim_count < num_sims:
+# randomly pick heteroplasmic base index
+hbase = random.choice( range( 0, seq_len ) )
+#hbase = seq_len/2#random.randrange( 0, seq_len )
+# create 2D quasispecies list
+qspec = map( lambda x: [], [0] * seq_len )
+# simulate read indices and assign to quasispecies
+i = 0
+while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage)
+start = random.choice( range( 0, seq_len ) )
+#start = seq_len/2#random.randrange( 0, seq_len ) # assign read start
+if random.random() < 0.5: # positive sense read
+end = start + read_len # assign read end
+if end > seq_len: # overshooting origin
+read = range( start, seq_len ) + range( 0, ( end - seq_len ) )
+else: # regular read
+read = range( start, end )
+else: # negative sense read
+end = start - read_len # assign read end
+if end < -1: # overshooting origin
+read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 )
+else: # regular read
+read = range( start, end, -1 )
+# assign read to quasispecies list by index
+for j in read:
+if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het
+ref = hp_dict[ seq[ j ] ]
+else: # ref is the verbatim reference nucleotide (all positions)
+ref = seq[ j ]
+if random.random() < error_rate: # base in read is misread with p = err
+qspec[ j ].append( mt_dict[ ref ] )
+else: # otherwise we carry ref through to the end
+qspec[ j ].append(ref)
+# last but not least
+i += 1
+bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later
+for i, nuc in enumerate( seq ):
+cov = len( qspec[ i ] )
+bases[ 'A' ] = qspec[ i ].count( 'A' )
+bases[ 'C' ] = qspec[ i ].count( 'C' )
+bases[ 'G' ] = qspec[ i ].count( 'G' )
+bases[ 'T' ] = qspec[ i ].count( 'T' )
+# calculate max NON-REF deviation
+del bases[ nuc ]
+maxdev = float( max( bases.values() ) ) / cov
+# deal with non-het sites
+if i != hbase:
+if maxdev >= detection_thresh: # greater than detection threshold = false positive
+fpos += 1
+# deal with het sites
+if i == hbase:
+hnuc = hp_dict[ nuc ] # let's recover het variant
+if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative
+fneg += 1
+del bases[ hnuc ] # ignore het variant
+maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site
+if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible)
+fpos += 1
+# output error sums and genome size to summary file
+output.write( '%d\t%d\n' % ( fpos, fneg ) )
+sim_count += 1
+# close output up
+output.close()
+# Parameters (heteroplasmy, error threshold, colours)
+r( '''
+het=c(%s)
+err=c(%s)
+grade = (0:32)/32
+hues = rev(gray(grade))
+''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )
+# Suppress warnings
+r( 'options(warn=-1)' )
+# Create allsum (for FP) and allneg (for FN) objects
+r( 'allsum <- data.frame()' )
+for polymorphism in polymorphisms:
+for detection_thresh in detection_threshes:
+output = outputs[ polymorphism ][ detection_thresh ]
+cmd = '''
+ngsum = read.delim('%s', header=T)
+ngsum$fprate <- ngsum$FP/%s
+ngsum$hetcol <- %s
+ngsum$errcol <- %s
+allsum <- rbind(allsum, ngsum)
+''' % ( output, seq_len, polymorphism, detection_thresh )
+r( cmd )
+if os.path.getsize( output ) == 0:
+for p in outputs.keys():
+for d in outputs[ p ].keys():
+sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n')
+if options.summary_out == "true":
+r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )
+# Summary objects (these could be printed)
+r( '''
+tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
+tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
+cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
+write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
+cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
+write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
+''' % tuple( [ options.output_summary ] * 4 ) )
+# Setup graphs
+#pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10)
+r( '''
+png('%s', width=800, height=500, units='px', res=250)
+layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
+''' % options.output_png )
+# Main title
+genome = ''
+if options.genome:
+genome = '%s: ' % options.genome
+r( '''
+par(mar=c(0,0,0,0))
+plot(1, type='n', axes=F, xlab='', ylab='')
+text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
+''' % ( genome, options.num_sims, options.avg_coverage ) )
+# False positive boxplot
+r( '''
+par(mar=c(5,4,2,2), las=1, cex=0.35)
+boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
+title(main='False Positives', xlab='false positive rate', ylab='')
+''' )
+# False negative heatmap (note zlim command!)
+num_polys = len( polymorphisms )
+num_dets = len( detection_threshes )
+r( '''
+par(mar=c(5,4,2,1), las=1, cex=0.35)
+image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
+axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
+axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
+title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
+''' % ( num_polys, num_dets, num_polys, num_dets ) )
+# Scale alongside
+r( '''
+par(mar=c(2,2,2,3), las=1)
+image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
+title(main='Key', cex=0.35)
+mtext('false negative rate', side=1, cex=0.35)
+''' )
+# Close graphics
+r( '''
+layout(1)
+dev.off()
+''' )
+# Tidy up
+#    r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' )
+if __name__ == "__main__" : __main__()

Mercurial > repos > xuebing > sharplabtool

comparison tools/ngs_simulation/ngs_simulation.py @ 0:9071e359b9a3