Mercurial > repos > xuebing > sharplabtool
comparison tools/ngs_simulation/ngs_simulation.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 Runs Ben's simulation. | |
| 5 | |
| 6 usage: %prog [options] | |
| 7 -i, --input=i: Input genome (FASTA format) | |
| 8 -g, --genome=g: If built-in, the genome being used | |
| 9 -l, --read_len=l: Read length | |
| 10 -c, --avg_coverage=c: Average coverage | |
| 11 -e, --error_rate=e: Error rate (0-1) | |
| 12 -n, --num_sims=n: Number of simulations to run | |
| 13 -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1) | |
| 14 -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1) | |
| 15 -p, --output_png=p: Plot output | |
| 16 -s, --summary_out=s: Whether or not to output a file with summary of all simulations | |
| 17 -m, --output_summary=m: File name for output summary of all simulations | |
| 18 -f, --new_file_path=f: Directory for summary output files | |
| 19 | |
| 20 """ | |
| 21 # removed output of all simulation results on request (not working) | |
| 22 # -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds) | |
| 23 # -o, --output=o: Base name for summary output for each run | |
| 24 | |
| 25 from rpy import * | |
| 26 import os | |
| 27 import random, sys, tempfile | |
| 28 from galaxy import eggs | |
| 29 import pkg_resources; pkg_resources.require( "bx-python" ) | |
| 30 from bx.cookbook import doc_optparse | |
| 31 | |
| 32 def stop_err( msg ): | |
| 33 sys.stderr.write( '%s\n' % msg ) | |
| 34 sys.exit() | |
| 35 | |
| 36 def __main__(): | |
| 37 #Parse Command Line | |
| 38 options, args = doc_optparse.parse( __doc__ ) | |
| 39 # validate parameters | |
| 40 error = '' | |
| 41 try: | |
| 42 read_len = int( options.read_len ) | |
| 43 if read_len <= 0: | |
| 44 raise Exception, ' greater than 0' | |
| 45 except TypeError, e: | |
| 46 error = ': %s' % str( e ) | |
| 47 if error: | |
| 48 stop_err( 'Make sure your number of reads is an integer value%s' % error ) | |
| 49 error = '' | |
| 50 try: | |
| 51 avg_coverage = int( options.avg_coverage ) | |
| 52 if avg_coverage <= 0: | |
| 53 raise Exception, ' greater than 0' | |
| 54 except Exception, e: | |
| 55 error = ': %s' % str( e ) | |
| 56 if error: | |
| 57 stop_err( 'Make sure your average coverage is an integer value%s' % error ) | |
| 58 error = '' | |
| 59 try: | |
| 60 error_rate = float( options.error_rate ) | |
| 61 if error_rate >= 1.0: | |
| 62 error_rate = 10 ** ( -error_rate / 10.0 ) | |
| 63 elif error_rate < 0: | |
| 64 raise Exception, ' between 0 and 1' | |
| 65 except Exception, e: | |
| 66 error = ': %s' % str( e ) | |
| 67 if error: | |
| 68 stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error ) | |
| 69 try: | |
| 70 num_sims = int( options.num_sims ) | |
| 71 except TypeError, e: | |
| 72 stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) ) | |
| 73 if len( options.polymorphism ) > 0: | |
| 74 polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ] | |
| 75 else: | |
| 76 stop_err( 'Select at least one polymorphism value to use' ) | |
| 77 if len( options.detection_thresh ) > 0: | |
| 78 detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ] | |
| 79 else: | |
| 80 stop_err( 'Select at least one detection threshold to use' ) | |
| 81 | |
| 82 # mutation dictionaries | |
| 83 hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary | |
| 84 mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary | |
| 85 | |
| 86 # read fasta file to seq string | |
| 87 all_lines = open( options.input, 'rb' ).readlines() | |
| 88 seq = '' | |
| 89 for line in all_lines: | |
| 90 line = line.rstrip() | |
| 91 if line.startswith('>'): | |
| 92 pass | |
| 93 else: | |
| 94 seq += line.upper() | |
| 95 seq_len = len( seq ) | |
| 96 | |
| 97 # output file name template | |
| 98 # removed output of all simulation results on request (not working) | |
| 99 # if options.sim_results == "true": | |
| 100 # out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' ) | |
| 101 # else: | |
| 102 # out_name_template = tempfile.NamedTemporaryFile().name + '_%s' | |
| 103 out_name_template = tempfile.NamedTemporaryFile().name + '_%s' | |
| 104 print 'out_name_template:', out_name_template | |
| 105 | |
| 106 # set up output files | |
| 107 outputs = {} | |
| 108 i = 1 | |
| 109 for p in polymorphisms: | |
| 110 outputs[ p ] = {} | |
| 111 for d in detection_threshes: | |
| 112 outputs[ p ][ d ] = out_name_template % i | |
| 113 i += 1 | |
| 114 | |
| 115 # run sims | |
| 116 for polymorphism in polymorphisms: | |
| 117 for detection_thresh in detection_threshes: | |
| 118 output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' ) | |
| 119 output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len ) | |
| 120 sim_count = 0 | |
| 121 while sim_count < num_sims: | |
| 122 # randomly pick heteroplasmic base index | |
| 123 hbase = random.choice( range( 0, seq_len ) ) | |
| 124 #hbase = seq_len/2#random.randrange( 0, seq_len ) | |
| 125 # create 2D quasispecies list | |
| 126 qspec = map( lambda x: [], [0] * seq_len ) | |
| 127 # simulate read indices and assign to quasispecies | |
| 128 i = 0 | |
| 129 while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage) | |
| 130 start = random.choice( range( 0, seq_len ) ) | |
| 131 #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start | |
| 132 if random.random() < 0.5: # positive sense read | |
| 133 end = start + read_len # assign read end | |
| 134 if end > seq_len: # overshooting origin | |
| 135 read = range( start, seq_len ) + range( 0, ( end - seq_len ) ) | |
| 136 else: # regular read | |
| 137 read = range( start, end ) | |
| 138 else: # negative sense read | |
| 139 end = start - read_len # assign read end | |
| 140 if end < -1: # overshooting origin | |
| 141 read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 ) | |
| 142 else: # regular read | |
| 143 read = range( start, end, -1 ) | |
| 144 # assign read to quasispecies list by index | |
| 145 for j in read: | |
| 146 if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het | |
| 147 ref = hp_dict[ seq[ j ] ] | |
| 148 else: # ref is the verbatim reference nucleotide (all positions) | |
| 149 ref = seq[ j ] | |
| 150 if random.random() < error_rate: # base in read is misread with p = err | |
| 151 qspec[ j ].append( mt_dict[ ref ] ) | |
| 152 else: # otherwise we carry ref through to the end | |
| 153 qspec[ j ].append(ref) | |
| 154 # last but not least | |
| 155 i += 1 | |
| 156 bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later | |
| 157 for i, nuc in enumerate( seq ): | |
| 158 cov = len( qspec[ i ] ) | |
| 159 bases[ 'A' ] = qspec[ i ].count( 'A' ) | |
| 160 bases[ 'C' ] = qspec[ i ].count( 'C' ) | |
| 161 bases[ 'G' ] = qspec[ i ].count( 'G' ) | |
| 162 bases[ 'T' ] = qspec[ i ].count( 'T' ) | |
| 163 # calculate max NON-REF deviation | |
| 164 del bases[ nuc ] | |
| 165 maxdev = float( max( bases.values() ) ) / cov | |
| 166 # deal with non-het sites | |
| 167 if i != hbase: | |
| 168 if maxdev >= detection_thresh: # greater than detection threshold = false positive | |
| 169 fpos += 1 | |
| 170 # deal with het sites | |
| 171 if i == hbase: | |
| 172 hnuc = hp_dict[ nuc ] # let's recover het variant | |
| 173 if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative | |
| 174 fneg += 1 | |
| 175 del bases[ hnuc ] # ignore het variant | |
| 176 maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site | |
| 177 if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible) | |
| 178 fpos += 1 | |
| 179 # output error sums and genome size to summary file | |
| 180 output.write( '%d\t%d\n' % ( fpos, fneg ) ) | |
| 181 sim_count += 1 | |
| 182 # close output up | |
| 183 output.close() | |
| 184 | |
| 185 # Parameters (heteroplasmy, error threshold, colours) | |
| 186 r( ''' | |
| 187 het=c(%s) | |
| 188 err=c(%s) | |
| 189 grade = (0:32)/32 | |
| 190 hues = rev(gray(grade)) | |
| 191 ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) ) | |
| 192 | |
| 193 # Suppress warnings | |
| 194 r( 'options(warn=-1)' ) | |
| 195 | |
| 196 # Create allsum (for FP) and allneg (for FN) objects | |
| 197 r( 'allsum <- data.frame()' ) | |
| 198 for polymorphism in polymorphisms: | |
| 199 for detection_thresh in detection_threshes: | |
| 200 output = outputs[ polymorphism ][ detection_thresh ] | |
| 201 cmd = ''' | |
| 202 ngsum = read.delim('%s', header=T) | |
| 203 ngsum$fprate <- ngsum$FP/%s | |
| 204 ngsum$hetcol <- %s | |
| 205 ngsum$errcol <- %s | |
| 206 allsum <- rbind(allsum, ngsum) | |
| 207 ''' % ( output, seq_len, polymorphism, detection_thresh ) | |
| 208 r( cmd ) | |
| 209 | |
| 210 if os.path.getsize( output ) == 0: | |
| 211 for p in outputs.keys(): | |
| 212 for d in outputs[ p ].keys(): | |
| 213 sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n') | |
| 214 | |
| 215 if options.summary_out == "true": | |
| 216 r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary ) | |
| 217 | |
| 218 # Summary objects (these could be printed) | |
| 219 r( ''' | |
| 220 tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean) | |
| 221 tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean) | |
| 222 cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t') | |
| 223 write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t') | |
| 224 cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t') | |
| 225 write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t') | |
| 226 ''' % tuple( [ options.output_summary ] * 4 ) ) | |
| 227 | |
| 228 # Setup graphs | |
| 229 #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10) | |
| 230 r( ''' | |
| 231 png('%s', width=800, height=500, units='px', res=250) | |
| 232 layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10)) | |
| 233 ''' % options.output_png ) | |
| 234 | |
| 235 # Main title | |
| 236 genome = '' | |
| 237 if options.genome: | |
| 238 genome = '%s: ' % options.genome | |
| 239 r( ''' | |
| 240 par(mar=c(0,0,0,0)) | |
| 241 plot(1, type='n', axes=F, xlab='', ylab='') | |
| 242 text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7) | |
| 243 ''' % ( genome, options.num_sims, options.avg_coverage ) ) | |
| 244 | |
| 245 # False positive boxplot | |
| 246 r( ''' | |
| 247 par(mar=c(5,4,2,2), las=1, cex=0.35) | |
| 248 boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85) | |
| 249 title(main='False Positives', xlab='false positive rate', ylab='') | |
| 250 ''' ) | |
| 251 | |
| 252 # False negative heatmap (note zlim command!) | |
| 253 num_polys = len( polymorphisms ) | |
| 254 num_dets = len( detection_threshes ) | |
| 255 r( ''' | |
| 256 par(mar=c(5,4,2,1), las=1, cex=0.35) | |
| 257 image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1) | |
| 258 axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i') | |
| 259 axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85) | |
| 260 title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold') | |
| 261 ''' % ( num_polys, num_dets, num_polys, num_dets ) ) | |
| 262 | |
| 263 # Scale alongside | |
| 264 r( ''' | |
| 265 par(mar=c(2,2,2,3), las=1) | |
| 266 image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85) | |
| 267 title(main='Key', cex=0.35) | |
| 268 mtext('false negative rate', side=1, cex=0.35) | |
| 269 ''' ) | |
| 270 | |
| 271 # Close graphics | |
| 272 r( ''' | |
| 273 layout(1) | |
| 274 dev.off() | |
| 275 ''' ) | |
| 276 | |
| 277 # Tidy up | |
| 278 # r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' ) | |
| 279 | |
| 280 if __name__ == "__main__" : __main__() |
