sharplabtool: tools/sr_mapping/bwa

annotate tools/sr_mapping/bwa_wrapper.py @ 1:cdcb0ce84a1b

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:45:15 -0500
parents	9071e359b9a3
children

rev	line source
0 9071e359b9a3 Uploaded xuebing parents: diff changeset	1 #!/usr/bin/env python
9071e359b9a3 Uploaded xuebing parents: diff changeset	2
9071e359b9a3 Uploaded xuebing parents: diff changeset	3 """
9071e359b9a3 Uploaded xuebing parents: diff changeset	4 Runs BWA on single-end or paired-end data.
9071e359b9a3 Uploaded xuebing parents: diff changeset	5 Produces a SAM file containing the mappings.
9071e359b9a3 Uploaded xuebing parents: diff changeset	6 Works with BWA version 0.5.9.
9071e359b9a3 Uploaded xuebing parents: diff changeset	7
9071e359b9a3 Uploaded xuebing parents: diff changeset	8 usage: bwa_wrapper.py [options]
9071e359b9a3 Uploaded xuebing parents: diff changeset	9
9071e359b9a3 Uploaded xuebing parents: diff changeset	10 See below for options
9071e359b9a3 Uploaded xuebing parents: diff changeset	11 """
9071e359b9a3 Uploaded xuebing parents: diff changeset	12
9071e359b9a3 Uploaded xuebing parents: diff changeset	13 import optparse, os, shutil, subprocess, sys, tempfile
9071e359b9a3 Uploaded xuebing parents: diff changeset	14
9071e359b9a3 Uploaded xuebing parents: diff changeset	15 def stop_err( msg ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	16 sys.stderr.write( '%s\n' % msg )
9071e359b9a3 Uploaded xuebing parents: diff changeset	17 sys.exit()
9071e359b9a3 Uploaded xuebing parents: diff changeset	18
9071e359b9a3 Uploaded xuebing parents: diff changeset	19 def check_is_double_encoded( fastq ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	20 # check that first read is bases, not one base followed by numbers
9071e359b9a3 Uploaded xuebing parents: diff changeset	21 bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
9071e359b9a3 Uploaded xuebing parents: diff changeset	22 nums = [ '0', '1', '2', '3' ]
9071e359b9a3 Uploaded xuebing parents: diff changeset	23 for line in file( fastq, 'rb'):
9071e359b9a3 Uploaded xuebing parents: diff changeset	24 if not line.strip() or line.startswith( '@' ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	25 continue
9071e359b9a3 Uploaded xuebing parents: diff changeset	26 if len( [ b for b in line.strip() if b in nums ] ) > 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	27 return False
9071e359b9a3 Uploaded xuebing parents: diff changeset	28 elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	29 return True
9071e359b9a3 Uploaded xuebing parents: diff changeset	30 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	31 raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
9071e359b9a3 Uploaded xuebing parents: diff changeset	32 raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
9071e359b9a3 Uploaded xuebing parents: diff changeset	33
9071e359b9a3 Uploaded xuebing parents: diff changeset	34 def __main__():
9071e359b9a3 Uploaded xuebing parents: diff changeset	35 #Parse Command Line
9071e359b9a3 Uploaded xuebing parents: diff changeset	36 parser = optparse.OptionParser()
9071e359b9a3 Uploaded xuebing parents: diff changeset	37 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	38 parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	39 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	40 parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	41 parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	42 parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	43 parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	44 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	45 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	46 parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	47 parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	48 parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	49 parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	50 parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	51 parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	52 parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	53 parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	54 parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	55 parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	56 parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	57 parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', help='Proceed with suboptimal alignments even if the top hit is a repeat' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	58 parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	59 parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	60 parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	61 parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	62 parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	63 parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	64 parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	65 parser.add_option( '', '--rgds', dest='rgds', help='Description' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	66 parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	67 parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	68 parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	69 parser.add_option( '', '--rglb', dest='rglb', help='Library name' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	70 parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	71 parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	72 parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	73 parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	74 parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	75 parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	76 parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
9071e359b9a3 Uploaded xuebing parents: diff changeset	77 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	78 parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	79 (options, args) = parser.parse_args()
9071e359b9a3 Uploaded xuebing parents: diff changeset	80
9071e359b9a3 Uploaded xuebing parents: diff changeset	81 # output version # of tool
9071e359b9a3 Uploaded xuebing parents: diff changeset	82 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	83 tmp = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded xuebing parents: diff changeset	84 tmp_stdout = open( tmp, 'wb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	85 proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout )
9071e359b9a3 Uploaded xuebing parents: diff changeset	86 tmp_stdout.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	87 returncode = proc.wait()
9071e359b9a3 Uploaded xuebing parents: diff changeset	88 stdout = None
9071e359b9a3 Uploaded xuebing parents: diff changeset	89 for line in open( tmp_stdout.name, 'rb' ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	90 if line.lower().find( 'version' ) >= 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	91 stdout = line.strip()
9071e359b9a3 Uploaded xuebing parents: diff changeset	92 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	93 if stdout:
9071e359b9a3 Uploaded xuebing parents: diff changeset	94 sys.stdout.write( 'BWA %s\n' % stdout )
9071e359b9a3 Uploaded xuebing parents: diff changeset	95 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	96 raise Exception
9071e359b9a3 Uploaded xuebing parents: diff changeset	97 except:
9071e359b9a3 Uploaded xuebing parents: diff changeset	98 sys.stdout.write( 'Could not determine BWA version\n' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	99
9071e359b9a3 Uploaded xuebing parents: diff changeset	100 # check for color space fastq that's not double-encoded and exit if appropriate
9071e359b9a3 Uploaded xuebing parents: diff changeset	101 if options.color_space:
9071e359b9a3 Uploaded xuebing parents: diff changeset	102 if not check_is_double_encoded( options.fastq ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	103 stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	104 if options.genAlignType == 'paired':
9071e359b9a3 Uploaded xuebing parents: diff changeset	105 if not check_is_double_encoded( options.rfastq ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	106 stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	107
9071e359b9a3 Uploaded xuebing parents: diff changeset	108 fastq = options.fastq
9071e359b9a3 Uploaded xuebing parents: diff changeset	109 if options.rfastq:
9071e359b9a3 Uploaded xuebing parents: diff changeset	110 rfastq = options.rfastq
9071e359b9a3 Uploaded xuebing parents: diff changeset	111
9071e359b9a3 Uploaded xuebing parents: diff changeset	112 # set color space variable
9071e359b9a3 Uploaded xuebing parents: diff changeset	113 if options.color_space:
9071e359b9a3 Uploaded xuebing parents: diff changeset	114 color_space = '-c'
9071e359b9a3 Uploaded xuebing parents: diff changeset	115 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	116 color_space = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	117
9071e359b9a3 Uploaded xuebing parents: diff changeset	118 # make temp directory for placement of indices
9071e359b9a3 Uploaded xuebing parents: diff changeset	119 tmp_index_dir = tempfile.mkdtemp()
9071e359b9a3 Uploaded xuebing parents: diff changeset	120 tmp_dir = tempfile.mkdtemp()
9071e359b9a3 Uploaded xuebing parents: diff changeset	121 # index if necessary
9071e359b9a3 Uploaded xuebing parents: diff changeset	122 if options.fileSource == 'history' and not options.do_not_build_index:
9071e359b9a3 Uploaded xuebing parents: diff changeset	123 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	124 ref_file_name = ref_file.name
9071e359b9a3 Uploaded xuebing parents: diff changeset	125 ref_file.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	126 os.symlink( options.ref, ref_file_name )
9071e359b9a3 Uploaded xuebing parents: diff changeset	127 # determine which indexing algorithm to use, based on size
9071e359b9a3 Uploaded xuebing parents: diff changeset	128 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	129 size = os.stat( options.ref ).st_size
9071e359b9a3 Uploaded xuebing parents: diff changeset	130 if size <= 2**30:
9071e359b9a3 Uploaded xuebing parents: diff changeset	131 indexingAlg = 'is'
9071e359b9a3 Uploaded xuebing parents: diff changeset	132 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	133 indexingAlg = 'bwtsw'
9071e359b9a3 Uploaded xuebing parents: diff changeset	134 except:
9071e359b9a3 Uploaded xuebing parents: diff changeset	135 indexingAlg = 'is'
9071e359b9a3 Uploaded xuebing parents: diff changeset	136 indexing_cmds = '%s -a %s' % ( color_space, indexingAlg )
9071e359b9a3 Uploaded xuebing parents: diff changeset	137 cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
9071e359b9a3 Uploaded xuebing parents: diff changeset	138 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	139 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
9071e359b9a3 Uploaded xuebing parents: diff changeset	140 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	141 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded xuebing parents: diff changeset	142 returncode = proc.wait()
9071e359b9a3 Uploaded xuebing parents: diff changeset	143 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	144 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded xuebing parents: diff changeset	145 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	146 stderr = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	147 buffsize = 1048576
9071e359b9a3 Uploaded xuebing parents: diff changeset	148 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	149 while True:
9071e359b9a3 Uploaded xuebing parents: diff changeset	150 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded xuebing parents: diff changeset	151 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	152 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	153 except OverflowError:
9071e359b9a3 Uploaded xuebing parents: diff changeset	154 pass
9071e359b9a3 Uploaded xuebing parents: diff changeset	155 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	156 if returncode != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	157 raise Exception, stderr
9071e359b9a3 Uploaded xuebing parents: diff changeset	158 except Exception, e:
9071e359b9a3 Uploaded xuebing parents: diff changeset	159 # clean up temp dirs
9071e359b9a3 Uploaded xuebing parents: diff changeset	160 if os.path.exists( tmp_index_dir ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	161 shutil.rmtree( tmp_index_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	162 if os.path.exists( tmp_dir ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	163 shutil.rmtree( tmp_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	164 stop_err( 'Error indexing reference sequence. ' + str( e ) )
9071e359b9a3 Uploaded xuebing parents: diff changeset	165 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	166 ref_file_name = options.ref
9071e359b9a3 Uploaded xuebing parents: diff changeset	167 if options.illumina13qual:
9071e359b9a3 Uploaded xuebing parents: diff changeset	168 illumina_quals = "-I"
9071e359b9a3 Uploaded xuebing parents: diff changeset	169 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	170 illumina_quals = ""
9071e359b9a3 Uploaded xuebing parents: diff changeset	171
9071e359b9a3 Uploaded xuebing parents: diff changeset	172 # set up aligning and generate aligning command options
9071e359b9a3 Uploaded xuebing parents: diff changeset	173 if options.params == 'pre_set':
9071e359b9a3 Uploaded xuebing parents: diff changeset	174 aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals )
9071e359b9a3 Uploaded xuebing parents: diff changeset	175 gen_alignment_cmds = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	176 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	177 if options.maxEditDist != '0':
9071e359b9a3 Uploaded xuebing parents: diff changeset	178 editDist = options.maxEditDist
9071e359b9a3 Uploaded xuebing parents: diff changeset	179 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	180 editDist = options.fracMissingAligns
9071e359b9a3 Uploaded xuebing parents: diff changeset	181 if options.seed != '-1':
9071e359b9a3 Uploaded xuebing parents: diff changeset	182 seed = '-l %s' % options.seed
9071e359b9a3 Uploaded xuebing parents: diff changeset	183 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	184 seed = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	185 if options.suboptAlign == 'true':
9071e359b9a3 Uploaded xuebing parents: diff changeset	186 suboptAlign = '-R'
9071e359b9a3 Uploaded xuebing parents: diff changeset	187 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	188 suboptAlign = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	189 if options.noIterSearch == 'true':
9071e359b9a3 Uploaded xuebing parents: diff changeset	190 noIterSearch = '-N'
9071e359b9a3 Uploaded xuebing parents: diff changeset	191 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	192 noIterSearch = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	193 aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \
9071e359b9a3 Uploaded xuebing parents: diff changeset	194 ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
9071e359b9a3 Uploaded xuebing parents: diff changeset	195 options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
9071e359b9a3 Uploaded xuebing parents: diff changeset	196 options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
9071e359b9a3 Uploaded xuebing parents: diff changeset	197 suboptAlign, noIterSearch, color_space, illumina_quals )
9071e359b9a3 Uploaded xuebing parents: diff changeset	198 if options.genAlignType == 'paired':
9071e359b9a3 Uploaded xuebing parents: diff changeset	199 gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
9071e359b9a3 Uploaded xuebing parents: diff changeset	200 if options.outputTopNDisc:
9071e359b9a3 Uploaded xuebing parents: diff changeset	201 gen_alignment_cmds += ' -N %s' % options.outputTopNDisc
9071e359b9a3 Uploaded xuebing parents: diff changeset	202 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	203 gen_alignment_cmds = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	204 if options.rgid:
9071e359b9a3 Uploaded xuebing parents: diff changeset	205 if not options.rglb or not options.rgpl or not options.rgsm:
9071e359b9a3 Uploaded xuebing parents: diff changeset	206 stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	207 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
9071e359b9a3 Uploaded xuebing parents: diff changeset	208 if options.rgcn:
9071e359b9a3 Uploaded xuebing parents: diff changeset	209 readGroup += '\tCN:%s' % options.rgcn
9071e359b9a3 Uploaded xuebing parents: diff changeset	210 if options.rgds:
9071e359b9a3 Uploaded xuebing parents: diff changeset	211 readGroup += '\tDS:%s' % options.rgds
9071e359b9a3 Uploaded xuebing parents: diff changeset	212 if options.rgdt:
9071e359b9a3 Uploaded xuebing parents: diff changeset	213 readGroup += '\tDT:%s' % options.rgdt
9071e359b9a3 Uploaded xuebing parents: diff changeset	214 if options.rgfo:
9071e359b9a3 Uploaded xuebing parents: diff changeset	215 readGroup += '\tFO:%s' % options.rgfo
9071e359b9a3 Uploaded xuebing parents: diff changeset	216 if options.rgks:
9071e359b9a3 Uploaded xuebing parents: diff changeset	217 readGroup += '\tKS:%s' % options.rgks
9071e359b9a3 Uploaded xuebing parents: diff changeset	218 if options.rgpg:
9071e359b9a3 Uploaded xuebing parents: diff changeset	219 readGroup += '\tPG:%s' % options.rgpg
9071e359b9a3 Uploaded xuebing parents: diff changeset	220 if options.rgpi:
9071e359b9a3 Uploaded xuebing parents: diff changeset	221 readGroup += '\tPI:%s' % options.rgpi
9071e359b9a3 Uploaded xuebing parents: diff changeset	222 if options.rgpu:
9071e359b9a3 Uploaded xuebing parents: diff changeset	223 readGroup += '\tPU:%s' % options.rgpu
9071e359b9a3 Uploaded xuebing parents: diff changeset	224 gen_alignment_cmds += ' -r "%s"' % readGroup
9071e359b9a3 Uploaded xuebing parents: diff changeset	225 if options.outputTopN:
9071e359b9a3 Uploaded xuebing parents: diff changeset	226 gen_alignment_cmds += ' -n %s' % options.outputTopN
9071e359b9a3 Uploaded xuebing parents: diff changeset	227 # set up output files
9071e359b9a3 Uploaded xuebing parents: diff changeset	228 tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	229 tmp_align_out_name = tmp_align_out.name
9071e359b9a3 Uploaded xuebing parents: diff changeset	230 tmp_align_out.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	231 tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	232 tmp_align_out2_name = tmp_align_out2.name
9071e359b9a3 Uploaded xuebing parents: diff changeset	233 tmp_align_out2.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	234 # prepare actual aligning and generate aligning commands
9071e359b9a3 Uploaded xuebing parents: diff changeset	235 cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name )
9071e359b9a3 Uploaded xuebing parents: diff changeset	236 cmd2b = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	237 if options.genAlignType == 'paired':
9071e359b9a3 Uploaded xuebing parents: diff changeset	238 cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, rfastq, tmp_align_out2_name )
9071e359b9a3 Uploaded xuebing parents: diff changeset	239 cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, fastq, rfastq, options.output )
9071e359b9a3 Uploaded xuebing parents: diff changeset	240 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	241 cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, options.output )
9071e359b9a3 Uploaded xuebing parents: diff changeset	242 # perform alignments
9071e359b9a3 Uploaded xuebing parents: diff changeset	243 buffsize = 1048576
9071e359b9a3 Uploaded xuebing parents: diff changeset	244 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	245 # need to nest try-except in try-finally to handle 2.4
9071e359b9a3 Uploaded xuebing parents: diff changeset	246 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	247 # align
9071e359b9a3 Uploaded xuebing parents: diff changeset	248 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	249 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
9071e359b9a3 Uploaded xuebing parents: diff changeset	250 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	251 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded xuebing parents: diff changeset	252 returncode = proc.wait()
9071e359b9a3 Uploaded xuebing parents: diff changeset	253 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	254 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded xuebing parents: diff changeset	255 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	256 stderr = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	257 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	258 while True:
9071e359b9a3 Uploaded xuebing parents: diff changeset	259 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded xuebing parents: diff changeset	260 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	261 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	262 except OverflowError:
9071e359b9a3 Uploaded xuebing parents: diff changeset	263 pass
9071e359b9a3 Uploaded xuebing parents: diff changeset	264 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	265 if returncode != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	266 raise Exception, stderr
9071e359b9a3 Uploaded xuebing parents: diff changeset	267 except Exception, e:
9071e359b9a3 Uploaded xuebing parents: diff changeset	268 raise Exception, 'Error aligning sequence. ' + str( e )
9071e359b9a3 Uploaded xuebing parents: diff changeset	269 # and again if paired data
9071e359b9a3 Uploaded xuebing parents: diff changeset	270 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	271 if cmd2b:
9071e359b9a3 Uploaded xuebing parents: diff changeset	272 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
9071e359b9a3 Uploaded xuebing parents: diff changeset	273 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	274 proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded xuebing parents: diff changeset	275 returncode = proc.wait()
9071e359b9a3 Uploaded xuebing parents: diff changeset	276 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	277 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded xuebing parents: diff changeset	278 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	279 stderr = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	280 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	281 while True:
9071e359b9a3 Uploaded xuebing parents: diff changeset	282 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded xuebing parents: diff changeset	283 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	284 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	285 except OverflowError:
9071e359b9a3 Uploaded xuebing parents: diff changeset	286 pass
9071e359b9a3 Uploaded xuebing parents: diff changeset	287 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	288 if returncode != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	289 raise Exception, stderr
9071e359b9a3 Uploaded xuebing parents: diff changeset	290 except Exception, e:
9071e359b9a3 Uploaded xuebing parents: diff changeset	291 raise Exception, 'Error aligning second sequence. ' + str( e )
9071e359b9a3 Uploaded xuebing parents: diff changeset	292 # generate align
9071e359b9a3 Uploaded xuebing parents: diff changeset	293 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	294 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
9071e359b9a3 Uploaded xuebing parents: diff changeset	295 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	296 proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded xuebing parents: diff changeset	297 returncode = proc.wait()
9071e359b9a3 Uploaded xuebing parents: diff changeset	298 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	299 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded xuebing parents: diff changeset	300 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	301 stderr = ''
9071e359b9a3 Uploaded xuebing parents: diff changeset	302 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	303 while True:
9071e359b9a3 Uploaded xuebing parents: diff changeset	304 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded xuebing parents: diff changeset	305 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	306 break
9071e359b9a3 Uploaded xuebing parents: diff changeset	307 except OverflowError:
9071e359b9a3 Uploaded xuebing parents: diff changeset	308 pass
9071e359b9a3 Uploaded xuebing parents: diff changeset	309 tmp_stderr.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	310 if returncode != 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	311 raise Exception, stderr
9071e359b9a3 Uploaded xuebing parents: diff changeset	312 except Exception, e:
9071e359b9a3 Uploaded xuebing parents: diff changeset	313 raise Exception, 'Error generating alignments. ' + str( e )
9071e359b9a3 Uploaded xuebing parents: diff changeset	314 # remove header if necessary
9071e359b9a3 Uploaded xuebing parents: diff changeset	315 if options.suppressHeader == 'true':
9071e359b9a3 Uploaded xuebing parents: diff changeset	316 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
9071e359b9a3 Uploaded xuebing parents: diff changeset	317 tmp_out_name = tmp_out.name
9071e359b9a3 Uploaded xuebing parents: diff changeset	318 tmp_out.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	319 try:
9071e359b9a3 Uploaded xuebing parents: diff changeset	320 shutil.move( options.output, tmp_out_name )
9071e359b9a3 Uploaded xuebing parents: diff changeset	321 except Exception, e:
9071e359b9a3 Uploaded xuebing parents: diff changeset	322 raise Exception, 'Error moving output file before removing headers. ' + str( e )
9071e359b9a3 Uploaded xuebing parents: diff changeset	323 fout = file( options.output, 'w' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	324 for line in file( tmp_out.name, 'r' ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	325 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	326 fout.write( line )
9071e359b9a3 Uploaded xuebing parents: diff changeset	327 fout.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	328 # check that there are results in the output file
9071e359b9a3 Uploaded xuebing parents: diff changeset	329 if os.path.getsize( options.output ) > 0:
9071e359b9a3 Uploaded xuebing parents: diff changeset	330 sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
9071e359b9a3 Uploaded xuebing parents: diff changeset	331 else:
9071e359b9a3 Uploaded xuebing parents: diff changeset	332 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
9071e359b9a3 Uploaded xuebing parents: diff changeset	333 except Exception, e:
9071e359b9a3 Uploaded xuebing parents: diff changeset	334 stop_err( 'The alignment failed.\n' + str( e ) )
9071e359b9a3 Uploaded xuebing parents: diff changeset	335 finally:
9071e359b9a3 Uploaded xuebing parents: diff changeset	336 # clean up temp dir
9071e359b9a3 Uploaded xuebing parents: diff changeset	337 if os.path.exists( tmp_index_dir ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	338 shutil.rmtree( tmp_index_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	339 if os.path.exists( tmp_dir ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	340 shutil.rmtree( tmp_dir )
9071e359b9a3 Uploaded xuebing parents: diff changeset	341
9071e359b9a3 Uploaded xuebing parents: diff changeset	342 if __name__=="__main__": __main__()

Mercurial > repos > xuebing > sharplabtool

annotate tools/sr_mapping/bwa_wrapper.py @ 1:cdcb0ce84a1b