Mercurial > repos > gbcs-embl-heidelberg > jemultiplexer
view jemultiplexer.py @ 2:1b79b43626ef draft
Uploaded
author | gbcs-embl-heidelberg |
---|---|
date | Wed, 03 Sep 2014 04:12:06 -0400 |
parents | |
children | 861cbe4eca25 |
line wrap: on
line source
#!/usr/bin/env python import os, sys, string, shutil, subprocess, tempfile, re def cleanup(tmpdir): # cleanup try: if os.path.exists( tmpdir ): shutil.rmtree( tmpdir ) except Exception, e: stop_err( 'Error cleaning. ' + str( e ) ) # In the unlikely event of a fire, please use the nearest emergency exit def stop_err( msg ): sys.stderr.write( '%s\n' % msg ) sys.exit() def __main__(): tooldir = os.path.dirname(sys.argv[0]) executable = tooldir + "/jemultiplexer_embase_1.0.4_bundle.jar" if not os.path.exists(executable): stop_err( "The file \"%s\" was not found. " % ( executable ) ) mpxdata = sys.argv[1] output1 = sys.argv[2] output1id = sys.argv[3] barcodes = sys.argv[4] barcode_list = sys.argv[5] newfilepath = sys.argv[6] extension = sys.argv[7] barlen = sys.argv[8] qualityFormat = sys.argv[9] maxMismatches = sys.argv[10] minBaseQuality = sys.argv[11] minMismatchingDelta = sys.argv[12] xTrimLen = sys.argv[13] zTrimLen = sys.argv[14] clipBarcode = sys.argv[15] addBarcodeToHeader = sys.argv[16] gzipOutput = sys.argv[17] barcodeDiagFile = sys.argv[18] rChar = sys.argv[19] barcodeReadPos = sys.argv[20] barcodeForSampleMatching = sys.argv[21] redundantBarcode = sys.argv[22] strict = sys.argv[23] MpxData2 = sys.argv[24] ## create the tmpdir & co tmpdir = newfilepath + "/demultiplex_" + output1id oldnames=[] stat = tmpdir+"/jemultiplexer_out_stats.txt" #the default output stat file name try: if not os.path.isdir(tmpdir): os.mkdir(tmpdir) except Exception, e: stop_err( "Error creating directory \"%s\". %s" % ( tmpdir, str( e ) ) ) #output file extension if gzipOutput == "true": outputExtension=".gz" else: outputExtension="" # Reconstructing the output file names as jemultiplexer writes them if MpxData2 != "single": oldnames.append('unassigned_1.txt' + outputExtension) oldnames.append('unassigned_2.txt' + outputExtension) else: oldnames.append('unassigned_1.txt' + outputExtension) if barcodeDiagFile=="true": oldnames.append('barcode_match_report.txt') if barcode_list == "none": # If a .bs file was given bc = open(barcodes, "r") bcw = open( tmpdir + "/barcodes.txt", "w" ) for line in bc.readlines(): l = line.split() if l[0] != "": if MpxData2 != "single": oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension) bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n") else: oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n") bc.close() bcw.close() elif barcodes == "none": # If the text area was used lines = barcode_list.split("__cr____cn__") #bc = csv.writer( open( tmpdir + "/barcodes.txt", "w" ), delimiter = "\t" ) bc = open( tmpdir + "/barcodes.txt", "w" ) for l in lines: l = l.replace("__tc__", " ").split() if len(l) < 2: continue if l[0] != "": if MpxData2 != "single": oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension) bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n") else: oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n") bc.close() ## Building the command line cmd = "java -Xmx8g -jar " + executable cmd+= " F1=" cmd+= mpxdata ## if we have PE data if MpxData2 != "single": cmd+= " F2=" cmd+= MpxData2 cmd+= " BPOS=" cmd+= barcodeReadPos cmd+= " BRED=" cmd+= redundantBarcode cmd+= " BM=" cmd+= barcodeForSampleMatching cmd+= " S=" cmd+= strict cmd+= " BF=" cmd+= tmpdir + "/barcodes.txt" cmd+= " OUTPUT_DIR=\"" cmd+= tmpdir cmd+= "\"" cmd+= " BCLEN=" cmd+= barlen cmd+= " QUALITY_FORMAT=" cmd+= qualityFormat cmd+= " MAX_MISMATCHES=" cmd+= maxMismatches cmd+= " MIN_BASE_QUALITY=" cmd+= minBaseQuality cmd+= " MMD=" cmd+= minMismatchingDelta cmd+= " XT=" cmd+= xTrimLen cmd+= " ZT=" cmd+= zTrimLen cmd+= " C=" cmd+= clipBarcode cmd+= " ADD=" cmd+= addBarcodeToHeader cmd+= " GZ=" cmd+= gzipOutput if rChar=="2": cmd+= " RCHAR=" cmd+= ":" elif rChar=="3": cmd+= " RCHAR=" cmd+= "_" elif rChar=="4": cmd+= " RCHAR=" cmd+= "-" if barcodeDiagFile=="true": cmd+= " BARCODE_DIAG_FILE=" cmd+= 'barcode_match_report.txt' # Executing jemultiplexer # status = os.system(cmd) try: tmperr = tempfile.NamedTemporaryFile( dir=tmpdir ).name tmpout = tempfile.NamedTemporaryFile( dir=tmpdir ).name tmp_stderr = open( tmperr, 'wb' ) tmp_stdout = open( tmpout, 'wb' ) proc = subprocess.Popen( args=cmd, shell=True, cwd=tmpdir, stdout=tmp_stdout.fileno(), stderr=tmp_stderr.fileno() ) returncode = proc.wait() tmp_stderr.close() tmp_stdout.close() # get stderr, allowing for case where it's very large tmp_stderr = open( tmperr, 'rb' ) stderr = '' buffsize = 1048576 try: while True: stderr += tmp_stderr.read( buffsize ) if not stderr or len( stderr ) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception, stderr except Exception, e: #cleanup(tmpdir) stop_err( 'Error demultiplexing sequence. ' + str( e ) ) # Creating the required paths for multiple outputs newnames=[] if os.path.isdir(tmpdir): for f in oldnames: tmpf = tmpdir+"/"+f if os.path.isfile(tmpf): # check the size if os.path.getsize(tmpf) == 0: stop_err( 'The output file: ' + f + ' is empty, there may be an error with your barcode file or settings.') name = f s = "primary_" s+= output1id s+= "_" s+= string.replace(name, "_", "-") s+= "_visible_" if extension == "fastqillumina": if gzipOutput == "true": s+="gz" else: s+= extension else: if f=="barcode_match_report.txt": s+="text" else: if gzipOutput == "true": s+="gz" else: s+="fastqsanger" newnames.append(newfilepath+"/"+s) # Adding the appropriate prefixes to the old filenames for i in range(len(oldnames)): oldnames[i] = tmpdir+"/"+oldnames[i] ### NUMSTAT rewriting ### htmlout = open(output1, "w") numstat = open(stat, "r") htmlout.write("<html><head><title>numStat</title></head><body><!--\n") for l in numstat.readlines(): htmlout.write(l) numstat.seek(0) htmlout.write("-->\n<h2>Please refresh your history to display the new datasets</h2>\n<h3>numStat</h3><table border=\"1\">\n") htmlout.write("<tr><td>%s</td></tr>\n" % (cmd )) for l in numstat.readlines(): l = l.split() htmlout.write("<tr><td>%s</td><td>%s</td></tr>\n" % (l[0], l[1]) ) htmlout.write("</table></body></html>") numstat.close() htmlout.close() #~ # Moving the first file (the mandatory output file defined in the xml (the txt stats)) #~ shutil.move(stat,output1) #~ #~ # add a warning to the numStat #~ statfh = open(stat,'a') #~ statfh.write("\nRefresh you library to see the new DataSets.\n") #~ statfh.close() # Moving everything where it will be seen properly by Galaxy for i in range(len(oldnames)): shutil.move(oldnames[i],newnames[i]) # cleanup cleanup(tmpdir) # Ta-da! if __name__=="__main__": __main__()