Mercurial > repos > yufei-luo > s_mart
view SMART/DiffExpAnal/gsnap_parallel_unSQL.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob import time from commons.core.launcher.Launcher import Launcher from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory from commons.core.utils.FileUtils import FileUtils from optparse import OptionParser def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.exit() def toTar(tarFileName, accepted_hits_outputNames): tfile = tarfile.open(tarFileName + ".tmp.tar", "w") currentPath = os.getcwd() os.chdir(dir) for file in accepted_hits_outputNames: relativeFileName = os.path.basename(file) tfile.add(relativeFileName) os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName)) tfile.close() os.chdir(currentPath) def joinSAM(dCutOut2Out): for key in dCutOut2Out.keys(): FileUtils.catFilesFromList(dCutOut2Out[key],key, False) def _map(iLauncher, cmd, cmdStart, cmdFinish ): lCmds = [] lCmds.extend(cmd) lCmdStart = [] lCmdStart.extend(cmdStart) lCmdFinish = [] lCmdFinish.extend(cmdFinish) return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) def _createGsnapSplicingOptions(options): lArgs = [] lArgs.append("-N %s" % options.novelsplicing) if options.useSplicing: lArgs.append("-s %s" % options.useSplicing) lArgs.append("-w %s" % options.localsplicedist) lArgs.append("-e %s" % options.localSplicePenality) lArgs.append("-E %s" % options.distantSplicePenality) lArgs.append("-K %s" % options.distantSpliceEndlength) lArgs.append("-l %s" % options.shortendSpliceEndlength) return lArgs def _createGsnapPairedEndOptions(options): lArgs = [] if not(options.useSplicing or options.pairedEndFile): lArgs.append("--pairmax-dna %s" % options.pairmaxRna) if options.useSplicing or options.pairedEndFile: lArgs.append("--pairmax-rna %s" % options.pairmaxRna) lArgs.append("--pairexpect=%s" % options.pairexpect) lArgs.append("--pairdev=%s" % options.pairedev) def _createGsnapCommand(iLauncher, options, workingDir, inputFileNames, inputRevFilesNames, outputFileName, batchNumber, numberOfBatch): lArgs = [] lArgs.append("-d %s" % options.genomeName) lArgs.append("-k %s" % options.kmer) lArgs.append("-D %s" % workingDir) lArgs.append("-A %s" % options.outputFormat) lArgs.append("-q %s/%s" % (batchNumber, numberOfBatch)) lArgs.append("--no-sam-headers") lArgs.append(inputFileNames) print 'N option: %s, pairedEndFile option: %s' %(options.novelsplicing, options.pairedEndFile) if options.pairedEndFile: lArgs.append(inputRevFilesNames) if options.novelsplicing == '1': lArgs.extend(_createGsnapSplicingOptions(options)) elif options.pairedEndFile: lArgs.extend(_createGsnapPairedEndOptions(options)) lArgs.append("> %s" % outputFileName) return iLauncher.getSystemCommand("gsnap", lArgs) def __main__(): #Parse Command Line description = "GMAP/GSNAP version:2012-12-20." parser = OptionParser(description = description) parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.') parser.add_option("-q", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files [compulsory]") parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' ) parser.add_option("-d", "--genomeName", dest="genomeName", help="Define the reference genome name.[compulsory]") # parser.add_option("-o", "--outputFile", dest="outputfile", help="output[compulsory]") parser.add_option("-k", "--kmer", dest="kmer", default=12, help="Choose kmer value (<=16), a big kmer value can take more RAM(4Go).[compulsory]") parser.add_option("-i", "--inputFasta", dest="inputFastaFile", help="Reference genome file, fasta format.[compulsory]") parser.add_option("-p", "--pairedEnd", dest="pairedEndFile", default=None, help="Input paired-end fastq file.") parser.add_option("-A", "--outputFormat", dest="outputFormat", default="sam", help="Choose an output format [sam, goby (need to re-compile with appropriate options)].") #Splicing options for RNA-Seq parser.add_option("-N","--novelsplicing", dest="novelsplicing", default=0, help="Look for novel splicing (0=no (default), 1=yes)") parser.add_option("-s","--use-splicing", dest="useSplicing",action="store", type="string", help="Look for splicing involving known sites or known introns (in <STRING>.iit), at short or long distances") parser.add_option("-w","--localsplicedist", dest="localsplicedist", default=200000, help="Definition of local novel splicing event (default 200000)") parser.add_option("-e","--local-splice-penality", dest="localSplicePenality", default=0, help="Penalty for a local splice (default 0). Counts against mismatches allowed") parser.add_option("-E","--distant-splice-penality", dest="distantSplicePenality", default=1, help="Penalty for a distant splice (default 1). A distant splice is one where the intron length exceeds the value of -w, or --localsplicedist, or is an inversion, scramble, or translocation between two different chromosomes Counts against mismatches allowed") parser.add_option("-K","--distant-splice-endlength", dest="distantSpliceEndlength", default=16, help="Minimum length at end required for distant spliced alignments (default 16, min allowed is the value of -k, or kmer size)") parser.add_option("-l","--shortend-splice-endlength", dest="shortendSpliceEndlength", default=2, help="Minimum length at end required for short-end spliced alignments (default 2, but unless known splice sites are provided with the -s flag, GSNAP may still need the end length to be the value of -k, or kmer size to find a given splice") #Specific paired-ends reads parser.add_option("--pairmax-dna", dest="pairmaxDna", default=1000, help="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000).") parser.add_option("--pairmax-rna", dest="pairmaxRna", default=2000, help="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000).") parser.add_option("--pairexpect", dest="pairexpect", default=200, help="Expected paired-end length, used for calling splices in medial part of paired-end reads (default 200)") parser.add_option("--pairdev", dest="pairdev", default=25, help="Allowable deviation from expected paired-end length, used for calling splices in medial part of paired-end reads (default 25)") (options, args) = parser.parse_args() workingDir = os.path.dirname(options.inputFastaFile) file = open(options.inputTxt,"r") lines = file.readlines() inputFileNames = [] gsnapOutputNames = [] outputName = options.outputTxtFile resDirName = os.path.dirname(outputName) + '/' out = open(outputName, "w") for line in lines: timeId = time.strftime("%Y%m%d%H%M%S") tab = line.split() inputFileNames.append(tab[1]) OutputName = resDirName + tab[0] + '_samOutput_%s.sam' % timeId gsnapOutputNames.append(OutputName) out.write(tab[0] + '\t' + OutputName + '\n') file.close() out.close() if options.pairedEndFile: revFile = open(options.pairedEndFile,"r") lines = revFile.readlines() inputRevFileNames = [] for line in lines: revTab = line.split() inputRevFileNames.append(revTab[1]) revFile.close() #Create gsnap make lCmdsTuples =[] acronym = "gsnap_make" jobdb = TableJobAdaptatorFactory.createJobInstance() iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) cmds = [] cmd_setup = "gmap_setup -d %s -D %s -k %s %s;" % (options.genomeName, workingDir, options.kmer, options.inputFastaFile) cmds.append(cmd_setup) cmd_make_coords = "make -f Makefile.%s coords;" % options.genomeName cmds.append(cmd_make_coords) cmd_make_gmapdb = "make -f Makefile.%s gmapdb;" % options.genomeName cmds.append(cmd_make_gmapdb) cmd_make_install = "make -f Makefile.%s install;" % options.genomeName cmds.append(cmd_make_install) cmd_index = iLauncher.getSystemCommand("", cmds) cmd2Launch = [] cmdStart = [] cmdFinish = [] cmd2Launch.append(cmd_index) lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) acronym = "gsnap" jobdb = TableJobAdaptatorFactory.createJobInstance() iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) lCmdsTuples = [] dCutOut2Out = {} lAllFile2remove = [] numberOfBatch = 20 #usually for testing, working on to find a value for default launch on galaxy for i in range(len(inputFileNames)): lCutOutput = [] for j in range(numberOfBatch): cutOutput = "%s_out_%s" % (inputFileNames[i], j) lCutOutput.append(cutOutput) lAllFile2remove.extend(lCutOutput) cmd2Launch = [] if options.pairedEndFile: inputRevFile = inputRevFileNames[i] else: inputRevFile = "" cmd2Launch.append(_createGsnapCommand(iLauncher, options, workingDir, inputFileNames[i], inputRevFile, cutOutput, j, numberOfBatch)) cmdStart = [] cmdFinish = [] lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) dCutOut2Out[gsnapOutputNames[i]] = lCutOutput iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) joinSAM(dCutOut2Out) FileUtils.removeFilesFromListIfExist(lAllFile2remove) if options.outputTar != None: toTar(options.outputTar, gsnapOutputNames) if __name__=="__main__": __main__()