Mercurial > repos > yufei-luo > s_mart
diff SMART/DiffExpAnal/gsnap_parallel_unSQL.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/DiffExpAnal/gsnap_parallel_unSQL.py Tue Apr 30 14:33:21 2013 -0400 @@ -0,0 +1,195 @@ +#!/usr/bin/env python + +import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob +import time +from commons.core.launcher.Launcher import Launcher +from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory +from commons.core.utils.FileUtils import FileUtils +from optparse import OptionParser + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def toTar(tarFileName, accepted_hits_outputNames): + tfile = tarfile.open(tarFileName + ".tmp.tar", "w") + currentPath = os.getcwd() + os.chdir(dir) + for file in accepted_hits_outputNames: + relativeFileName = os.path.basename(file) + tfile.add(relativeFileName) + os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName)) + tfile.close() + os.chdir(currentPath) + +def joinSAM(dCutOut2Out): + for key in dCutOut2Out.keys(): + FileUtils.catFilesFromList(dCutOut2Out[key],key, False) + +def _map(iLauncher, cmd, cmdStart, cmdFinish ): + lCmds = [] + lCmds.extend(cmd) + lCmdStart = [] + lCmdStart.extend(cmdStart) + lCmdFinish = [] + lCmdFinish.extend(cmdFinish) + return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) + +def _createGsnapSplicingOptions(options): + lArgs = [] + lArgs.append("-N %s" % options.novelsplicing) + if options.useSplicing: + lArgs.append("-s %s" % options.useSplicing) + lArgs.append("-w %s" % options.localsplicedist) + lArgs.append("-e %s" % options.localSplicePenality) + lArgs.append("-E %s" % options.distantSplicePenality) + lArgs.append("-K %s" % options.distantSpliceEndlength) + lArgs.append("-l %s" % options.shortendSpliceEndlength) + + + return lArgs + +def _createGsnapPairedEndOptions(options): + lArgs = [] + if not(options.useSplicing or options.pairedEndFile): + lArgs.append("--pairmax-dna %s" % options.pairmaxRna) + if options.useSplicing or options.pairedEndFile: + lArgs.append("--pairmax-rna %s" % options.pairmaxRna) + lArgs.append("--pairexpect=%s" % options.pairexpect) + lArgs.append("--pairdev=%s" % options.pairedev) + + + +def _createGsnapCommand(iLauncher, options, workingDir, inputFileNames, inputRevFilesNames, outputFileName, batchNumber, numberOfBatch): + lArgs = [] + lArgs.append("-d %s" % options.genomeName) + lArgs.append("-k %s" % options.kmer) + lArgs.append("-D %s" % workingDir) + lArgs.append("-A %s" % options.outputFormat) + lArgs.append("-q %s/%s" % (batchNumber, numberOfBatch)) + lArgs.append("--no-sam-headers") + lArgs.append(inputFileNames) + print 'N option: %s, pairedEndFile option: %s' %(options.novelsplicing, options.pairedEndFile) + if options.pairedEndFile: + lArgs.append(inputRevFilesNames) + if options.novelsplicing == '1': + lArgs.extend(_createGsnapSplicingOptions(options)) + elif options.pairedEndFile: + lArgs.extend(_createGsnapPairedEndOptions(options)) + + lArgs.append("> %s" % outputFileName) + return iLauncher.getSystemCommand("gsnap", lArgs) + +def __main__(): + #Parse Command Line + description = "GMAP/GSNAP version:2012-12-20." + parser = OptionParser(description = description) + parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.') + parser.add_option("-q", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files [compulsory]") + parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' ) + parser.add_option("-d", "--genomeName", dest="genomeName", help="Define the reference genome name.[compulsory]") +# parser.add_option("-o", "--outputFile", dest="outputfile", help="output[compulsory]") + parser.add_option("-k", "--kmer", dest="kmer", default=12, help="Choose kmer value (<=16), a big kmer value can take more RAM(4Go).[compulsory]") + parser.add_option("-i", "--inputFasta", dest="inputFastaFile", help="Reference genome file, fasta format.[compulsory]") + parser.add_option("-p", "--pairedEnd", dest="pairedEndFile", default=None, help="Input paired-end fastq file.") + parser.add_option("-A", "--outputFormat", dest="outputFormat", default="sam", help="Choose an output format [sam, goby (need to re-compile with appropriate options)].") + #Splicing options for RNA-Seq + parser.add_option("-N","--novelsplicing", dest="novelsplicing", default=0, help="Look for novel splicing (0=no (default), 1=yes)") + parser.add_option("-s","--use-splicing", dest="useSplicing",action="store", type="string", help="Look for splicing involving known sites or known introns (in <STRING>.iit), at short or long distances") + parser.add_option("-w","--localsplicedist", dest="localsplicedist", default=200000, help="Definition of local novel splicing event (default 200000)") + parser.add_option("-e","--local-splice-penality", dest="localSplicePenality", default=0, help="Penalty for a local splice (default 0). Counts against mismatches allowed") + parser.add_option("-E","--distant-splice-penality", dest="distantSplicePenality", default=1, help="Penalty for a distant splice (default 1). A distant splice is one where the intron length exceeds the value of -w, or --localsplicedist, or is an inversion, scramble, or translocation between two different chromosomes Counts against mismatches allowed") + parser.add_option("-K","--distant-splice-endlength", dest="distantSpliceEndlength", default=16, help="Minimum length at end required for distant spliced alignments (default 16, min allowed is the value of -k, or kmer size)") + parser.add_option("-l","--shortend-splice-endlength", dest="shortendSpliceEndlength", default=2, help="Minimum length at end required for short-end spliced alignments (default 2, but unless known splice sites are provided with the -s flag, GSNAP may still need the end length to be the value of -k, or kmer size to find a given splice") + + #Specific paired-ends reads + parser.add_option("--pairmax-dna", dest="pairmaxDna", default=1000, help="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000).") + parser.add_option("--pairmax-rna", dest="pairmaxRna", default=2000, help="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000).") + parser.add_option("--pairexpect", dest="pairexpect", default=200, help="Expected paired-end length, used for calling splices in medial part of paired-end reads (default 200)") + parser.add_option("--pairdev", dest="pairdev", default=25, help="Allowable deviation from expected paired-end length, used for calling splices in medial part of paired-end reads (default 25)") + + (options, args) = parser.parse_args() + + workingDir = os.path.dirname(options.inputFastaFile) + + file = open(options.inputTxt,"r") + lines = file.readlines() + inputFileNames = [] + gsnapOutputNames = [] + outputName = options.outputTxtFile + resDirName = os.path.dirname(outputName) + '/' + out = open(outputName, "w") + for line in lines: + timeId = time.strftime("%Y%m%d%H%M%S") + tab = line.split() + inputFileNames.append(tab[1]) + OutputName = resDirName + tab[0] + '_samOutput_%s.sam' % timeId + gsnapOutputNames.append(OutputName) + out.write(tab[0] + '\t' + OutputName + '\n') + file.close() + out.close() + + if options.pairedEndFile: + revFile = open(options.pairedEndFile,"r") + lines = revFile.readlines() + inputRevFileNames = [] + for line in lines: + revTab = line.split() + inputRevFileNames.append(revTab[1]) + revFile.close() + + #Create gsnap make + lCmdsTuples =[] + acronym = "gsnap_make" + jobdb = TableJobAdaptatorFactory.createJobInstance() + iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) + cmds = [] + cmd_setup = "gmap_setup -d %s -D %s -k %s %s;" % (options.genomeName, workingDir, options.kmer, options.inputFastaFile) + cmds.append(cmd_setup) + cmd_make_coords = "make -f Makefile.%s coords;" % options.genomeName + cmds.append(cmd_make_coords) + cmd_make_gmapdb = "make -f Makefile.%s gmapdb;" % options.genomeName + cmds.append(cmd_make_gmapdb) + cmd_make_install = "make -f Makefile.%s install;" % options.genomeName + cmds.append(cmd_make_install) + cmd_index = iLauncher.getSystemCommand("", cmds) + cmd2Launch = [] + cmdStart = [] + cmdFinish = [] + cmd2Launch.append(cmd_index) + lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) + iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) + + acronym = "gsnap" + jobdb = TableJobAdaptatorFactory.createJobInstance() + iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) + lCmdsTuples = [] + dCutOut2Out = {} + lAllFile2remove = [] + numberOfBatch = 20 #usually for testing, working on to find a value for default launch on galaxy + for i in range(len(inputFileNames)): + lCutOutput = [] + for j in range(numberOfBatch): + cutOutput = "%s_out_%s" % (inputFileNames[i], j) + lCutOutput.append(cutOutput) + lAllFile2remove.extend(lCutOutput) + cmd2Launch = [] + if options.pairedEndFile: + inputRevFile = inputRevFileNames[i] + else: + inputRevFile = "" + cmd2Launch.append(_createGsnapCommand(iLauncher, options, workingDir, inputFileNames[i], inputRevFile, cutOutput, j, numberOfBatch)) + cmdStart = [] + cmdFinish = [] + lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) + dCutOut2Out[gsnapOutputNames[i]] = lCutOutput + iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) + + joinSAM(dCutOut2Out) + FileUtils.removeFilesFromListIfExist(lAllFile2remove) + + if options.outputTar != None: + toTar(options.outputTar, gsnapOutputNames) + + +if __name__=="__main__": __main__()