diff SMART/DiffExpAnal/gsnap_parallel_unSQL.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/DiffExpAnal/gsnap_parallel_unSQL.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob 
+import time
+from commons.core.launcher.Launcher import Launcher
+from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
+from commons.core.utils.FileUtils import FileUtils
+from optparse import OptionParser
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def toTar(tarFileName, accepted_hits_outputNames):
+    tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
+    currentPath = os.getcwd()
+    os.chdir(dir)
+    for file in accepted_hits_outputNames:
+        relativeFileName = os.path.basename(file)
+        tfile.add(relativeFileName)
+    os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
+    tfile.close()
+    os.chdir(currentPath)
+    
+def joinSAM(dCutOut2Out):
+    for key in dCutOut2Out.keys():
+        FileUtils.catFilesFromList(dCutOut2Out[key],key, False)
+        
+def _map(iLauncher, cmd, cmdStart, cmdFinish ):
+    lCmds = []
+    lCmds.extend(cmd)
+    lCmdStart = []
+    lCmdStart.extend(cmdStart)
+    lCmdFinish = []
+    lCmdFinish.extend(cmdFinish)
+    return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
+
+def _createGsnapSplicingOptions(options):
+    lArgs = []
+    lArgs.append("-N %s" % options.novelsplicing)
+    if options.useSplicing:
+        lArgs.append("-s %s" % options.useSplicing)
+    lArgs.append("-w %s" % options.localsplicedist)
+    lArgs.append("-e %s" % options.localSplicePenality)
+    lArgs.append("-E %s" % options.distantSplicePenality)
+    lArgs.append("-K %s" % options.distantSpliceEndlength)
+    lArgs.append("-l %s" % options.shortendSpliceEndlength)
+    
+    
+    return lArgs
+
+def _createGsnapPairedEndOptions(options):
+    lArgs = []
+    if not(options.useSplicing or options.pairedEndFile):
+        lArgs.append("--pairmax-dna %s" % options.pairmaxRna)
+    if options.useSplicing or options.pairedEndFile:
+        lArgs.append("--pairmax-rna %s" % options.pairmaxRna)
+    lArgs.append("--pairexpect=%s" % options.pairexpect)
+    lArgs.append("--pairdev=%s" % options.pairedev)
+    
+    
+
+def _createGsnapCommand(iLauncher, options, workingDir, inputFileNames, inputRevFilesNames, outputFileName, batchNumber, numberOfBatch):
+    lArgs = []
+    lArgs.append("-d %s" % options.genomeName)
+    lArgs.append("-k %s" % options.kmer)
+    lArgs.append("-D %s" % workingDir)
+    lArgs.append("-A %s" % options.outputFormat)
+    lArgs.append("-q %s/%s" % (batchNumber, numberOfBatch))
+    lArgs.append("--no-sam-headers")
+    lArgs.append(inputFileNames)
+    print 'N option: %s, pairedEndFile option: %s' %(options.novelsplicing, options.pairedEndFile)
+    if options.pairedEndFile:
+        lArgs.append(inputRevFilesNames)
+    if options.novelsplicing == '1':
+        lArgs.extend(_createGsnapSplicingOptions(options))
+    elif options.pairedEndFile:
+        lArgs.extend(_createGsnapPairedEndOptions(options))
+    
+    lArgs.append("> %s" % outputFileName)  
+    return iLauncher.getSystemCommand("gsnap", lArgs)   
+
+def __main__():
+    #Parse Command Line
+    description = "GMAP/GSNAP version:2012-12-20."
+    parser = OptionParser(description = description)
+    parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.')
+    parser.add_option("-q", "--inputTxt",         dest="inputTxt",         action="store",                 type="string",     help="input, a txt file for a list of input reads files [compulsory]")
+    parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' )
+    parser.add_option("-d", "--genomeName", dest="genomeName", help="Define the reference genome name.[compulsory]")
+#    parser.add_option("-o", "--outputFile", dest="outputfile", help="output[compulsory]")
+    parser.add_option("-k", "--kmer", dest="kmer", default=12, help="Choose kmer value (<=16), a big kmer value can take more RAM(4Go).[compulsory]")
+    parser.add_option("-i", "--inputFasta", dest="inputFastaFile", help="Reference genome file, fasta format.[compulsory]")
+    parser.add_option("-p", "--pairedEnd", dest="pairedEndFile", default=None, help="Input paired-end fastq file.")
+    parser.add_option("-A", "--outputFormat", dest="outputFormat", default="sam", help="Choose an output format [sam, goby (need to re-compile with appropriate options)].")
+    #Splicing options for RNA-Seq
+    parser.add_option("-N","--novelsplicing", dest="novelsplicing", default=0, help="Look for novel splicing (0=no (default), 1=yes)")
+    parser.add_option("-s","--use-splicing", dest="useSplicing",action="store", type="string", help="Look for splicing involving known sites or known introns (in <STRING>.iit), at short or long distances")
+    parser.add_option("-w","--localsplicedist", dest="localsplicedist", default=200000, help="Definition of local novel splicing event (default 200000)")
+    parser.add_option("-e","--local-splice-penality", dest="localSplicePenality", default=0, help="Penalty for a local splice (default 0).  Counts against mismatches allowed")
+    parser.add_option("-E","--distant-splice-penality", dest="distantSplicePenality", default=1, help="Penalty for a distant splice (default 1).  A distant splice is one where the intron length exceeds the value of -w, or --localsplicedist, or is an inversion, scramble, or translocation between two different chromosomes Counts against mismatches allowed")
+    parser.add_option("-K","--distant-splice-endlength", dest="distantSpliceEndlength", default=16, help="Minimum length at end required for distant spliced alignments (default 16, min allowed is the value of -k, or kmer size)")
+    parser.add_option("-l","--shortend-splice-endlength", dest="shortendSpliceEndlength", default=2, help="Minimum length at end required for short-end spliced alignments (default 2, but unless known splice sites are provided with the -s flag, GSNAP may still need the end length to be the value of -k, or kmer size to find a given splice")
+
+    #Specific paired-ends reads
+    parser.add_option("--pairmax-dna", dest="pairmaxDna", default=1000, help="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000).")
+    parser.add_option("--pairmax-rna", dest="pairmaxRna", default=2000, help="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000).")
+    parser.add_option("--pairexpect", dest="pairexpect", default=200, help="Expected paired-end length, used for calling splices in medial part of paired-end reads (default 200)")
+    parser.add_option("--pairdev", dest="pairdev", default=25, help="Allowable deviation from expected paired-end length, used for calling splices in medial part of paired-end reads (default 25)")
+    
+    (options, args) = parser.parse_args()    
+
+    workingDir = os.path.dirname(options.inputFastaFile)
+    
+    file = open(options.inputTxt,"r")
+    lines = file.readlines()
+    inputFileNames = []
+    gsnapOutputNames = []
+    outputName = options.outputTxtFile
+    resDirName = os.path.dirname(outputName) + '/'
+    out = open(outputName, "w")
+    for line in lines:
+        timeId = time.strftime("%Y%m%d%H%M%S")
+        tab = line.split()
+        inputFileNames.append(tab[1])
+        OutputName = resDirName + tab[0] + '_samOutput_%s.sam' % timeId
+        gsnapOutputNames.append(OutputName) 
+        out.write(tab[0] + '\t' + OutputName + '\n')
+    file.close()
+    out.close()
+    
+    if options.pairedEndFile:
+        revFile = open(options.pairedEndFile,"r")
+        lines = revFile.readlines()
+        inputRevFileNames = []
+        for line in lines:
+            revTab = line.split()
+            inputRevFileNames.append(revTab[1])
+        revFile.close()
+
+    #Create gsnap make 
+    lCmdsTuples =[]
+    acronym = "gsnap_make"
+    jobdb = TableJobAdaptatorFactory.createJobInstance()
+    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
+    cmds = []
+    cmd_setup = "gmap_setup -d %s -D %s -k %s %s;" % (options.genomeName, workingDir, options.kmer, options.inputFastaFile)
+    cmds.append(cmd_setup)
+    cmd_make_coords = "make -f Makefile.%s coords;" % options.genomeName 
+    cmds.append(cmd_make_coords)
+    cmd_make_gmapdb = "make -f Makefile.%s gmapdb;" % options.genomeName
+    cmds.append(cmd_make_gmapdb)
+    cmd_make_install = "make -f Makefile.%s install;" % options.genomeName
+    cmds.append(cmd_make_install)
+    cmd_index = iLauncher.getSystemCommand("", cmds)
+    cmd2Launch = []
+    cmdStart = []
+    cmdFinish = []
+    cmd2Launch.append(cmd_index)
+    lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) 
+    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)    
+    
+    acronym = "gsnap"
+    jobdb = TableJobAdaptatorFactory.createJobInstance()
+    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
+    lCmdsTuples = []
+    dCutOut2Out = {}
+    lAllFile2remove = []
+    numberOfBatch = 20  #usually for testing, working on to find a value for default launch on galaxy
+    for i in range(len(inputFileNames)):
+        lCutOutput = []
+        for j in range(numberOfBatch):
+            cutOutput = "%s_out_%s" % (inputFileNames[i], j)
+            lCutOutput.append(cutOutput)
+            lAllFile2remove.extend(lCutOutput)
+            cmd2Launch = []
+            if options.pairedEndFile: 
+                inputRevFile = inputRevFileNames[i]
+            else:
+                inputRevFile = ""
+            cmd2Launch.append(_createGsnapCommand(iLauncher, options, workingDir, inputFileNames[i], inputRevFile, cutOutput, j, numberOfBatch))
+            cmdStart = []
+            cmdFinish = []
+            lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))    
+            dCutOut2Out[gsnapOutputNames[i]] = lCutOutput
+    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
+    
+    joinSAM(dCutOut2Out) 
+    FileUtils.removeFilesFromListIfExist(lAllFile2remove)   
+                 
+    if options.outputTar != None:
+        toTar(options.outputTar, gsnapOutputNames)
+
+
+if __name__=="__main__": __main__()