Mercurial > repos > yufei-luo > s_mart
comparison SMART/DiffExpAnal/gsnap_parallel_unSQL.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki | 
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob | |
| 4 import time | |
| 5 from commons.core.launcher.Launcher import Launcher | |
| 6 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory | |
| 7 from commons.core.utils.FileUtils import FileUtils | |
| 8 from optparse import OptionParser | |
| 9 | |
| 10 def stop_err( msg ): | |
| 11 sys.stderr.write( "%s\n" % msg ) | |
| 12 sys.exit() | |
| 13 | |
| 14 def toTar(tarFileName, accepted_hits_outputNames): | |
| 15 tfile = tarfile.open(tarFileName + ".tmp.tar", "w") | |
| 16 currentPath = os.getcwd() | |
| 17 os.chdir(dir) | |
| 18 for file in accepted_hits_outputNames: | |
| 19 relativeFileName = os.path.basename(file) | |
| 20 tfile.add(relativeFileName) | |
| 21 os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName)) | |
| 22 tfile.close() | |
| 23 os.chdir(currentPath) | |
| 24 | |
| 25 def joinSAM(dCutOut2Out): | |
| 26 for key in dCutOut2Out.keys(): | |
| 27 FileUtils.catFilesFromList(dCutOut2Out[key],key, False) | |
| 28 | |
| 29 def _map(iLauncher, cmd, cmdStart, cmdFinish ): | |
| 30 lCmds = [] | |
| 31 lCmds.extend(cmd) | |
| 32 lCmdStart = [] | |
| 33 lCmdStart.extend(cmdStart) | |
| 34 lCmdFinish = [] | |
| 35 lCmdFinish.extend(cmdFinish) | |
| 36 return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) | |
| 37 | |
| 38 def _createGsnapSplicingOptions(options): | |
| 39 lArgs = [] | |
| 40 lArgs.append("-N %s" % options.novelsplicing) | |
| 41 if options.useSplicing: | |
| 42 lArgs.append("-s %s" % options.useSplicing) | |
| 43 lArgs.append("-w %s" % options.localsplicedist) | |
| 44 lArgs.append("-e %s" % options.localSplicePenality) | |
| 45 lArgs.append("-E %s" % options.distantSplicePenality) | |
| 46 lArgs.append("-K %s" % options.distantSpliceEndlength) | |
| 47 lArgs.append("-l %s" % options.shortendSpliceEndlength) | |
| 48 | |
| 49 | |
| 50 return lArgs | |
| 51 | |
| 52 def _createGsnapPairedEndOptions(options): | |
| 53 lArgs = [] | |
| 54 if not(options.useSplicing or options.pairedEndFile): | |
| 55 lArgs.append("--pairmax-dna %s" % options.pairmaxRna) | |
| 56 if options.useSplicing or options.pairedEndFile: | |
| 57 lArgs.append("--pairmax-rna %s" % options.pairmaxRna) | |
| 58 lArgs.append("--pairexpect=%s" % options.pairexpect) | |
| 59 lArgs.append("--pairdev=%s" % options.pairedev) | |
| 60 | |
| 61 | |
| 62 | |
| 63 def _createGsnapCommand(iLauncher, options, workingDir, inputFileNames, inputRevFilesNames, outputFileName, batchNumber, numberOfBatch): | |
| 64 lArgs = [] | |
| 65 lArgs.append("-d %s" % options.genomeName) | |
| 66 lArgs.append("-k %s" % options.kmer) | |
| 67 lArgs.append("-D %s" % workingDir) | |
| 68 lArgs.append("-A %s" % options.outputFormat) | |
| 69 lArgs.append("-q %s/%s" % (batchNumber, numberOfBatch)) | |
| 70 lArgs.append("--no-sam-headers") | |
| 71 lArgs.append(inputFileNames) | |
| 72 print 'N option: %s, pairedEndFile option: %s' %(options.novelsplicing, options.pairedEndFile) | |
| 73 if options.pairedEndFile: | |
| 74 lArgs.append(inputRevFilesNames) | |
| 75 if options.novelsplicing == '1': | |
| 76 lArgs.extend(_createGsnapSplicingOptions(options)) | |
| 77 elif options.pairedEndFile: | |
| 78 lArgs.extend(_createGsnapPairedEndOptions(options)) | |
| 79 | |
| 80 lArgs.append("> %s" % outputFileName) | |
| 81 return iLauncher.getSystemCommand("gsnap", lArgs) | |
| 82 | |
| 83 def __main__(): | |
| 84 #Parse Command Line | |
| 85 description = "GMAP/GSNAP version:2012-12-20." | |
| 86 parser = OptionParser(description = description) | |
| 87 parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.') | |
| 88 parser.add_option("-q", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files [compulsory]") | |
| 89 parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' ) | |
| 90 parser.add_option("-d", "--genomeName", dest="genomeName", help="Define the reference genome name.[compulsory]") | |
| 91 # parser.add_option("-o", "--outputFile", dest="outputfile", help="output[compulsory]") | |
| 92 parser.add_option("-k", "--kmer", dest="kmer", default=12, help="Choose kmer value (<=16), a big kmer value can take more RAM(4Go).[compulsory]") | |
| 93 parser.add_option("-i", "--inputFasta", dest="inputFastaFile", help="Reference genome file, fasta format.[compulsory]") | |
| 94 parser.add_option("-p", "--pairedEnd", dest="pairedEndFile", default=None, help="Input paired-end fastq file.") | |
| 95 parser.add_option("-A", "--outputFormat", dest="outputFormat", default="sam", help="Choose an output format [sam, goby (need to re-compile with appropriate options)].") | |
| 96 #Splicing options for RNA-Seq | |
| 97 parser.add_option("-N","--novelsplicing", dest="novelsplicing", default=0, help="Look for novel splicing (0=no (default), 1=yes)") | |
| 98 parser.add_option("-s","--use-splicing", dest="useSplicing",action="store", type="string", help="Look for splicing involving known sites or known introns (in <STRING>.iit), at short or long distances") | |
| 99 parser.add_option("-w","--localsplicedist", dest="localsplicedist", default=200000, help="Definition of local novel splicing event (default 200000)") | |
| 100 parser.add_option("-e","--local-splice-penality", dest="localSplicePenality", default=0, help="Penalty for a local splice (default 0). Counts against mismatches allowed") | |
| 101 parser.add_option("-E","--distant-splice-penality", dest="distantSplicePenality", default=1, help="Penalty for a distant splice (default 1). A distant splice is one where the intron length exceeds the value of -w, or --localsplicedist, or is an inversion, scramble, or translocation between two different chromosomes Counts against mismatches allowed") | |
| 102 parser.add_option("-K","--distant-splice-endlength", dest="distantSpliceEndlength", default=16, help="Minimum length at end required for distant spliced alignments (default 16, min allowed is the value of -k, or kmer size)") | |
| 103 parser.add_option("-l","--shortend-splice-endlength", dest="shortendSpliceEndlength", default=2, help="Minimum length at end required for short-end spliced alignments (default 2, but unless known splice sites are provided with the -s flag, GSNAP may still need the end length to be the value of -k, or kmer size to find a given splice") | |
| 104 | |
| 105 #Specific paired-ends reads | |
| 106 parser.add_option("--pairmax-dna", dest="pairmaxDna", default=1000, help="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000).") | |
| 107 parser.add_option("--pairmax-rna", dest="pairmaxRna", default=2000, help="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000).") | |
| 108 parser.add_option("--pairexpect", dest="pairexpect", default=200, help="Expected paired-end length, used for calling splices in medial part of paired-end reads (default 200)") | |
| 109 parser.add_option("--pairdev", dest="pairdev", default=25, help="Allowable deviation from expected paired-end length, used for calling splices in medial part of paired-end reads (default 25)") | |
| 110 | |
| 111 (options, args) = parser.parse_args() | |
| 112 | |
| 113 workingDir = os.path.dirname(options.inputFastaFile) | |
| 114 | |
| 115 file = open(options.inputTxt,"r") | |
| 116 lines = file.readlines() | |
| 117 inputFileNames = [] | |
| 118 gsnapOutputNames = [] | |
| 119 outputName = options.outputTxtFile | |
| 120 resDirName = os.path.dirname(outputName) + '/' | |
| 121 out = open(outputName, "w") | |
| 122 for line in lines: | |
| 123 timeId = time.strftime("%Y%m%d%H%M%S") | |
| 124 tab = line.split() | |
| 125 inputFileNames.append(tab[1]) | |
| 126 OutputName = resDirName + tab[0] + '_samOutput_%s.sam' % timeId | |
| 127 gsnapOutputNames.append(OutputName) | |
| 128 out.write(tab[0] + '\t' + OutputName + '\n') | |
| 129 file.close() | |
| 130 out.close() | |
| 131 | |
| 132 if options.pairedEndFile: | |
| 133 revFile = open(options.pairedEndFile,"r") | |
| 134 lines = revFile.readlines() | |
| 135 inputRevFileNames = [] | |
| 136 for line in lines: | |
| 137 revTab = line.split() | |
| 138 inputRevFileNames.append(revTab[1]) | |
| 139 revFile.close() | |
| 140 | |
| 141 #Create gsnap make | |
| 142 lCmdsTuples =[] | |
| 143 acronym = "gsnap_make" | |
| 144 jobdb = TableJobAdaptatorFactory.createJobInstance() | |
| 145 iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) | |
| 146 cmds = [] | |
| 147 cmd_setup = "gmap_setup -d %s -D %s -k %s %s;" % (options.genomeName, workingDir, options.kmer, options.inputFastaFile) | |
| 148 cmds.append(cmd_setup) | |
| 149 cmd_make_coords = "make -f Makefile.%s coords;" % options.genomeName | |
| 150 cmds.append(cmd_make_coords) | |
| 151 cmd_make_gmapdb = "make -f Makefile.%s gmapdb;" % options.genomeName | |
| 152 cmds.append(cmd_make_gmapdb) | |
| 153 cmd_make_install = "make -f Makefile.%s install;" % options.genomeName | |
| 154 cmds.append(cmd_make_install) | |
| 155 cmd_index = iLauncher.getSystemCommand("", cmds) | |
| 156 cmd2Launch = [] | |
| 157 cmdStart = [] | |
| 158 cmdFinish = [] | |
| 159 cmd2Launch.append(cmd_index) | |
| 160 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) | |
| 161 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) | |
| 162 | |
| 163 acronym = "gsnap" | |
| 164 jobdb = TableJobAdaptatorFactory.createJobInstance() | |
| 165 iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) | |
| 166 lCmdsTuples = [] | |
| 167 dCutOut2Out = {} | |
| 168 lAllFile2remove = [] | |
| 169 numberOfBatch = 20 #usually for testing, working on to find a value for default launch on galaxy | |
| 170 for i in range(len(inputFileNames)): | |
| 171 lCutOutput = [] | |
| 172 for j in range(numberOfBatch): | |
| 173 cutOutput = "%s_out_%s" % (inputFileNames[i], j) | |
| 174 lCutOutput.append(cutOutput) | |
| 175 lAllFile2remove.extend(lCutOutput) | |
| 176 cmd2Launch = [] | |
| 177 if options.pairedEndFile: | |
| 178 inputRevFile = inputRevFileNames[i] | |
| 179 else: | |
| 180 inputRevFile = "" | |
| 181 cmd2Launch.append(_createGsnapCommand(iLauncher, options, workingDir, inputFileNames[i], inputRevFile, cutOutput, j, numberOfBatch)) | |
| 182 cmdStart = [] | |
| 183 cmdFinish = [] | |
| 184 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) | |
| 185 dCutOut2Out[gsnapOutputNames[i]] = lCutOutput | |
| 186 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) | |
| 187 | |
| 188 joinSAM(dCutOut2Out) | |
| 189 FileUtils.removeFilesFromListIfExist(lAllFile2remove) | |
| 190 | |
| 191 if options.outputTar != None: | |
| 192 toTar(options.outputTar, gsnapOutputNames) | |
| 193 | |
| 194 | |
| 195 if __name__=="__main__": __main__() | 
