18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob
|
|
4 import time
|
|
5 from commons.core.launcher.Launcher import Launcher
|
|
6 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
|
|
7 from commons.core.utils.FileUtils import FileUtils
|
|
8 from optparse import OptionParser
|
|
9
|
|
10 def stop_err( msg ):
|
|
11 sys.stderr.write( "%s\n" % msg )
|
|
12 sys.exit()
|
|
13
|
|
14 def toTar(tarFileName, accepted_hits_outputNames):
|
|
15 tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
|
|
16 currentPath = os.getcwd()
|
|
17 os.chdir(dir)
|
|
18 for file in accepted_hits_outputNames:
|
|
19 relativeFileName = os.path.basename(file)
|
|
20 tfile.add(relativeFileName)
|
|
21 os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
|
|
22 tfile.close()
|
|
23 os.chdir(currentPath)
|
|
24
|
|
25 def joinSAM(dCutOut2Out):
|
|
26 for key in dCutOut2Out.keys():
|
|
27 FileUtils.catFilesFromList(dCutOut2Out[key],key, False)
|
|
28
|
|
29 def _map(iLauncher, cmd, cmdStart, cmdFinish ):
|
|
30 lCmds = []
|
|
31 lCmds.extend(cmd)
|
|
32 lCmdStart = []
|
|
33 lCmdStart.extend(cmdStart)
|
|
34 lCmdFinish = []
|
|
35 lCmdFinish.extend(cmdFinish)
|
|
36 return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
|
|
37
|
|
38 def _createGsnapSplicingOptions(options):
|
|
39 lArgs = []
|
|
40 lArgs.append("-N %s" % options.novelsplicing)
|
|
41 if options.useSplicing:
|
|
42 lArgs.append("-s %s" % options.useSplicing)
|
|
43 lArgs.append("-w %s" % options.localsplicedist)
|
|
44 lArgs.append("-e %s" % options.localSplicePenality)
|
|
45 lArgs.append("-E %s" % options.distantSplicePenality)
|
|
46 lArgs.append("-K %s" % options.distantSpliceEndlength)
|
|
47 lArgs.append("-l %s" % options.shortendSpliceEndlength)
|
|
48
|
|
49
|
|
50 return lArgs
|
|
51
|
|
52 def _createGsnapPairedEndOptions(options):
|
|
53 lArgs = []
|
|
54 if not(options.useSplicing or options.pairedEndFile):
|
|
55 lArgs.append("--pairmax-dna %s" % options.pairmaxRna)
|
|
56 if options.useSplicing or options.pairedEndFile:
|
|
57 lArgs.append("--pairmax-rna %s" % options.pairmaxRna)
|
|
58 lArgs.append("--pairexpect=%s" % options.pairexpect)
|
|
59 lArgs.append("--pairdev=%s" % options.pairedev)
|
|
60
|
|
61
|
|
62
|
|
63 def _createGsnapCommand(iLauncher, options, workingDir, inputFileNames, inputRevFilesNames, outputFileName, batchNumber, numberOfBatch):
|
|
64 lArgs = []
|
|
65 lArgs.append("-d %s" % options.genomeName)
|
|
66 lArgs.append("-k %s" % options.kmer)
|
|
67 lArgs.append("-D %s" % workingDir)
|
|
68 lArgs.append("-A %s" % options.outputFormat)
|
|
69 lArgs.append("-q %s/%s" % (batchNumber, numberOfBatch))
|
|
70 lArgs.append("--no-sam-headers")
|
|
71 lArgs.append(inputFileNames)
|
|
72 print 'N option: %s, pairedEndFile option: %s' %(options.novelsplicing, options.pairedEndFile)
|
|
73 if options.pairedEndFile:
|
|
74 lArgs.append(inputRevFilesNames)
|
|
75 if options.novelsplicing == '1':
|
|
76 lArgs.extend(_createGsnapSplicingOptions(options))
|
|
77 elif options.pairedEndFile:
|
|
78 lArgs.extend(_createGsnapPairedEndOptions(options))
|
|
79
|
|
80 lArgs.append("> %s" % outputFileName)
|
|
81 return iLauncher.getSystemCommand("gsnap", lArgs)
|
|
82
|
|
83 def __main__():
|
|
84 #Parse Command Line
|
|
85 description = "GMAP/GSNAP version:2012-12-20."
|
|
86 parser = OptionParser(description = description)
|
|
87 parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.')
|
|
88 parser.add_option("-q", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files [compulsory]")
|
|
89 parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' )
|
|
90 parser.add_option("-d", "--genomeName", dest="genomeName", help="Define the reference genome name.[compulsory]")
|
|
91 # parser.add_option("-o", "--outputFile", dest="outputfile", help="output[compulsory]")
|
|
92 parser.add_option("-k", "--kmer", dest="kmer", default=12, help="Choose kmer value (<=16), a big kmer value can take more RAM(4Go).[compulsory]")
|
|
93 parser.add_option("-i", "--inputFasta", dest="inputFastaFile", help="Reference genome file, fasta format.[compulsory]")
|
|
94 parser.add_option("-p", "--pairedEnd", dest="pairedEndFile", default=None, help="Input paired-end fastq file.")
|
|
95 parser.add_option("-A", "--outputFormat", dest="outputFormat", default="sam", help="Choose an output format [sam, goby (need to re-compile with appropriate options)].")
|
|
96 #Splicing options for RNA-Seq
|
|
97 parser.add_option("-N","--novelsplicing", dest="novelsplicing", default=0, help="Look for novel splicing (0=no (default), 1=yes)")
|
|
98 parser.add_option("-s","--use-splicing", dest="useSplicing",action="store", type="string", help="Look for splicing involving known sites or known introns (in <STRING>.iit), at short or long distances")
|
|
99 parser.add_option("-w","--localsplicedist", dest="localsplicedist", default=200000, help="Definition of local novel splicing event (default 200000)")
|
|
100 parser.add_option("-e","--local-splice-penality", dest="localSplicePenality", default=0, help="Penalty for a local splice (default 0). Counts against mismatches allowed")
|
|
101 parser.add_option("-E","--distant-splice-penality", dest="distantSplicePenality", default=1, help="Penalty for a distant splice (default 1). A distant splice is one where the intron length exceeds the value of -w, or --localsplicedist, or is an inversion, scramble, or translocation between two different chromosomes Counts against mismatches allowed")
|
|
102 parser.add_option("-K","--distant-splice-endlength", dest="distantSpliceEndlength", default=16, help="Minimum length at end required for distant spliced alignments (default 16, min allowed is the value of -k, or kmer size)")
|
|
103 parser.add_option("-l","--shortend-splice-endlength", dest="shortendSpliceEndlength", default=2, help="Minimum length at end required for short-end spliced alignments (default 2, but unless known splice sites are provided with the -s flag, GSNAP may still need the end length to be the value of -k, or kmer size to find a given splice")
|
|
104
|
|
105 #Specific paired-ends reads
|
|
106 parser.add_option("--pairmax-dna", dest="pairmaxDna", default=1000, help="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000).")
|
|
107 parser.add_option("--pairmax-rna", dest="pairmaxRna", default=2000, help="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000).")
|
|
108 parser.add_option("--pairexpect", dest="pairexpect", default=200, help="Expected paired-end length, used for calling splices in medial part of paired-end reads (default 200)")
|
|
109 parser.add_option("--pairdev", dest="pairdev", default=25, help="Allowable deviation from expected paired-end length, used for calling splices in medial part of paired-end reads (default 25)")
|
|
110
|
|
111 (options, args) = parser.parse_args()
|
|
112
|
|
113 workingDir = os.path.dirname(options.inputFastaFile)
|
|
114
|
|
115 file = open(options.inputTxt,"r")
|
|
116 lines = file.readlines()
|
|
117 inputFileNames = []
|
|
118 gsnapOutputNames = []
|
|
119 outputName = options.outputTxtFile
|
|
120 resDirName = os.path.dirname(outputName) + '/'
|
|
121 out = open(outputName, "w")
|
|
122 for line in lines:
|
|
123 timeId = time.strftime("%Y%m%d%H%M%S")
|
|
124 tab = line.split()
|
|
125 inputFileNames.append(tab[1])
|
|
126 OutputName = resDirName + tab[0] + '_samOutput_%s.sam' % timeId
|
|
127 gsnapOutputNames.append(OutputName)
|
|
128 out.write(tab[0] + '\t' + OutputName + '\n')
|
|
129 file.close()
|
|
130 out.close()
|
|
131
|
|
132 if options.pairedEndFile:
|
|
133 revFile = open(options.pairedEndFile,"r")
|
|
134 lines = revFile.readlines()
|
|
135 inputRevFileNames = []
|
|
136 for line in lines:
|
|
137 revTab = line.split()
|
|
138 inputRevFileNames.append(revTab[1])
|
|
139 revFile.close()
|
|
140
|
|
141 #Create gsnap make
|
|
142 lCmdsTuples =[]
|
|
143 acronym = "gsnap_make"
|
|
144 jobdb = TableJobAdaptatorFactory.createJobInstance()
|
|
145 iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
|
|
146 cmds = []
|
|
147 cmd_setup = "gmap_setup -d %s -D %s -k %s %s;" % (options.genomeName, workingDir, options.kmer, options.inputFastaFile)
|
|
148 cmds.append(cmd_setup)
|
|
149 cmd_make_coords = "make -f Makefile.%s coords;" % options.genomeName
|
|
150 cmds.append(cmd_make_coords)
|
|
151 cmd_make_gmapdb = "make -f Makefile.%s gmapdb;" % options.genomeName
|
|
152 cmds.append(cmd_make_gmapdb)
|
|
153 cmd_make_install = "make -f Makefile.%s install;" % options.genomeName
|
|
154 cmds.append(cmd_make_install)
|
|
155 cmd_index = iLauncher.getSystemCommand("", cmds)
|
|
156 cmd2Launch = []
|
|
157 cmdStart = []
|
|
158 cmdFinish = []
|
|
159 cmd2Launch.append(cmd_index)
|
|
160 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
|
|
161 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
|
|
162
|
|
163 acronym = "gsnap"
|
|
164 jobdb = TableJobAdaptatorFactory.createJobInstance()
|
|
165 iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
|
|
166 lCmdsTuples = []
|
|
167 dCutOut2Out = {}
|
|
168 lAllFile2remove = []
|
|
169 numberOfBatch = 20 #usually for testing, working on to find a value for default launch on galaxy
|
|
170 for i in range(len(inputFileNames)):
|
|
171 lCutOutput = []
|
|
172 for j in range(numberOfBatch):
|
|
173 cutOutput = "%s_out_%s" % (inputFileNames[i], j)
|
|
174 lCutOutput.append(cutOutput)
|
|
175 lAllFile2remove.extend(lCutOutput)
|
|
176 cmd2Launch = []
|
|
177 if options.pairedEndFile:
|
|
178 inputRevFile = inputRevFileNames[i]
|
|
179 else:
|
|
180 inputRevFile = ""
|
|
181 cmd2Launch.append(_createGsnapCommand(iLauncher, options, workingDir, inputFileNames[i], inputRevFile, cutOutput, j, numberOfBatch))
|
|
182 cmdStart = []
|
|
183 cmdFinish = []
|
|
184 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
|
|
185 dCutOut2Out[gsnapOutputNames[i]] = lCutOutput
|
|
186 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
|
|
187
|
|
188 joinSAM(dCutOut2Out)
|
|
189 FileUtils.removeFilesFromListIfExist(lAllFile2remove)
|
|
190
|
|
191 if options.outputTar != None:
|
|
192 toTar(options.outputTar, gsnapOutputNames)
|
|
193
|
|
194
|
|
195 if __name__=="__main__": __main__()
|