18
|
1
|
|
2 #!/usr/bin/env python
|
|
3
|
|
4 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob
|
|
5 from commons.core.launcher.Launcher import Launcher
|
|
6 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
|
|
7 from commons.core.utils.FileUtils import FileUtils
|
|
8
|
|
def stop_err( msg ):
    """Print *msg* on stderr, then terminate the process."""
    sys.stderr.write("%s\n" % msg)
    sys.exit()
|
|
12
|
|
def toTar(tarFileName, accepted_hits_outputNames):
    """Pack the accepted-hits files into a tar archive at *tarFileName*.

    Each file is stored under its base name only. The archive is first
    written as "<tarFileName>.tmp.tar" and renamed once it is complete,
    so a partially written archive never sits at the final path.

    Fixes over the previous version:
    - os.chdir(dir) passed the *builtin* ``dir`` (TypeError); we now chdir
      into each file's own directory instead.
    - the archive was moved while still open; it is now closed first.
    """
    currentPath = os.getcwd()
    tmpTarName = tarFileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarName, "w")
    try:
        for fileName in accepted_hits_outputNames:
            # Add from inside the file's directory so only the base name
            # is recorded in the archive (original intent of the chdir).
            fileDir = os.path.dirname(fileName) or "."
            os.chdir(fileDir)
            try:
                tfile.add(os.path.basename(fileName))
            finally:
                os.chdir(currentPath)
    finally:
        # Close BEFORE renaming: moving a still-open tar can truncate it.
        tfile.close()
        os.chdir(currentPath)
    shutil.move(tmpTarName, tarFileName)
|
|
23
|
|
def splitFastQ(fileName, nbOfSeqPerBatch):
    """Split a FASTQ file into batches of *nbOfSeqPerBatch* records.

    Batch files are written next to the input as "<prefix>-<n><ext>"
    (n starting at 1) and the list of their paths is returned.
    An empty input file, or a batch size of 0, produces a single output
    file that is a byte-for-byte copy of the input.

    Fix over the previous version: the extra read handle opened for the
    whole-file-copy branch was never closed (resource leak).
    """
    nbOfLinesPerFile = nbOfSeqPerBatch * 4  # 4 lines per FASTQ record
    lOutput = []
    filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
    resDir = os.path.dirname(fileName)
    with open(fileName) as inF:
        fileNb = 1
        line = inF.readline()
        if not line or nbOfLinesPerFile == 0:
            # Degenerate case: emit one file that is a plain copy of the input.
            outFileName = "%s/%s-%s%s" % (resDir, filePrefix, fileNb, fileExt)
            lOutput.append(outFileName)
            with open(fileName, "rb") as srcF:        # was leaked before
                with open(outFileName, "wb") as outF:
                    shutil.copyfileobj(srcF, outF)
        else:
            while line:
                outFileName = "%s/%s-%s%s" % (resDir, filePrefix, fileNb, fileExt)
                lOutput.append(outFileName)
                with open(outFileName, "w") as outF:
                    lineNb = 1
                    while lineNb <= nbOfLinesPerFile and line:
                        outF.write(line)
                        line = inF.readline()
                        lineNb += 1
                fileNb += 1
    return lOutput
|
|
50
|
|
def joinBAM(dCutOut2Out):
    """Merge each target BAM from its list of partial BAMs with samtools.

    *dCutOut2Out* maps a final BAM path to the list of per-batch BAM files
    that must be merged into it. Because one "samtools merge" call only
    takes a bounded number of inputs, every 225 files the intermediate
    result is rotated into "tmpBAM" and merged back in.
    """
    for outName in dCutOut2Out.keys():
        # Ensure the target exists; samtools merge -f then overwrites it.
        handle = open(outName, "w")
        handle.close()
        partCount = 0
        mergeCmd = "samtools merge -f %s" % outName
        for partName in dCutOut2Out[outName]:
            partCount += 1
            if partCount < 225:
                mergeCmd += " %s" % partName
            else:
                # Rotate the partial merge into tmpBAM and start a new merge.
                partCount = 0
                mergeCmd += ";mv %s tmpBAM;" % (outName)
                mergeCmd += "samtools merge -f %s tmpBAM %s" % (outName, partName)
        child = subprocess.Popen( args=mergeCmd , shell=True)
        child.wait()
|
|
67
|
|
68
|
|
69 def _map(iLauncher, cmd, cmdStart, cmdFinish ):
|
|
70 lCmds = []
|
|
71 lCmds.extend(cmd)
|
|
72 lCmdStart = []
|
|
73 lCmdStart.extend(cmdStart)
|
|
74 lCmdFinish = []
|
|
75 lCmdFinish.extend(cmdFinish)
|
|
76 return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
|
|
77
|
|
78 def _createTopHatCommand(iLauncher, options, index_paths, inputFileNames, inputRevFilesNames, space):
|
|
79 lArgs = []
|
|
80 lArgs.append('-p %s %s' % ( options.num_threads, space ))
|
|
81 if options.single_paired == 'paired':
|
|
82 lArgs.append('-r %s ' % options.mate_inner_dist)
|
|
83 if options.settings == 'preSet':
|
|
84 lArgs.append(index_paths)
|
|
85 lArgs.append(inputFileNames)
|
|
86 if options.input2:
|
|
87 lArgs.append(inputRevFilesNames)
|
|
88 return iLauncher.getSystemCommand("tophat", lArgs)
|
|
89 else:
|
|
90 if int( options.min_anchor_length ) >= 3:
|
|
91 lArgs.append('-a %s ' % options.min_anchor_length)
|
|
92 else:
|
|
93 raise Exception, 'Minimum anchor length must be 3 or greater'
|
|
94 lArgs.append('-m %s ' % options.splice_mismatches)
|
|
95 lArgs.append('-i %s ' % options.min_intron_length)
|
|
96 lArgs.append('-I %s ' % options.max_intron_length)
|
|
97 if float( options.junction_filter ) != 0.0:
|
|
98 lArgs.append('-F %s ' % options.junction_filter)
|
|
99 lArgs.append('-g %s ' % options.max_multihits)
|
|
100 # Custom junctions options.
|
|
101 if options.gene_model_annotations:
|
|
102 lArgs.append('-G %s ' % options.gene_model_annotations)
|
|
103 if options.raw_juncs:
|
|
104 lArgs.append('-j %s ' % options.raw_juncs)
|
|
105 if options.no_novel_juncs:
|
|
106 lArgs.append('--no-novel-juncs ')
|
|
107 if options.library_type:
|
|
108 lArgs.append('--library-type %s ' % options.library_type)
|
|
109 if options.no_novel_indels:
|
|
110 lArgs.append('--no-novel-indels ')
|
|
111 else:
|
|
112 if options.max_insertion_length:
|
|
113 lArgs.append('--max-insertion-length %i ' % int( options.max_insertion_length ))
|
|
114 if options.max_deletion_length:
|
|
115 lArgs.append('--max-deletion-length %i ' % int( options.max_deletion_length ))
|
|
116 # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1)
|
|
117 # need to warn user of this fact
|
|
118 #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" )
|
|
119
|
|
120 # Search type options.
|
|
121 if options.coverage_search:
|
|
122 lArgs.append('--coverage-search --min-coverage-intron %s --max-coverage-intron %s ' % ( options.min_coverage_intron, options.max_coverage_intron ))
|
|
123 else:
|
|
124 lArgs.append('--no-coverage-search ')
|
|
125 if options.closure_search:
|
|
126 lArgs.append('--closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s ' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ))
|
|
127 else:
|
|
128 lArgs.append('--no-closure-search ')
|
|
129 if options.microexon_search:
|
|
130 lArgs.append('--microexon-search ')
|
|
131 if options.single_paired == 'paired':
|
|
132 lArgs.append('--mate-std-dev %s ' % options.mate_std_dev)
|
|
133 if options.initial_read_mismatches:
|
|
134 lArgs.append('--initial-read-mismatches %d ' % int( options.initial_read_mismatches ))
|
|
135 if options.seg_mismatches:
|
|
136 lArgs.append('--segment-mismatches %d ' % int( options.seg_mismatches ))
|
|
137 if options.seg_length:
|
|
138 lArgs.append('--segment-length %d ' % int( options.seg_length ))
|
|
139 if options.min_segment_intron:
|
|
140 lArgs.append('--min-segment-intron %d ' % int( options.min_segment_intron ))
|
|
141 if options.max_segment_intron:
|
|
142 lArgs.append('--max-segment-intron %d ' % int( options.max_segment_intron ))
|
|
143 lArgs.append(index_paths)
|
|
144 lArgs.append(inputFileNames)
|
|
145 if options.input2:
|
|
146 lArgs.append(inputRevFilesNames)
|
|
147 return iLauncher.getSystemCommand("tophat", lArgs)
|
|
148
|
|
149
|
|
150
|
|
def __main__():
    """TopHat wrapper entry point.

    Parses the command line, reads the sample list, splits each FASTQ input
    into 20000-read batches, maps every batch with TopHat through the job
    launcher, merges the per-batch BAMs into one BAM per sample, and
    optionally tars all results. Temporary index dirs and files are removed
    at the end.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.')
    parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' )
    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
    parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' )
    parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', default='junctions_output.bed', help='Junctions output file; formate is BED.' )
    parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', default='hits_output.bam', help='Accepted hits output file; formate is BAM.' )
    parser.add_option( '', '--own-file', dest='own_file', help='' )
    parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
    parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
                        For, example, for paired end runs with fragments selected at 300bp, \
                        where each end is 50bp, you should set -r to be 200. There is no default, \
                        and this parameter is required for paired end runs.')
    parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' )
    parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length',
                        help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
    parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
    parser.add_option( '-i', '--min-intron-length', dest='min_intron_length',
                        help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
    parser.add_option( '-I', '--max-intron-length', dest='max_intron_length',
                        help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
    parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' )
    parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
    parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' )
    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
    parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' )
    parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' )
    parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' )
    parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' )

    # Options for supplying own junctions
    parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \
                        TopHat will use the exon records in this file to build \
                        a set of known splice junctions for each gene, and will \
                        attempt to align reads to these junctions even if they \
                        would not normally be covered by the initial mapping.')
    parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \
                        specified one per line, in a tab-delimited format. Records \
                        look like: <chrom> <left> <right> <+/-> left and right are \
                        zero-based coordinates, and specify the last character of the \
                        left sequenced to be spliced to the first character of the right \
                        sequence, inclusive.')
    parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \
                        supplied GFF file. (ignored without -G)")
    parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.")
    # Types of search.
    parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
    parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
    parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
    parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
    parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' )
    parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' )
    parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' )
    parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' )
    parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' )
    parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
    parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
    parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )

    # Wrapper options.
    parser.add_option( '-1', '--input1', dest='input1', help='A list of the (forward or single-end) reads files of Sanger FASTQ format, txt format' )
    parser.add_option( '-2', '--input2', dest='input2', help='The list of reverse reads file in Sanger FASTQ format' )
    parser.add_option( '', '--single-paired', dest='single_paired', help='' )
    parser.add_option( '', '--settings', dest='settings', help='' )

    (options, args) = parser.parse_args()

    # output version # of tool
    # Capture "tophat -v" output into a temp file; failure to determine the
    # version is reported but never fatal.
    try:
        tmp_files = []
        tmp = tempfile.NamedTemporaryFile().name
        tmp_files.append(tmp)
        tmp_stdout = open( tmp, 'wb' )
        proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout )
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = open( tmp_stdout.name, 'rb' ).readline().strip()
        if stdout:
            sys.stdout.write( '%s\n' % stdout )
        else:
            raise Exception
    except:
        sys.stdout.write( 'Could not determine Tophat version\n' )

    # Color or base space
    space = ''
    if options.color_space:
        space = '-C'

    #reads = options.input1
    # input1 is a whitespace-separated list file: <sample_name> <fastq_path>
    # per line. The output txt file maps each sample name to its final BAM.
    file = open(options.input1,"r")
    lines = file.readlines()
    inputFileNames = []
    accepted_hits_outputNames = []
    outputName = options.outputTxtFile
    resDirName = os.path.dirname(outputName) + '/'
    out = open(outputName, "w")
    for line in lines:
        tab = line.split()
        inputFileNames.append(tab[1])
        aHitOutName = resDirName + tab[0] + '_' + options.accepted_hits_output_file
        accepted_hits_outputNames.append(aHitOutName)
        out.write(tab[0] + '\t' + aHitOutName + '\n')
    file.close()
    out.close()

    # Same list format for the reverse reads in paired-end mode.
    if options.input2:
        revFile = open(options.input2,"r")
        lines = revFile.readlines()
        inputRevFileNames = []
        for line in lines:
            revTab = line.split()
            inputRevFileNames.append(revTab[1])
        revFile.close()

    # Creat bowtie index if necessary.
    tmp_index_dirs = []
    index_paths = []
    tmp_index_dir = tempfile.mkdtemp(dir="%s" % os.getcwd())
    tmp_index_dirs.append(tmp_index_dir)
    if options.own_file:
        # Build a bowtie index from the user-supplied reference fasta
        # inside a temp dir, via a launcher job.
        index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) )
        index_paths.append(index_path)
        try:
            os.link( options.own_file, index_path + '.fa' )
        except:
            # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension
            pass
        lCmdsTuples =[]
        acronym = "tophat_index"
        jobdb = TableJobAdaptatorFactory.createJobInstance()
        iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
        cmd_index = iLauncher.getSystemCommand("bowtie-build", [space, "-f %s" % options.own_file, index_path])
        cmd2Launch = []
        cmdStart = []
        cmdFinish = []
        cmd2Launch.append(cmd_index)
        lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
        iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
    else:
        # No reference supplied: derive one index path per input file.
        for file in inputFileNames:
            tmp_index_dir = tempfile.mkdtemp()
            index_path = tmp_index_dir + '/' + os.path.basename(file).split('.')[0]
            index_paths.append(index_path)
            tmp_index_dirs.append(tmp_index_dir)

    # Map every 20000-read batch of every sample as one launcher job.
    acronym = "tophat"
    jobdb = TableJobAdaptatorFactory.createJobInstance()
    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
    lCmdsTuples = []
    dCutOut2Out = {}
    # All batch inputs and batch outputs are deleted after the final merge.
    lAllFile2remove = []
#    for inputFileName in inputFileNames:
    for i in range(len(inputFileNames)):
        lCutOutput = []
        lCutInputFile = splitFastQ(inputFileNames[i], 20000)
        lAllFile2remove.extend(lCutInputFile)
        if options.input2:
            lCutPairInputFile = splitFastQ(inputRevFileNames[i], 20000)
            lAllFile2remove.extend(lCutPairInputFile)
        for j in range(len(lCutInputFile)):
            cutOutput = "%s_out" % lCutInputFile[j]
            lCutOutput.append(cutOutput)
            lAllFile2remove.extend(lCutOutput)
            cmd2Launch = []
            if options.input2:
                inputRevFile = lCutPairInputFile[j]
            else:
                inputRevFile = ""
            # With a user reference there is a single shared index; otherwise
            # each input file has its own index path.
            if options.own_file:
                cmd2Launch.append(_createTopHatCommand(iLauncher, options, index_paths[0], lCutInputFile[j], inputRevFile, space))
            else:
                cmd2Launch.append(_createTopHatCommand(iLauncher, options, index_paths[i], lCutInputFile[j], inputRevFile, space))
            cmdStart = []
            # After TopHat finishes, the job copies its accepted_hits.bam to
            # the batch output path picked up by the merge step.
            cmdFinish = ["shutil.copyfile( os.path.join( 'tophat_out', 'accepted_hits.bam' ), '%s')" % cutOutput]
            lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
        dCutOut2Out[accepted_hits_outputNames[i]] = lCutOutput
    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)

    # Merge the per-batch BAMs into one BAM per sample, then drop the batches.
    joinBAM(dCutOut2Out)
    FileUtils.removeFilesFromListIfExist(lAllFile2remove)

    if options.outputTar != None:
        toTar(options.outputTar, accepted_hits_outputNames)

    # Clean up temp dirs
    for tmp_index_dir in tmp_index_dirs:
        if os.path.exists( tmp_index_dir ):
            shutil.rmtree( tmp_index_dir )

    for tmp in tmp_files:
        os.remove(tmp)
|
|
351
|
|
352
|
|
# Run the wrapper only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()
|