#!/usr/bin/env python
# SMART/DiffExpAnal/tophat_parallel_unSQL.py
# changeset 32:3441fe98a2ba (m-zytnicki, Tue, 30 Apr 2013 14:34:10 -0400)

import optparse, os, shutil, subprocess, sys, tempfile, tarfile

from commons.core.launcher.Launcher import Launcher
from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
from commons.core.utils.FileUtils import FileUtils


def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.exit()


def toTar(tarFileName, accepted_hits_outputNames):
    # Archive all accepted-hits BAM files, then rename the temporary tar to
    # its final name.
    tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
    currentPath = os.getcwd()
    # Fixed: the original called os.chdir(dir), passing the builtin 'dir';
    # change to the directory actually holding the BAM files.
    os.chdir(os.path.dirname(accepted_hits_outputNames[0]))
    for fileName in accepted_hits_outputNames:
        relativeFileName = os.path.basename(fileName)
        tfile.add(relativeFileName)
    # Close the archive (so it is fully flushed) and restore the working
    # directory before renaming.
    tfile.close()
    os.chdir(currentPath)
    os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))


def splitFastQ(fileName, nbOfSeqPerBatch):
    # Split a FASTQ file into batches of nbOfSeqPerBatch reads (4 lines per
    # read); return the list of batch file names.
    nbOfLinesPerFile = nbOfSeqPerBatch * 4
    lOutput = []
    filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
    resDir = os.path.dirname(fileName)
    with open(fileName) as inF:
        fileNb = 1
        line = inF.readline()
        if not line or nbOfLinesPerFile == 0:
            # Empty input or no batching requested: copy the file as a
            # single batch.
            outFileName = "%s/%s-%s%s" % (resDir, filePrefix, fileNb, fileExt)
            lOutput.append(outFileName)
            f = open(outFileName, "wb")
            shutil.copyfileobj(open(fileName, "rb"), f)
            f.close()
        else:
            while line:
                outFileName = "%s/%s-%s%s" % (resDir, filePrefix, fileNb, fileExt)
                lOutput.append(outFileName)
                with open(outFileName, "w") as outF:
                    lineNb = 1
                    while lineNb <= nbOfLinesPerFile and line:
                        outF.write(line)
                        line = inF.readline()
                        lineNb += 1
                fileNb += 1
    return lOutput


def joinBAM(dCutOut2Out):
    # Merge the per-batch BAM files of each sample back into a single BAM.
    # At most 225 files are merged per samtools call; beyond that, the
    # partial result is moved to tmpBAM and merged with the remaining files.
    for key in dCutOut2Out.keys():
        fh = open(key, "w")
        fh.close()
        nbFile = 0
        cmd = "samtools merge -f %s" % key
        for fileName in dCutOut2Out[key]:
            nbFile = nbFile + 1
            if nbFile < 225:
                cmd += " %s" % fileName
            else:
                nbFile = 0
                cmd += ";mv %s tmpBAM;" % key
                cmd += "samtools merge -f %s tmpBAM %s" % (key, fileName)
        proc = subprocess.Popen(args=cmd, shell=True)
        returncode = proc.wait()


def _map(iLauncher, cmd, cmdStart, cmdFinish):
    lCmds = []
    lCmds.extend(cmd)
    lCmdStart = []
    lCmdStart.extend(cmdStart)
    lCmdFinish = []
    lCmdFinish.extend(cmdFinish)
    return iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)
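# How the helpers above are used by __main__ below: each input FASTQ is
# split into 20000-read batches with splitFastQ(), one TopHat job per batch
# is submitted through the commons Launcher, the per-batch BAM outputs are
# merged back into one BAM per sample with joinBAM(), and the batch files
# are removed afterwards.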
def _createTopHatCommand(iLauncher, options, index_paths, inputFileNames, inputRevFilesNames, space):
    lArgs = []
    lArgs.append('-p %s %s' % (options.num_threads, space))
    if options.single_paired == 'paired':
        lArgs.append('-r %s ' % options.mate_inner_dist)
    if options.settings == 'preSet':
        lArgs.append(index_paths)
        lArgs.append(inputFileNames)
        if options.input2:
            lArgs.append(inputRevFilesNames)
        return iLauncher.getSystemCommand("tophat", lArgs)
    else:
        if int(options.min_anchor_length) >= 3:
            lArgs.append('-a %s ' % options.min_anchor_length)
        else:
            raise Exception('Minimum anchor length must be 3 or greater')
        lArgs.append('-m %s ' % options.splice_mismatches)
        lArgs.append('-i %s ' % options.min_intron_length)
        lArgs.append('-I %s ' % options.max_intron_length)
        if float(options.junction_filter) != 0.0:
            lArgs.append('-F %s ' % options.junction_filter)
        lArgs.append('-g %s ' % options.max_multihits)
        # Custom junctions options.
        if options.gene_model_annotations:
            lArgs.append('-G %s ' % options.gene_model_annotations)
        if options.raw_juncs:
            lArgs.append('-j %s ' % options.raw_juncs)
        if options.no_novel_juncs:
            lArgs.append('--no-novel-juncs ')
        if options.library_type:
            lArgs.append('--library-type %s ' % options.library_type)
        if options.no_novel_indels:
            lArgs.append('--no-novel-indels ')
        else:
            if options.max_insertion_length:
                lArgs.append('--max-insertion-length %i ' % int(options.max_insertion_length))
            if options.max_deletion_length:
                lArgs.append('--max-deletion-length %i ' % int(options.max_deletion_length))
            # The max insertion/deletion length options do not work in TopHat
            # v1.2.0, despite documentation to the contrary (fixed in v1.3.1).
        # Search type options.
        if options.coverage_search:
            lArgs.append('--coverage-search --min-coverage-intron %s --max-coverage-intron %s ' % (options.min_coverage_intron, options.max_coverage_intron))
        else:
            lArgs.append('--no-coverage-search ')
        if options.closure_search:
            lArgs.append('--closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s ' % (options.min_closure_exon, options.min_closure_intron, options.max_closure_intron))
        else:
            lArgs.append('--no-closure-search ')
        if options.microexon_search:
            lArgs.append('--microexon-search ')
        if options.single_paired == 'paired':
            lArgs.append('--mate-std-dev %s ' % options.mate_std_dev)
        if options.initial_read_mismatches:
            lArgs.append('--initial-read-mismatches %d ' % int(options.initial_read_mismatches))
        if options.seg_mismatches:
            lArgs.append('--segment-mismatches %d ' % int(options.seg_mismatches))
        if options.seg_length:
            lArgs.append('--segment-length %d ' % int(options.seg_length))
        if options.min_segment_intron:
            lArgs.append('--min-segment-intron %d ' % int(options.min_segment_intron))
        if options.max_segment_intron:
            lArgs.append('--max-segment-intron %d ' % int(options.max_segment_intron))
        lArgs.append(index_paths)
        lArgs.append(inputFileNames)
        if options.input2:
            lArgs.append(inputRevFilesNames)
        return iLauncher.getSystemCommand("tophat", lArgs)
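# Illustration only (hypothetical values): with --settings preSet, a
# paired-end library, -p 2 and -r 200, _createTopHatCommand() builds a
# command of the form:
#   tophat -p 2 -r 200 <index_path> reads_1.fastq reads_2.fastq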
def __main__():
    # Parse the command line.
    parser = optparse.OptionParser()
    parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='For the differential expression analysis pipeline, this option gives a txt output containing the list of mapping results.')
    parser.add_option('-t', '--tar', dest='outputTar', default=None, help='Output all accepted hits results in a tar file.')
    parser.add_option('-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.')
    parser.add_option('-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data.')
    parser.add_option('-J', '--junctions-output', dest='junctions_output_file', default='junctions_output.bed', help='Junctions output file; format is BED.')
    parser.add_option('-H', '--hits-output', dest='accepted_hits_output_file', default='hits_output.bam', help='Accepted hits output file; format is BAM.')
    parser.add_option('', '--own-file', dest='own_file', help='')
    parser.add_option('-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.')
    parser.add_option('-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. For example, for paired-end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to 200. There is no default, and this parameter is required for paired-end runs.')
    parser.add_option('', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of the distribution of inner distances between mate pairs.')
    parser.add_option('-a', '--min-anchor-length', dest='min_anchor_length', help='The "anchor length": TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.')
    parser.add_option('-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.')
    parser.add_option('-i', '--min-intron-length', dest='min_intron_length', help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.')
    parser.add_option('-I', '--max-intron-length', dest='max_intron_length', help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.')
    parser.add_option('-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage).')
    parser.add_option('-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed.')
    parser.add_option('', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping.')
    parser.add_option('', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently.')
    parser.add_option('', '--seg-length', dest='seg_length', help='Minimum length of read segments.')
    parser.add_option('', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Supply a library type option to select the correct RNA-seq protocol.')
    parser.add_option('', '--allow-indels', action='store_true', help='Allow indel search. Indel search is disabled by default. (Not used since version 1.3.0.)')
    parser.add_option('', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.')
    parser.add_option('', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.')
    # Options for supplying own junctions.
    parser.add_option('-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping.')
    parser.add_option('-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: <chrom> <left> <right> <+/->. left and right are zero-based coordinates, and specify the last character of the left sequence to be spliced to the first character of the right sequence, inclusive.')
    parser.add_option('', '--no-novel-juncs', action='store_true', dest='no_novel_juncs', help='Only look for junctions indicated in the supplied GFF file. (Ignored without -G.)')
    parser.add_option('', '--no-novel-indels', action='store_true', dest='no_novel_indels', help='Skip indel search. Indel search is enabled by default.')
    # Types of search.
    parser.add_option('', '--microexon-search', action='store_true', dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
    parser.add_option('', '--closure-search', action='store_true', dest='closure_search', help='Enables the mate-pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp).')
    parser.add_option('', '--no-closure-search', action='store_false', dest='closure_search')
    parser.add_option('', '--coverage-search', action='store_true', dest='coverage_search', help='Enables the coverage-based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
    parser.add_option('', '--no-coverage-search', action='store_false', dest='coverage_search')
    parser.add_option('', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search.')
    parser.add_option('', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search.')
    parser.add_option('', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in the potential splice graph.')
    parser.add_option('', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search.')
    parser.add_option('', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search.')
    parser.add_option('', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search.')
    parser.add_option('', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search.')
    # Wrapper options.
    parser.add_option('-1', '--input1', dest='input1', help='A txt file listing the (forward or single-end) reads files, in Sanger FASTQ format.')
    parser.add_option('-2', '--input2', dest='input2', help='A txt file listing the reverse reads files, in Sanger FASTQ format.')
    parser.add_option('', '--single-paired', dest='single_paired', help='')
    parser.add_option('', '--settings', dest='settings', help='')
    (options, args) = parser.parse_args()

    # Output the version of TopHat.
    try:
        tmp_files = []
        tmp = tempfile.NamedTemporaryFile().name
        tmp_files.append(tmp)
        tmp_stdout = open(tmp, 'wb')
        proc = subprocess.Popen(args='tophat -v', shell=True, stdout=tmp_stdout)
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = open(tmp_stdout.name, 'rb').readline().strip()
        if stdout:
            sys.stdout.write('%s\n' % stdout)
        else:
            raise Exception
    except:
        sys.stdout.write('Could not determine TopHat version\n')
    # Color space or base space.
    space = ''
    if options.color_space:
        space = '-C'

    # Read the list of input FASTQ files: each line of the input txt file
    # holds a sample name and a file path, separated by whitespace.
    inFile = open(options.input1, "r")
    lines = inFile.readlines()
    inputFileNames = []
    accepted_hits_outputNames = []
    outputName = options.outputTxtFile
    resDirName = os.path.dirname(outputName) + '/'
    out = open(outputName, "w")
    for line in lines:
        tab = line.split()
        inputFileNames.append(tab[1])
        aHitOutName = resDirName + tab[0] + '_' + options.accepted_hits_output_file
        accepted_hits_outputNames.append(aHitOutName)
        out.write(tab[0] + '\t' + aHitOutName + '\n')
    inFile.close()
    out.close()
    if options.input2:
        revFile = open(options.input2, "r")
        lines = revFile.readlines()
        inputRevFileNames = []
        for line in lines:
            revTab = line.split()
            inputRevFileNames.append(revTab[1])
        revFile.close()
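    # Hypothetical example of the expected --input1 list (one sample name
    # and one FASTQ path per line, whitespace-separated):
    #   sample1   /data/reads/sample1.fastq
    #   sample2   /data/reads/sample2.fastq
    # With the default -H value, the merged result for sample1 is written to
    # <dir of outputTxtFile>/sample1_hits_output.bam.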
    # Create the bowtie index if necessary.
    tmp_index_dirs = []
    index_paths = []
    tmp_index_dir = tempfile.mkdtemp(dir="%s" % os.getcwd())
    tmp_index_dirs.append(tmp_index_dir)
    if options.own_file:
        index_path = os.path.join(tmp_index_dir, '.'.join(os.path.split(options.own_file)[1].split('.')[:-1]))
        index_paths.append(index_path)
        try:
            os.link(options.own_file, index_path + '.fa')
        except:
            # TopHat prefers (but does not require) the fasta file to be in
            # the same directory, with a .fa extension.
            pass
        lCmdsTuples = []
        acronym = "tophat_index"
        jobdb = TableJobAdaptatorFactory.createJobInstance()
        iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
        cmd_index = iLauncher.getSystemCommand("bowtie-build", [space, "-f %s" % options.own_file, index_path])
        cmd2Launch = []
        cmdStart = []
        cmdFinish = []
        cmd2Launch.append(cmd_index)
        lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
        iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
    else:
        for fileName in inputFileNames:
            tmp_index_dir = tempfile.mkdtemp()
            index_path = tmp_index_dir + '/' + os.path.basename(fileName).split('.')[0]
            index_paths.append(index_path)
            tmp_index_dirs.append(tmp_index_dir)

    acronym = "tophat"
    jobdb = TableJobAdaptatorFactory.createJobInstance()
    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
    lCmdsTuples = []
    dCutOut2Out = {}
    lAllFile2remove = []
    for i in range(len(inputFileNames)):
        lCutOutput = []
        # Split each FASTQ file into batches of 20000 reads.
        lCutInputFile = splitFastQ(inputFileNames[i], 20000)
        lAllFile2remove.extend(lCutInputFile)
        if options.input2:
            lCutPairInputFile = splitFastQ(inputRevFileNames[i], 20000)
            lAllFile2remove.extend(lCutPairInputFile)
        for j in range(len(lCutInputFile)):
            cutOutput = "%s_out" % lCutInputFile[j]
            lCutOutput.append(cutOutput)
            # Fixed: the original extended lAllFile2remove with the whole
            # lCutOutput list on each iteration, duplicating entries.
            lAllFile2remove.append(cutOutput)
            cmd2Launch = []
            if options.input2:
                inputRevFile = lCutPairInputFile[j]
            else:
                inputRevFile = ""
            if options.own_file:
                cmd2Launch.append(_createTopHatCommand(iLauncher, options, index_paths[0], lCutInputFile[j], inputRevFile, space))
            else:
                cmd2Launch.append(_createTopHatCommand(iLauncher, options, index_paths[i], lCutInputFile[j], inputRevFile, space))
            cmdStart = []
            # After each TopHat job, copy the accepted hits of the batch to
            # its own output file.
            cmdFinish = ["shutil.copyfile( os.path.join( 'tophat_out', 'accepted_hits.bam' ), '%s')" % cutOutput]
            lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
        dCutOut2Out[accepted_hits_outputNames[i]] = lCutOutput
    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)

    # Merge the per-batch BAM files and remove the intermediate files.
    joinBAM(dCutOut2Out)
    FileUtils.removeFilesFromListIfExist(lAllFile2remove)

    if options.outputTar != None:
        toTar(options.outputTar, accepted_hits_outputNames)

    # Clean up temporary directories and files.
    for tmp_index_dir in tmp_index_dirs:
        if os.path.exists(tmp_index_dir):
            shutil.rmtree(tmp_index_dir)
    for tmp in tmp_files:
        os.remove(tmp)


if __name__ == "__main__":
    __main__()
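# Example invocation (hypothetical paths; assumes tophat, bowtie-build and
# samtools are on the PATH and the commons job tables are configured):
#   python tophat_parallel_unSQL.py --settings preSet --single-paired single \
#       --own-file /data/genome.fa -1 samples.txt -o results.txt -p 4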