Mercurial > repos > yufei-luo > s_mart
view SMART/DiffExpAnal/compareOverlapping_parallel_unSQL.py @ 34:529e3e6a0954
Deleted selected files
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:35:27 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#! /usr/bin/env python #This program is a wrapp for CompareOverlapping.py. import os, sys, tarfile, optparse from commons.core.launcher.Launcher import Launcher from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory from optparse import OptionParser from commons.core.utils.FileUtils import FileUtils from commons.core.parsing.ParserChooser import ParserChooser from SMART.Java.Python.structure.TranscriptList import TranscriptList from commons.core.writer.WriterChooser import WriterChooser def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.exit() def toTar(tarFileName, overlapOutputNames): dir = os.path.dirname(tarFileName) tfile = tarfile.open(tarFileName + ".tmp.tar", "w") currentPath = os.getcwd() os.chdir(dir) for file in overlapOutputNames: relativeFileName = os.path.basename(file) tfile.add(relativeFileName) os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName)) tfile.close() os.chdir(currentPath) def _createCompareOverlappingCmd(iLauncher, options, inputFileName, annotationFile, overlapOutputName): lArgs = [] lArgs.append("-i %s" % annotationFile) lArgs.append("-f %s" % options.format1) lArgs.append("-j %s" % inputFileName) lArgs.append("-g %s" % options.format2) lArgs.append("-o %s" % overlapOutputName) if options.notOverlapping: lArgs.append("-O") if options.exclude: lArgs.append("-x") if options.distance != None: lArgs.append("-d %s" % options.distance) return(iLauncher.getSystemCommand("python %s/SMART/Java/Python/CompareOverlappingSmallQuery.py" % os.environ["REPET_PATH"], lArgs)) def _map(iLauncher, cmd, cmdStart, cmdFinish ): lCmds = [] lCmds.append(cmd) lCmdStart = [] lCmdStart.append(cmdStart) lCmdFinish = [] lCmdFinish.append(cmdFinish) return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) def split(fileName, nbOfSeqPerBatch): filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) resDir = os.path.dirname(fileName) lInputName = [] fileNb = 1 SeqNb = 0 outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt) lInputName.append(outFileName) outFile = open(outFileName, "w") f = open(fileName, "r") line = f.readline() previousRefName = "" while line != "": if not line.startswith('@SQ'): if SeqNb == nbOfSeqPerBatch: SeqNb = 0 fileNb += 1 outFile.close() outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt) lInputName.append(outFileName) outFile = open(outFileName, "w") refName = line.split("\t")[2] if previousRefName != refName: SeqNb += 1 outFile.write(line) else: previousRefName = refName outFile.write(line) line = f.readline() return lInputName def join(dCutOut2Out, options): chooser = ParserChooser() chooser.findFormat("gtf") gtfParser = chooser.getParser(options.inputFileName1) ref = {} for transcript in gtfParser.getIterator(): ref[transcript.getTagValue("ID")] = transcript for key in dCutOut2Out.keys(): writerChooser = WriterChooser() writerChooser.findFormat("gff3") for inputFile in dCutOut2Out[key]: chooser = ParserChooser() chooser.findFormat("gff") gffParser = chooser.getParser(inputFile) for transcript in gffParser.getIterator(): finalTranscript = ref[transcript.getTagValue("ID")] if finalTranscript.getTagValue("nbOverlaps"): nbOverlap = int(finalTranscript.getTagValue("nbOverlaps")) + int(transcript.getTagValue("nbOverlaps")) finalTranscript.setTagValue("nbOverlaps", nbOverlap) else: finalTranscript.setTagValue("nbOverlaps", transcript.getTagValue("nbOverlaps")) if finalTranscript.getTagValue("overlapsWith") and transcript.getTagValue("overlapsWith") != None: overlapName = "--".join([finalTranscript.getTagValue("overlapsWith"), transcript.getTagValue("overlapsWith")]) finalTranscript.setTagValue("overlapsWith", overlapName) else: if transcript.getTagValue("overlapsWith") != None: finalTranscript.setTagValue("overlapsWith", transcript.getTagValue("overlapsWith")) gffWriter = writerChooser.getWriter(key) gffWriter.setTitle("S-MART") for transcript in ref.values(): gffWriter.addTranscript(transcript) gffWriter.write() transcript.deleteTag("nbOverlaps") transcript.deleteTag("overlapsWith") gffWriter.close() def __main__(): description = "Compare Overlapping wrapp script: Get the a list of data which overlap with a reference set. [Category: Data Comparison]" parser = OptionParser(description = description) parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 (for annotation) [compulsory] [format: file in transcript format given by -f]") parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]") parser.add_option("", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files. Should identify all reads files format, given by -g [compulsory]") #parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default="inputRead", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]") #parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]") parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]") parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]") parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]") parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]") parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]") parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]") parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]") parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]") parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]") parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]") parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]") parser.add_option("-d", "--distance", dest="distance", action="store", default=None, type="int", help="accept some distance between query and reference [format: int]") parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]") parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]") parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=None, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]") parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]") parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") parser.add_option('', '--tar', dest='outputTar', default=None, help='output all SAM results in a tar file.' ) parser.add_option( '', '--outTxt', dest='outTxtFile', help='The output list of results files on txt format.[compulsory]' ) (options, args) = parser.parse_args() #Parse the input txt file and read a list of BAM files. file = open(options.inputTxt, "r") lines = file.readlines() inputFileNames = [] overlapOutputNames = [] outputName = options.outTxtFile resDirName = os.path.dirname(outputName) + "/" #Write output txt file and define all output sam file names. out = open(outputName, "w") for line in lines: tab = line.split() inputFileNames.append(tab[1]) overlapOutName = resDirName + tab[0] + '_overlapOut.gff3' overlapOutputNames.append(overlapOutName) out.write(tab[0] + '\t' + overlapOutName + '\n') file.close() out.close() #Launch on nodes acronym = "compareOverlapping" jobdb = TableJobAdaptatorFactory.createJobInstance() iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "test", acronym, acronym, False, True) #construction the commandes for each input file lCmdsTuples = [] dCutOut2Out = {} lAllFile2remove = [] for i in range(len(inputFileNames)): lCutInputFile = split(inputFileNames[i], 20000) lAllFile2remove.extend(lCutInputFile) lCutOutput = [] for cutInput in lCutInputFile: cutOutput = "%s_out" % cutInput lCutOutput.append(cutOutput) lAllFile2remove.extend(lCutOutput) cmd2Launch = _createCompareOverlappingCmd(iLauncher, options, cutInput, options.inputFileName1, cutOutput) lCmdsTuples.append(_map(iLauncher, cmd2Launch, "", "")) chooser = ParserChooser() chooser.findFormat(options.format2) dCutOut2Out[overlapOutputNames[i]] = lCutOutput iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True) join(dCutOut2Out, options) FileUtils.removeFilesFromListIfExist(lAllFile2remove) if options.outputTar != None: toTar(options.outputTar, overlapOutputNames) if __name__=="__main__": __main__()