diff SMART/DiffExpAnal/compareOverlapping_parallel_unSQL.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/DiffExpAnal/compareOverlapping_parallel_unSQL.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,205 @@
+#! /usr/bin/env python
+#This program is a wrapp for CompareOverlapping.py.
+import os, sys, tarfile, optparse
+from commons.core.launcher.Launcher import Launcher
+from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
+from optparse import OptionParser
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.parsing.ParserChooser import ParserChooser
+from SMART.Java.Python.structure.TranscriptList import TranscriptList
+from commons.core.writer.WriterChooser import WriterChooser
+
+def stop_err( msg ):
+	sys.stderr.write( "%s\n" % msg )
+	sys.exit()
+
+def toTar(tarFileName, overlapOutputNames):
+	dir = os.path.dirname(tarFileName)	
+	tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
+	currentPath = os.getcwd()
+	os.chdir(dir)
+	for file in overlapOutputNames:
+		relativeFileName = os.path.basename(file)
+		tfile.add(relativeFileName)
+	os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
+	tfile.close()
+	os.chdir(currentPath)
+
+def _createCompareOverlappingCmd(iLauncher, options, inputFileName, annotationFile, overlapOutputName):
+	lArgs = []
+	lArgs.append("-i %s" % annotationFile)
+	lArgs.append("-f %s" % options.format1)
+	lArgs.append("-j %s" % inputFileName)
+	lArgs.append("-g %s" % options.format2)
+	lArgs.append("-o %s" % overlapOutputName)
+	if options.notOverlapping:
+		lArgs.append("-O")
+	if options.exclude:
+		lArgs.append("-x")
+	if options.distance != None:
+		lArgs.append("-d %s" % options.distance)
+	return(iLauncher.getSystemCommand("python %s/SMART/Java/Python/CompareOverlappingSmallQuery.py"  %  os.environ["REPET_PATH"], lArgs))
+
+def _map(iLauncher, cmd, cmdStart, cmdFinish ):
+	lCmds = []
+	lCmds.append(cmd)
+	lCmdStart = []
+	lCmdStart.append(cmdStart)
+	lCmdFinish = []
+	lCmdFinish.append(cmdFinish)
+	return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
+
+def split(fileName, nbOfSeqPerBatch):
+	filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
+	resDir = os.path.dirname(fileName)
+	lInputName = []
+	fileNb = 1
+	SeqNb = 0
+	outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt)
+	lInputName.append(outFileName)
+	outFile = open(outFileName, "w")
+	f = open(fileName, "r")
+	line = f.readline()
+	previousRefName = ""
+	while line != "":
+		if not line.startswith('@SQ'):
+			if SeqNb == nbOfSeqPerBatch:
+				SeqNb = 0
+				fileNb += 1
+				outFile.close()
+				outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt)
+				lInputName.append(outFileName)
+				outFile = open(outFileName, "w")
+			refName = line.split("\t")[2]
+			if previousRefName != refName:
+				SeqNb += 1
+				outFile.write(line)
+			else:
+				previousRefName = refName
+				outFile.write(line)
+		line = f.readline()
+	return lInputName		
+
+def join(dCutOut2Out, options):
+	chooser = ParserChooser()
+	chooser.findFormat("gtf")
+	gtfParser = chooser.getParser(options.inputFileName1)
+	ref = {}
+	for transcript in gtfParser.getIterator():
+		ref[transcript.getTagValue("ID")] = transcript
+	for key in dCutOut2Out.keys():
+		writerChooser = WriterChooser()
+		writerChooser.findFormat("gff3")
+		for inputFile in dCutOut2Out[key]:
+			chooser = ParserChooser()
+			chooser.findFormat("gff")
+			gffParser = chooser.getParser(inputFile)
+			for transcript in gffParser.getIterator():
+					finalTranscript = ref[transcript.getTagValue("ID")]
+					if finalTranscript.getTagValue("nbOverlaps"):
+						nbOverlap = int(finalTranscript.getTagValue("nbOverlaps")) + int(transcript.getTagValue("nbOverlaps"))
+						finalTranscript.setTagValue("nbOverlaps", nbOverlap)
+					else:
+						finalTranscript.setTagValue("nbOverlaps", transcript.getTagValue("nbOverlaps"))
+					
+					if finalTranscript.getTagValue("overlapsWith") and transcript.getTagValue("overlapsWith") != None:
+						overlapName = "--".join([finalTranscript.getTagValue("overlapsWith"), transcript.getTagValue("overlapsWith")])
+						finalTranscript.setTagValue("overlapsWith", overlapName)
+					else:
+						if transcript.getTagValue("overlapsWith") != None:
+							finalTranscript.setTagValue("overlapsWith", transcript.getTagValue("overlapsWith"))
+
+		gffWriter = writerChooser.getWriter(key)
+		gffWriter.setTitle("S-MART")
+		for transcript in ref.values():
+				gffWriter.addTranscript(transcript)
+				gffWriter.write()
+				transcript.deleteTag("nbOverlaps")
+				transcript.deleteTag("overlapsWith")
+		gffWriter.close()	
+		
+def __main__():
+	description = "Compare Overlapping wrapp script: Get the a list of data which overlap with a reference set. [Category: Data Comparison]"
+	parser = OptionParser(description = description)
+	parser.add_option("-i", "--input1",		   dest="inputFileName1", action="store",					 type="string", help="input file 1 (for annotation) [compulsory] [format: file in transcript format given by -f]")
+	parser.add_option("-f", "--format1",		  dest="format1",		action="store",					 type="string", help="format of file 1 [compulsory] [format: transcript file format]")
+	parser.add_option("", "--inputTxt", 		dest="inputTxt", 		action="store", 				type="string", 	help="input, a txt file for a list of input reads files. Should identify all reads files format, given by -g [compulsory]")
+	#parser.add_option("-j", "--input2",		   dest="inputFileName2", action="store",	default="inputRead",	 type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
+	parser.add_option("-g", "--format2",		  dest="format2",		action="store",				 type="string", help="format of file 2 [compulsory] [format: transcript file format]")
+	#parser.add_option("-o", "--output",		   dest="output",		 action="store",	  default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+	parser.add_option("-S", "--start1",		   dest="start1",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
+	parser.add_option("-s", "--start2",		   dest="start2",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
+	parser.add_option("-U", "--end1",			 dest="end1",		   action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
+	parser.add_option("-u", "--end2",			 dest="end2",		   action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
+	parser.add_option("-t", "--intron",		   dest="introns",		action="store_true", default=False,				help="also report introns [format: bool] [default: false]")
+	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 1 [format: int]")
+	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 2 [format: int]")
+	parser.add_option("-N", "--3primeExtension1", dest="threePrime1",	action="store",	  default=None,  type="int",	help="extension towards 3' in file 1 [format: int]")
+	parser.add_option("-n", "--3primeExtension2", dest="threePrime2",	action="store",	  default=None,  type="int",	help="extension towards 3' in file 2 [format: int]")
+	parser.add_option("-c", "--colinear",		 dest="colinear",	   action="store_true", default=False,				help="colinear only [format: bool] [default: false]")
+	parser.add_option("-a", "--antisense",		dest="antisense",	  action="store_true", default=False,				help="antisense only [format: bool] [default: false]")
+	parser.add_option("-d", "--distance",		 dest="distance",	   action="store",	  default=None,	 type="int",	help="accept some distance between query and reference [format: int]")
+	parser.add_option("-k", "--included",		 dest="included",	   action="store_true", default=False,				help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
+	parser.add_option("-K", "--including",		dest="including",	  action="store_true", default=False,				help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
+	parser.add_option("-m", "--minOverlap",	   dest="minOverlap",	 action="store",	  default=None,	 type="int",	help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
+	parser.add_option("-p", "--pcOverlap",		dest="pcOverlap",	  action="store",	  default=None,  type="int",	help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
+	parser.add_option("-O", "--notOverlapping",   dest="notOverlapping", action="store_true", default=False,				help="also output not overlapping data [format: bool] [default: false]")
+	parser.add_option("-x", "--exclude",		  dest="exclude",		action="store_true", default=False,				help="invert the match [format: bool] [default: false]")
+	parser.add_option("-v", "--verbosity",		dest="verbosity",	  action="store",	  default=1,	 type="int",	help="trace level [format: int]")
+	parser.add_option('', '--tar', dest='outputTar', default=None, help='output all SAM results in a tar file.' )
+	parser.add_option( '', '--outTxt', dest='outTxtFile', help='The output list of results files on txt format.[compulsory]' )
+	(options, args) = parser.parse_args()
+	
+	
+	#Parse the input txt file and read a list of BAM files.
+	file = open(options.inputTxt, "r")
+	lines = file.readlines()
+	inputFileNames = []
+	overlapOutputNames = []
+	outputName = options.outTxtFile
+	resDirName = os.path.dirname(outputName) + "/"
+	#Write output txt file and define all output sam file names.
+	out = open(outputName, "w")
+	for line in lines:
+		tab = line.split()
+		inputFileNames.append(tab[1])
+		overlapOutName = resDirName + tab[0] + '_overlapOut.gff3'
+		overlapOutputNames.append(overlapOutName)
+		out.write(tab[0] + '\t' + overlapOutName  + '\n')
+	file.close()
+	out.close()
+	
+	#Launch on nodes
+	acronym = "compareOverlapping"
+	jobdb = TableJobAdaptatorFactory.createJobInstance()
+	iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "test", acronym, acronym, False, True)
+
+
+	
+
+	#construction the commandes for each input file
+	lCmdsTuples = []
+	dCutOut2Out = {}
+	lAllFile2remove = []
+	for i in range(len(inputFileNames)):
+		lCutInputFile = split(inputFileNames[i], 20000)
+		lAllFile2remove.extend(lCutInputFile)
+		lCutOutput = []
+		for cutInput in lCutInputFile:
+			cutOutput = "%s_out" % cutInput
+			lCutOutput.append(cutOutput)
+			lAllFile2remove.extend(lCutOutput)
+			cmd2Launch = _createCompareOverlappingCmd(iLauncher, options, cutInput, options.inputFileName1, cutOutput)
+			lCmdsTuples.append(_map(iLauncher, cmd2Launch, "", ""))
+		chooser = ParserChooser()
+		chooser.findFormat(options.format2)
+		dCutOut2Out[overlapOutputNames[i]] = lCutOutput
+	iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
+	
+	join(dCutOut2Out, options)
+	FileUtils.removeFilesFromListIfExist(lAllFile2remove)
+
+	if options.outputTar != None:
+		toTar(options.outputTar, overlapOutputNames)	
+
+if __name__=="__main__": __main__()		
\ No newline at end of file