view SMART/DiffExpAnal/compareOverlapping_parallel_unSQL.py @ 34:529e3e6a0954

Deleted selected files
author m-zytnicki
date Tue, 30 Apr 2013 14:35:27 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#! /usr/bin/env python
#This program is a wrapper for CompareOverlapping.py.
import os, sys, tarfile, optparse
from commons.core.launcher.Launcher import Launcher
from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
from optparse import OptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from commons.core.writer.WriterChooser import WriterChooser

def stop_err( msg ):
	"""Report an error and terminate the program.

	msg is written to standard error followed by a newline, then the
	interpreter exits with status 1 so that the calling shell or framework
	can tell the run failed (a bare sys.exit() would report success).
	"""
	sys.stderr.write( "%s\n" % msg )
	sys.exit(1)

def toTar(tarFileName, overlapOutputNames):
	"""Bundle result files into an uncompressed tar archive.

	The archive is first built as tarFileName + ".tmp.tar" and renamed to
	tarFileName only once it has been completely written and closed.

	tarFileName        -- path of the archive to create.
	overlapOutputNames -- names of the files to store; only the base name of
	                      each entry is used. The file is read from the
	                      directory that holds tarFileName and stored in the
	                      archive under that base name.

	Raises whatever tarfile/os raise when a member is missing or the archive
	cannot be written; in that case the temporary archive is left behind and
	tarFileName is not touched.
	"""
	tarDir = os.path.dirname(tarFileName)
	tmpTarName = tarFileName + ".tmp.tar"
	tarHandle = tarfile.open(tmpTarName, "w")
	try:
		for outputName in overlapOutputNames:
			memberName = os.path.basename(outputName)
			# arcname keeps the archive flat without having to chdir, so
			# relative paths stay valid and the working directory is never changed
			tarHandle.add(os.path.join(tarDir, memberName), arcname=memberName)
	finally:
		tarHandle.close()
	# rename only after close(), so the final name never points to a partial archive
	os.rename(tmpTarName, tarFileName)

def _createCompareOverlappingCmd(iLauncher, options, inputFileName, annotationFile, overlapOutputName):
	lArgs = []
	lArgs.append("-i %s" % annotationFile)
	lArgs.append("-f %s" % options.format1)
	lArgs.append("-j %s" % inputFileName)
	lArgs.append("-g %s" % options.format2)
	lArgs.append("-o %s" % overlapOutputName)
	if options.notOverlapping:
		lArgs.append("-O")
	if options.exclude:
		lArgs.append("-x")
	if options.distance != None:
		lArgs.append("-d %s" % options.distance)
	return(iLauncher.getSystemCommand("python %s/SMART/Java/Python/CompareOverlappingSmallQuery.py"  %  os.environ["REPET_PATH"], lArgs))

def _map(iLauncher, cmd, cmdStart, cmdFinish ):
	lCmds = []
	lCmds.append(cmd)
	lCmdStart = []
	lCmdStart.append(cmdStart)
	lCmdFinish = []
	lCmdFinish.append(cmdFinish)
	return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))

def split(fileName, nbOfSeqPerBatch):
	"""Cut a tab-separated reads file into numbered chunk files.

	Chunks are written next to fileName and named "<prefix>-<n><ext>", with n
	starting at 1. Lines starting with '@SQ' are dropped. Every other line is
	copied unchanged into the current chunk; a new chunk is started when a
	line arrives and the current chunk already holds nbOfSeqPerBatch counted
	lines.

	Despite the parameter name, the counter advances once per line whose
	third tab-separated field is non-empty, not once per distinct reference
	name: this is the batching the function has always performed.

	fileName        -- path of the file to split.
	nbOfSeqPerBatch -- number of counted lines per chunk.

	Returns the list of chunk file names in creation order (always at least
	one, even for an empty input). Raises IndexError on a line that is not
	'@SQ' and has fewer than three tab-separated fields.
	"""
	filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
	resDir = os.path.dirname(fileName)
	lInputName = []

	def _openNextChunk():
		# os.path.join keeps chunks beside the input even when fileName has
		# no directory part ("%s/..." would point at the filesystem root)
		chunkName = os.path.join(resDir, "%s-%s%s" % (filePrefix, len(lInputName) + 1, fileExt))
		lInputName.append(chunkName)
		return open(chunkName, "w")

	nbInChunk = 0
	outFile = _openNextChunk()
	try:
		with open(fileName, "r") as inFile:
			for line in inFile:
				if line.startswith('@SQ'):
					continue
				# Roll over lazily, so that no empty trailing chunk is created
				if nbInChunk == nbOfSeqPerBatch:
					outFile.close()
					outFile = _openNextChunk()
					nbInChunk = 0
				# NOTE(review): other '@' header lines are treated as records here
				if line.split("\t")[2] != "":
					nbInChunk += 1
				outFile.write(line)
	finally:
		# make sure the last chunk is flushed before other processes read it
		outFile.close()
	return lInputName

def join(dCutOut2Out, options):
	"""Merge the per-chunk overlap results into one GFF3 file per input.

	dCutOut2Out -- dict mapping a final output file name to the list of
	               chunk result files that belong to it.
	options     -- parsed options; only inputFileName1 (the annotation
	               file) is read.

	The annotation transcripts are indexed by their "ID" tag. For every
	chunk result, "nbOverlaps" values are summed and "overlapsWith" values
	are concatenated with "--" on the matching annotation transcript. All
	annotation transcripts are then written to the output file, and the two
	tags are removed again so the next output starts from a clean state.
	"""
	# NOTE(review): the annotation is always parsed as "gtf" and the chunk
	# results as "gff", whatever options.format1 says -- confirm this is intended.
	annotationChooser = ParserChooser()
	annotationChooser.findFormat("gtf")
	annotationParser = annotationChooser.getParser(options.inputFileName1)
	ref = dict((refTranscript.getTagValue("ID"), refTranscript) for refTranscript in annotationParser.getIterator())

	for outputName, chunkFiles in dCutOut2Out.items():
		writerChooser = WriterChooser()
		writerChooser.findFormat("gff3")

		for chunkFile in chunkFiles:
			chunkChooser = ParserChooser()
			chunkChooser.findFormat("gff")
			for chunkTranscript in chunkChooser.getParser(chunkFile).getIterator():
				merged = ref[chunkTranscript.getTagValue("ID")]

				# Overlap count: add to what earlier chunks contributed, if anything
				knownCount = merged.getTagValue("nbOverlaps")
				chunkCount = chunkTranscript.getTagValue("nbOverlaps")
				if knownCount:
					chunkCount = int(knownCount) + int(chunkCount)
				merged.setTagValue("nbOverlaps", chunkCount)

				# Overlapping element names: only touched when this chunk reports some
				chunkNames = chunkTranscript.getTagValue("overlapsWith")
				if chunkNames is not None:
					knownNames = merged.getTagValue("overlapsWith")
					if knownNames:
						chunkNames = "--".join([knownNames, chunkNames])
					merged.setTagValue("overlapsWith", chunkNames)

		gffWriter = writerChooser.getWriter(outputName)
		gffWriter.setTitle("S-MART")
		for refTranscript in ref.values():
			gffWriter.addTranscript(refTranscript)
			gffWriter.write()
			# reset the shared reference transcript before the next output file
			for tagName in ("nbOverlaps", "overlapsWith"):
				refTranscript.deleteTag(tagName)
		gffWriter.close()
		
def _buildOptionParser():
	"""Return the OptionParser describing this wrapper's command line."""
	description = "Compare Overlapping wrapp script: Get the a list of data which overlap with a reference set. [Category: Data Comparison]"
	parser = OptionParser(description = description)
	# NOTE(review): only -f, -g, -O, -x and -d are forwarded to the child
	# script by _createCompareOverlappingCmd; the other options are parsed
	# but not used in this file.
	parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 (for annotation) [compulsory] [format: file in transcript format given by -f]")
	parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
	parser.add_option("", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files. Should identify all reads files format, given by -g [compulsory]")
	parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
	parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
	parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
	parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
	parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
	parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]")
	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]")
	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]")
	parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]")
	parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]")
	parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
	parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
	parser.add_option("-d", "--distance", dest="distance", action="store", default=None, type="int", help="accept some distance between query and reference [format: int]")
	parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
	parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
	parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=None, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
	parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
	parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
	parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
	parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
	parser.add_option('', '--tar', dest='outputTar', default=None, help='output all SAM results in a tar file.' )
	parser.add_option( '', '--outTxt', dest='outTxtFile', help='The output list of results files on txt format.[compulsory]' )
	return parser

def __main__():
	"""Command-line entry point.

	Reads the list of input read files from --inputTxt (one
	"<label> <file>" pair per line), writes the list of result files to
	--outTxt, splits every input into chunks, runs the comparison on each
	chunk through the job launcher, merges the chunk results into one GFF3
	file per input, removes the intermediate files and, when --tar is given,
	bundles the results into a tar archive.
	"""
	parser = _buildOptionParser()
	(options, args) = parser.parse_args()

	# Fail with a usage message instead of a TypeError on open(None) or a
	# literal "None" being passed on to the child script
	compulsory = (("inputFileName1", "-i"), ("format1", "-f"), ("inputTxt", "--inputTxt"), ("format2", "-g"), ("outTxtFile", "--outTxt"))
	for dest, flag in compulsory:
		if getattr(options, dest) is None:
			parser.error("option %s is compulsory" % flag)

	#Parse the input txt file and read the list of input read files.
	with open(options.inputTxt, "r") as inputList:
		lines = inputList.readlines()
	inputFileNames = []
	overlapOutputNames = []
	outputName = options.outTxtFile
	# join(dir, "") appends the separator only when dir is not empty, so an
	# --outTxt given without a directory no longer resolves to "/"
	resDirName = os.path.join(os.path.dirname(outputName), "")
	#Write output txt file and define all output file names.
	with open(outputName, "w") as out:
		for line in lines:
			tab = line.split()
			if not tab:
				# tolerate blank lines, e.g. a trailing newline
				continue
			if len(tab) < 2:
				parser.error("malformed line in %s (expected '<label> <file>'): %r" % (options.inputTxt, line))
			inputFileNames.append(tab[1])
			overlapOutName = resDirName + tab[0] + '_overlapOut.gff3'
			overlapOutputNames.append(overlapOutName)
			out.write(tab[0] + '\t' + overlapOutName  + '\n')

	#Launch on nodes
	acronym = "compareOverlapping"
	jobdb = TableJobAdaptatorFactory.createJobInstance()
	iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "test", acronym, acronym, False, True)

	#Build the commands for each chunk of each input file
	nbOfSeqPerBatch = 20000
	lCmdsTuples = []
	dCutOut2Out = {}
	lAllFile2remove = []
	for inputFileName, overlapOutputName in zip(inputFileNames, overlapOutputNames):
		lCutInputFile = split(inputFileName, nbOfSeqPerBatch)
		lAllFile2remove.extend(lCutInputFile)
		lCutOutput = []
		for cutInput in lCutInputFile:
			cutOutput = "%s_out" % cutInput
			lCutOutput.append(cutOutput)
			# register each chunk result once (extending with the growing
			# lCutOutput list duplicated earlier entries on every pass)
			lAllFile2remove.append(cutOutput)
			cmd2Launch = _createCompareOverlappingCmd(iLauncher, options, cutInput, options.inputFileName1, cutOutput)
			lCmdsTuples.append(_map(iLauncher, cmd2Launch, "", ""))
		# NOTE(review): the chooser is not used afterwards; kept in case
		# findFormat() is relied upon to reject an unknown format -- confirm.
		chooser = ParserChooser()
		chooser.findFormat(options.format2)
		dCutOut2Out[overlapOutputName] = lCutOutput
	iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)

	join(dCutOut2Out, options)
	FileUtils.removeFilesFromListIfExist(lAllFile2remove)

	if options.outputTar is not None:
		toTar(options.outputTar, overlapOutputNames)

# Run the wrapper only when executed as a script, not when imported.
if __name__ == "__main__":
	__main__()