Mercurial > repos > yufei-luo > s_mart

diff SMART/DiffExpAnal/compareOverlapping_parallel.py @ 31:0ab839023fe4
Uploaded
author: m-zytnicki
date: Tue, 30 Apr 2013 14:33:21 -0400
parents: 94ab73e8a190
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/DiffExpAnal/compareOverlapping_parallel.py	Tue Apr 30 14:33:21 2013 -0400
@@ -0,0 +1,175 @@
+#! /usr/bin/env python
+#This program is a wrapp for CompareOverlapping.py.
+import optparse, os, sys, subprocess, tempfile, shutil, tarfile, glob
+import os, struct, time, random
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.CompareOverlapping import CompareOverlapping
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.Interval import Interval
+from SMART.Java.Python.ncList.NCList import NCList
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
+from SMART.Java.Python.ncList.FileSorter import FileSorter
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+from SMART.Java.Python.misc import Utils
+
+
+
+def stop_err( msg ):
+	sys.stderr.write( "%s\n" % msg )
+	sys.exit()
+
+def toTar(tarFileName, overlapOutputNames):
+	dir = os.path.dirname(tarFileName)	
+	tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
+	currentPath = os.getcwd()
+	os.chdir(dir)
+	for file in overlapOutputNames:
+		relativeFileName = os.path.basename(file)
+		tfile.add(relativeFileName)
+	os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
+	tfile.close()
+	os.chdir(currentPath)
+
+def __main__():
+	description = "Compare Overlapping wrapp script: Get the a list of data which overlap with a reference set. [Category: Data Comparison]"
+	parser = OptionParser(description = description)
+	parser.add_option("-i", "--input1",		   dest="inputFileName1", action="store",					 type="string", help="input file 1 (for annotation) [compulsory] [format: file in transcript format given by -f]")
+	parser.add_option("-f", "--format1",		  dest="format1",		action="store",					 type="string", help="format of file 1 [compulsory] [format: transcript file format]")
+	parser.add_option("", "--inputTxt", 		dest="inputTxt", 		action="store", 				type="string", 	help="input, a txt file for a list of input reads files. Should identify all reads files format, given by -g [compulsory]")
+	#parser.add_option("-j", "--input2",		   dest="inputFileName2", action="store",	default="inputRead",	 type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
+	parser.add_option("-g", "--format2",		  dest="format2",		action="store",				 type="string", help="format of file 2 [compulsory] [format: transcript file format]")
+	#parser.add_option("-o", "--output",		   dest="output",		 action="store",	  default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+	parser.add_option("-S", "--start1",		   dest="start1",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
+	parser.add_option("-s", "--start2",		   dest="start2",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
+	parser.add_option("-U", "--end1",			 dest="end1",		   action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
+	parser.add_option("-u", "--end2",			 dest="end2",		   action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
+	parser.add_option("-t", "--intron",		   dest="introns",		action="store_true", default=False,				help="also report introns [format: bool] [default: false]")
+	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 1 [format: int]")
+	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 2 [format: int]")
+	parser.add_option("-N", "--3primeExtension1", dest="threePrime1",	action="store",	  default=None,  type="int",	help="extension towards 3' in file 1 [format: int]")
+	parser.add_option("-n", "--3primeExtension2", dest="threePrime2",	action="store",	  default=None,  type="int",	help="extension towards 3' in file 2 [format: int]")
+	parser.add_option("-c", "--colinear",		 dest="colinear",	   action="store_true", default=False,				help="colinear only [format: bool] [default: false]")
+	parser.add_option("-a", "--antisense",		dest="antisense",	  action="store_true", default=False,				help="antisense only [format: bool] [default: false]")
+	parser.add_option("-d", "--distance",		 dest="distance",	   action="store",	  default=None,	 type="int",	help="accept some distance between query and reference [format: int]")
+	parser.add_option("-k", "--included",		 dest="included",	   action="store_true", default=False,				help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
+	parser.add_option("-K", "--including",		dest="including",	  action="store_true", default=False,				help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
+	parser.add_option("-m", "--minOverlap",	   dest="minOverlap",	 action="store",	  default=None,	 type="int",	help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
+	parser.add_option("-p", "--pcOverlap",		dest="pcOverlap",	  action="store",	  default=None,  type="int",	help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
+	parser.add_option("-O", "--notOverlapping",   dest="notOverlapping", action="store_true", default=False,				help="also output not overlapping data [format: bool] [default: false]")
+	parser.add_option("-x", "--exclude",		  dest="exclude",		action="store_true", default=False,				help="invert the match [format: bool] [default: false]")
+	parser.add_option("-v", "--verbosity",		dest="verbosity",	  action="store",	  default=1,	 type="int",	help="trace level [format: int]")
+	parser.add_option('', '--tar', dest='outputTar', default=None, help='output all SAM results in a tar file.' )
+	parser.add_option( '', '--outTxt', dest='outTxtFile', help='The output list of results files on txt format.[compulsory]' )
+	(options, args) = parser.parse_args()
+	
+	
+	#Parse the input txt file and read a list of BAM files.
+	file = open(options.inputTxt, "r")
+	lines = file.readlines()
+	inputFileNames = []
+	overlapOutputNames = []
+	outputName = options.outTxtFile
+	resDirName = os.path.dirname(outputName) + "/"
+	#Write output txt file and define all output sam file names.
+	out = open(outputName, "w")
+	for line in lines:
+		tab = line.split()
+		inputFileNames.append(tab[1])
+		overlapOutName = resDirName + tab[0] + '_overlapOut_%s.gff3' % random.randrange(0, 10000)
+		overlapOutputNames.append(overlapOutName)
+		out.write(tab[0] + '\t' + overlapOutName  + '\n')
+	file.close()
+	out.close()
+	
+	#construction the commandes for each input file
+	cmds = []
+	for i in range(len(inputFileNames)):
+		absFile = sys.argv[0]
+		absDir = os.path.dirname(absFile)
+		parentDir = os.path.abspath(os.path.join(absDir, os.path.pardir))
+		cmd = "python %s/Java/Python/CompareOverlappingSmallQuery.py " % parentDir
+		opts = "-i %s -f %s -j %s -g %s -o %s " % (options.inputFileName1, options.format1, inputFileNames[i], options.format2, overlapOutputNames[i])
+		#if options.start1 != None:
+		#	opts += "-S %s " % options.start1
+		#if options.start2 != None:
+		#	opts += "-s %s " % options.start2
+		#if options.end1 != None:
+		#	opts += "-U %s " % options.end1
+		#if options.end2 != None:
+		#	opts += "-u %s " % options.end2
+		#if options.fivePrime1 != None:
+		#	opts += "-E %s " % options.fivePrime1
+		#if options.fivePrime2 != None:
+		#	opts += "-e %s " % options.fivePrime2
+		#if options.threePrime1 != None:
+		#	opts += "-N %s " % options.threePrime1
+		#if options.threePrime2 != None:
+		#	opts += "-n %s " % options.threePrime2
+		#if options.colinear:
+		#	opts += "-c "
+		#if options.antisense:
+		#	opts +="-a "
+		#if options.included:
+		#	opts += "-k "
+		#if options.including:
+		#	opts += "-K "
+		#if options.pcOverlap != None:
+		#	opts += "-p %s " % options.pcOverlap
+		if options.notOverlapping:
+			opts += "-O "
+		if options.exclude:
+			opts += "-x "
+		if options.distance != None:
+			opts += "-d %s " % options.distance
+		#if options.minOverlap != None:
+		#	opts += "-m %s " % options.minOverlap
+		cmd += opts
+		cmds.append(cmd)
+
+
+	print "les commandes sont %s \n" % cmds
+
+	tmp_files = []	
+	for i in range(len(cmds)):
+		try:
+			tmp_out = tempfile.NamedTemporaryFile().name
+			tmp_files.append(tmp_out)
+			tmp_stdout = open( tmp_out, 'wb' )
+			tmp_err = tempfile.NamedTemporaryFile().name
+			tmp_files.append(tmp_err)
+			tmp_stderr = open( tmp_err, 'wb' )
+			proc = subprocess.Popen( args=cmds[i], shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
+			returncode = proc.wait()
+			tmp_stderr.close()
+			# get stderr, allowing for case where it's very large
+			tmp_stderr = open( tmp_err, 'rb' )
+			stderr = ''
+			buffsize = 1048576
+			try:
+				while True:
+					stderr += tmp_stderr.read( buffsize )
+					if not stderr or len( stderr ) % buffsize != 0:
+						break
+			except OverflowError:
+				pass
+			tmp_stdout.close()
+			tmp_stderr.close()
+			if returncode != 0:
+				raise Exception, stderr
+		except Exception, e:
+			stop_err( 'Error in :\n' + str( e ) )
+
+	if options.outputTar != None:
+		toTar(options.outputTar, overlapOutputNames)	
+	
+	for tmp_file in tmp_files:
+		os.remove(tmp_file)
+
+
+if __name__=="__main__": __main__()		
+		
+
author	m-zytnicki
date	Tue, 30 Apr 2013 14:33:21 -0400
parents	94ab73e8a190
children