comparison SMART/DiffExpAnal/compareOverlapping_parallel.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #! /usr/bin/env python
2 #This program is a wrapp for CompareOverlapping.py.
3 import optparse, os, sys, subprocess, tempfile, shutil, tarfile, glob
4 import os, struct, time, random
5 from optparse import OptionParser
6 from commons.core.parsing.ParserChooser import ParserChooser
7 from commons.core.writer.Gff3Writer import Gff3Writer
8 from SMART.Java.Python.CompareOverlapping import CompareOverlapping
9 from SMART.Java.Python.structure.Transcript import Transcript
10 from SMART.Java.Python.structure.Interval import Interval
11 from SMART.Java.Python.ncList.NCList import NCList
12 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
13 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
14 from SMART.Java.Python.ncList.FileSorter import FileSorter
15 from SMART.Java.Python.misc.Progress import Progress
16 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
17 from SMART.Java.Python.misc import Utils
18
19
20
21 def stop_err( msg ):
22 sys.stderr.write( "%s\n" % msg )
23 sys.exit()
24
25 def toTar(tarFileName, overlapOutputNames):
26 dir = os.path.dirname(tarFileName)
27 tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
28 currentPath = os.getcwd()
29 os.chdir(dir)
30 for file in overlapOutputNames:
31 relativeFileName = os.path.basename(file)
32 tfile.add(relativeFileName)
33 os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
34 tfile.close()
35 os.chdir(currentPath)
36
37 def __main__():
38 description = "Compare Overlapping wrapp script: Get the a list of data which overlap with a reference set. [Category: Data Comparison]"
39 parser = OptionParser(description = description)
40 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 (for annotation) [compulsory] [format: file in transcript format given by -f]")
41 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
42 parser.add_option("", "--inputTxt", dest="inputTxt", action="store", type="string", help="input, a txt file for a list of input reads files. Should identify all reads files format, given by -g [compulsory]")
43 #parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default="inputRead", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
44 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
45 #parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
46 parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
47 parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
48 parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
49 parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
50 parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]")
51 parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]")
52 parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]")
53 parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]")
54 parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]")
55 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
56 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
57 parser.add_option("-d", "--distance", dest="distance", action="store", default=None, type="int", help="accept some distance between query and reference [format: int]")
58 parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
59 parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
60 parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=None, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
61 parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
62 parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
63 parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
64 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
65 parser.add_option('', '--tar', dest='outputTar', default=None, help='output all SAM results in a tar file.' )
66 parser.add_option( '', '--outTxt', dest='outTxtFile', help='The output list of results files on txt format.[compulsory]' )
67 (options, args) = parser.parse_args()
68
69
70 #Parse the input txt file and read a list of BAM files.
71 file = open(options.inputTxt, "r")
72 lines = file.readlines()
73 inputFileNames = []
74 overlapOutputNames = []
75 outputName = options.outTxtFile
76 resDirName = os.path.dirname(outputName) + "/"
77 #Write output txt file and define all output sam file names.
78 out = open(outputName, "w")
79 for line in lines:
80 tab = line.split()
81 inputFileNames.append(tab[1])
82 overlapOutName = resDirName + tab[0] + '_overlapOut_%s.gff3' % random.randrange(0, 10000)
83 overlapOutputNames.append(overlapOutName)
84 out.write(tab[0] + '\t' + overlapOutName + '\n')
85 file.close()
86 out.close()
87
88 #construction the commandes for each input file
89 cmds = []
90 for i in range(len(inputFileNames)):
91 absFile = sys.argv[0]
92 absDir = os.path.dirname(absFile)
93 parentDir = os.path.abspath(os.path.join(absDir, os.path.pardir))
94 cmd = "python %s/Java/Python/CompareOverlappingSmallQuery.py " % parentDir
95 opts = "-i %s -f %s -j %s -g %s -o %s " % (options.inputFileName1, options.format1, inputFileNames[i], options.format2, overlapOutputNames[i])
96 #if options.start1 != None:
97 # opts += "-S %s " % options.start1
98 #if options.start2 != None:
99 # opts += "-s %s " % options.start2
100 #if options.end1 != None:
101 # opts += "-U %s " % options.end1
102 #if options.end2 != None:
103 # opts += "-u %s " % options.end2
104 #if options.fivePrime1 != None:
105 # opts += "-E %s " % options.fivePrime1
106 #if options.fivePrime2 != None:
107 # opts += "-e %s " % options.fivePrime2
108 #if options.threePrime1 != None:
109 # opts += "-N %s " % options.threePrime1
110 #if options.threePrime2 != None:
111 # opts += "-n %s " % options.threePrime2
112 #if options.colinear:
113 # opts += "-c "
114 #if options.antisense:
115 # opts +="-a "
116 #if options.included:
117 # opts += "-k "
118 #if options.including:
119 # opts += "-K "
120 #if options.pcOverlap != None:
121 # opts += "-p %s " % options.pcOverlap
122 if options.notOverlapping:
123 opts += "-O "
124 if options.exclude:
125 opts += "-x "
126 if options.distance != None:
127 opts += "-d %s " % options.distance
128 #if options.minOverlap != None:
129 # opts += "-m %s " % options.minOverlap
130 cmd += opts
131 cmds.append(cmd)
132
133
134 print "les commandes sont %s \n" % cmds
135
136 tmp_files = []
137 for i in range(len(cmds)):
138 try:
139 tmp_out = tempfile.NamedTemporaryFile().name
140 tmp_files.append(tmp_out)
141 tmp_stdout = open( tmp_out, 'wb' )
142 tmp_err = tempfile.NamedTemporaryFile().name
143 tmp_files.append(tmp_err)
144 tmp_stderr = open( tmp_err, 'wb' )
145 proc = subprocess.Popen( args=cmds[i], shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
146 returncode = proc.wait()
147 tmp_stderr.close()
148 # get stderr, allowing for case where it's very large
149 tmp_stderr = open( tmp_err, 'rb' )
150 stderr = ''
151 buffsize = 1048576
152 try:
153 while True:
154 stderr += tmp_stderr.read( buffsize )
155 if not stderr or len( stderr ) % buffsize != 0:
156 break
157 except OverflowError:
158 pass
159 tmp_stdout.close()
160 tmp_stderr.close()
161 if returncode != 0:
162 raise Exception, stderr
163 except Exception, e:
164 stop_err( 'Error in :\n' + str( e ) )
165
166 if options.outputTar != None:
167 toTar(options.outputTar, overlapOutputNames)
168
169 for tmp_file in tmp_files:
170 os.remove(tmp_file)
171
172
173 if __name__=="__main__": __main__()
174
175