comparison pyPRADA_1.2/prada-guess-ft @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #!/usr/bin/env python
2
3 #GUESS-ft:General UsEr defined Supervised Search for fusion transcript.
4 #This program is to perform targeted search of gene fusions from RNAseq data.
5 #It uses samtools, pysam package. It requires a bam file and other reference files.
6 #Note that it is a less accurate tool than prada-fusion.
7
8 import pysam
9 import subprocess
10 import os
11 import time
12 import sys
13 import bioclass
14 import ioprada
15
16 args=sys.argv
17
18 #Get all parameters
19 help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n
20 **Parameters**:
21 -conf the configure file. see prada-fusion -conf for details
22 -inputbam the input bam file
23 -mm number of mismatch allowed
24 -minmapq mininum mapping quality for reads to be used in fusion finding
25 -junL length of exons to be used for junctions. see prada-fusion
26 -outdir output directory
27 -unmap the numapped reads. useful if user need to run guess-ft multiple times
28 '''
29
30 if '-h' in args or '-help' in args or len(args)==1:
31 print help_menu
32 sys.exit(0)
33
34 ##################################################################################
35 Gene_A=args[1]
36 Gene_B=args[2]
37 if '-inputbam' not in args:
38 sys.exit('Input BAM needed')
39 else:
40 i=args.index('-inputbam')
41 sourcefile=args[i+1]
42 if '-outdir' not in args:
43 outdir='./'
44 else:
45 i=args.index('-outdir')
46 outdir=args[i+1]
47 if not os.path.exists(outdir):
48 os.mkdir(outdir)
49 if '-unmap' not in args: #get unmapped.bam yourself
50 unmapbam='%s/one.end.unmapped.bam'%outdir
51 extract_mask=1
52 else:
53 i=args.index('-unmap')
54 unmapbam=args[i+1]
55 extract_mask=0
56 if '-mm' not in args:
57 mm=1
58 else:
59 i=args.index('-mm')
60 mm=int(args[i+1])
61 #minimum mapping quality for reads as fusion evidences
62 if '-minmapq' not in args:
63 minmapq=30
64 else:
65 i=args.index('-minmapq')
66 minmapq=int(args[i+1])
67
68 if '-junL' not in args:
69 sys.exit('-junL needed')
70 else:
71 i=args.index('-junL')
72 junL=int(args[i+1])
73
74 prada_path=os.path.dirname(os.path.abspath(__file__)) ####
75 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line
76
77 if '-conf' in args:
78 i=args.index('-conf')
79 reffile=args[i+1]
80 if os.path.exists(reffile):
81 pass
82 else:
83 for pth in ref_search_path:
84 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
85 if os.path.exists(new_reffile):
86 reffile=new_reffile
87 break
88 else:
89 sys.exit('ERROR: ref file %s not found'%reffile)
90 else:
91 reffile='%s/conf.txt'%prada_path
92 if not os.path.exists(reffile):
93 sys.exit('ERROR: No default conf.txt found and none specified')
94
95 #reference files
96 refdict=ioprada.read_conf(reffile)
97 ref_anno=refdict['--REF--']['ref_anno']
98 ref_map=refdict['--REF--']['ref_map']
99 ref_fasta=refdict['--REF--']['ref_fasta']
100 featurefile=refdict['--REF--']['feature_file']
101
102 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
103 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
104
105 print 'GUESS start: %s'%time.ctime()
106 print 'CMD: %s'%('\t'.join(args))
107
108 ##################################################################################
109 ##get gene position information
110
111 gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B)
112 if gA is None:
113 sys.exit('%s not found'%Gene_A)
114 if gB is None:
115 sys.exit('%s not found'%Gene_B)
116
117 #Generate unmapped reads.
118 if extract_mask:
119 cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam)
120 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
121 while True:
122 if cmdout.poll() is None:
123 print 'Extracting unmapped reads...'
124 time.sleep(120)
125 pass
126 if cmdout.poll()==0:
127 print 'Extracted unmapped reads'
128 break
129 if cmdout.poll() is not None and cmdout.poll() != 0:
130 raise Exception('Error extracting unmapped reads')
131
132 #Generate junction db
133 juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B)
134 cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile)
135 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
136 while True:
137 if cmdout.poll() is None:
138 print 'Generating junction db. Waiting...'
139 time.sleep(20)
140 pass
141 if cmdout.poll()==0:
142 print 'Generated Junction DB.'
143 break
144 if cmdout.poll() is not None and cmdout.poll() != 0:
145 raise Exception('Error generated Junction DB.')
146
147 #read into the sam file, get gene-strand information
148 print 'Finding discordant read pairs.'
149 samfile = pysam.Samfile(sourcefile, "rb" )
150
151 #mapping quality based read collection
152 #strand information considered
153 #mismatch filter considered
154 a_reads=[]
155 b_reads=[]
156 for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end):
157 if alignedread.mapq >= minmapq:
158 readastrd='-1' if alignedread.is_reverse else '1'
159 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
160 if readastrd==gA.strand and mmf <= mm:
161 a_reads.append(alignedread)
162
163 for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end):
164 if alignedread.mapq >= minmapq:
165 readbstrd='-1' if alignedread.is_reverse else '1'
166 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
167 if readbstrd != gB.strand and mmf <= mm:
168 b_reads.append(alignedread)
169 samfile.close()
170
171 ar_ids=[x.qname for x in a_reads]
172 br_ids=[x.qname for x in b_reads]
173 disc=list(set(ar_ids).intersection(set(br_ids))) #discordant pairs
174
175 #detect read pair that one end map to one partner, yet the other does not align
176 print 'Determining potential junction spanning reads.'
177 rpaonly=[] #reads that only map to gene A -- going to map the other end to junctions
178 for rd in a_reads:
179 if rd.mate_is_unmapped:
180 rpaonly.append(rd)
181 rpbonly=[] #reads that only map to gene B -- going to map the other end to junctions
182 for rd in b_reads:
183 if rd.mate_is_unmapped:
184 rpbonly.append(rd)
185 rpaonly_names=[x.qname for x in rpaonly]
186 rpbonly_names=[x.qname for x in rpbonly]
187
188 #find read sequences
189 print 'Extracting unmapped read sequences.'
190 print 'mate unmapped read for gene A:',len(rpaonly)
191 print 'mate unmapped read for gene B:',len(rpbonly)
192 samfile=pysam.Samfile(unmapbam,'rb')
193 taga='%s-%s_a'%(Gene_A,Gene_B)
194 tagb='%s-%s_b'%(Gene_A,Gene_B)
195 resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w')
196 resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w')
197 for item in samfile:
198 if item.qname in rpaonly_names:
199 resfq_a.write('@%s\n'%item.qname)
200 resfq_a.write('%s\n'%item.seq)
201 resfq_a.write('+\n')
202 resfq_a.write('%s\n'%item.qual)
203 if item.qname in rpbonly_names:
204 resfq_b.write('@%s\n'%item.qname)
205 resfq_b.write('%s\n'%item.seq)
206 resfq_b.write('+\n')
207 resfq_b.write('%s\n'%item.qual)
208 resfq_a.close()
209 resfq_b.close()
210 samfile.close()
211
212 ##indexing junction db
213 print 'Aligning reads to junction db'
214 cmd='%s index %s/%s'%(bwa,outdir,juncfile)
215 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
216 while True:
217 if cmdout.poll() is None:
218 time.sleep(3)
219 pass
220 if cmdout.poll()==0:
221 print 'Junction DB indexed.'
222 break
223 if cmdout.poll() is not None and cmdout.poll() != 0:
224 raise Exception('Error building junction db index')
225
226 ##align the unmapped reads to junction database
227 for rs in [taga,tagb]:
228 cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs)
229 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
230 while True:
231 if cmdout.poll() is None:
232 time.sleep(5)
233 pass
234 if cmdout.poll()==0:
235 print 'Aligned unmapped reads group %s'%rs
236 break
237 if cmdout.poll() is not None and cmdout.poll() != 0:
238 raise Exception('Error aligning unmapped reads for group %s'%rs)
239
240 cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs)
241 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
242 while True:
243 if cmdout.poll() is None:
244 time.sleep(2)
245 pass
246 if cmdout.poll()==0:
247 print 'Converting to sam for group %s'%rs
248 break
249 if cmdout.poll() is not None and cmdout.poll() != 0:
250 raise Exception('Error converting to sam for group %s'%rs)
251
252 #parse results
253 qualrd_a=[]
254 junc_a=[]
255 samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r')
256 for rd in samfile:
257 if not rd.is_unmapped and rd.is_reverse:
258 qualrd_a.append(rd)
259 junc_a.append(samfile.getrname(rd.tid))
260 samfile.close()
261 qualrd_b=[]
262 junc_b=[]
263 samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r')
264 for rd in samfile:
265 if not rd.is_unmapped and not rd.is_reverse:
266 qualrd_b.append(rd)
267 junc_b.append(samfile.getrname(rd.tid))
268 samfile.close()
269
270 junc_span=[]
271 junc_span.extend(qualrd_a)
272 junc_span.extend(qualrd_b)
273
274 junc_name=[]
275 junc_name.extend(junc_a)
276 junc_name.extend(junc_b)
277
278 #Generate a summary report
279 sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w')
280 sumfile.write('%s\t%s\n'%(Gene_A,Gene_B))
281 sumfile.write('\n')
282 sumfile.write('>discordant\n')
283 for rdname in disc:
284 ia=ar_ids.index(rdname)
285 ib=br_ids.index(rdname)
286 reada=a_reads[ia]
287 readb=b_reads[ib]
288 mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0]
289 mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0]
290 ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a) #pysam use 0-based coordinates
291 ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b)
292 sumfile.write('%s\n'%ss1)
293 sumfile.write('%s\n'%ss2)
294
295 sumfile.write('\n')
296 sumfile.write('>spanning\n')
297 for i in range(len(junc_span)):
298 rd=junc_span[i]
299 jname=junc_name[i]
300 mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0]
301 ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j)
302 sumfile.write('%s\n'%ss)
303
304 sumfile.write('\n')
305 sumfile.write('>junction\n')
306 juncol=[]
307 for item in set(junc_name):
308 nn=junc_name.count(item)
309 juncol.append([item,nn])
310 juncol=sorted(juncol,key=lambda x:x[1],reverse=True)
311 for item in juncol:
312 sumfile.write('%s\t%s\n'%(item[0],item[1]))
313
314 sumfile.write('\n')
315 sumfile.write('>summary\n')
316 sumfile.write('Number of Discordant Pairs = %d\n'%len(disc))
317 sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span))
318 sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name)))
319
320 sumfile.close()
321 print 'Done: %s'%time.ctime()