Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-guess-ft @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 #GUESS-ft:General UsEr defined Supervised Search for fusion transcript. | |
4 #This program is to perform targeted search of gene fusions from RNAseq data. | |
5 #It uses samtools, pysam package. It requires a bam file and other reference files. | |
6 #Note that it is a less accurate tool than prada-fusion. | |
7 | |
8 import pysam | |
9 import subprocess | |
10 import os | |
11 import time | |
12 import sys | |
13 import bioclass | |
14 import ioprada | |
15 | |
16 args=sys.argv | |
17 | |
18 #Get all parameters | |
19 help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n | |
20 **Parameters**: | |
21 -conf the configure file. see prada-fusion -conf for details | |
22 -inputbam the input bam file | |
23 -mm number of mismatch allowed | |
24 -minmapq mininum mapping quality for reads to be used in fusion finding | |
25 -junL length of exons to be used for junctions. see prada-fusion | |
26 -outdir output directory | |
27 -unmap the numapped reads. useful if user need to run guess-ft multiple times | |
28 ''' | |
29 | |
30 if '-h' in args or '-help' in args or len(args)==1: | |
31 print help_menu | |
32 sys.exit(0) | |
33 | |
34 ################################################################################## | |
35 Gene_A=args[1] | |
36 Gene_B=args[2] | |
37 if '-inputbam' not in args: | |
38 sys.exit('Input BAM needed') | |
39 else: | |
40 i=args.index('-inputbam') | |
41 sourcefile=args[i+1] | |
42 if '-outdir' not in args: | |
43 outdir='./' | |
44 else: | |
45 i=args.index('-outdir') | |
46 outdir=args[i+1] | |
47 if not os.path.exists(outdir): | |
48 os.mkdir(outdir) | |
49 if '-unmap' not in args: #get unmapped.bam yourself | |
50 unmapbam='%s/one.end.unmapped.bam'%outdir | |
51 extract_mask=1 | |
52 else: | |
53 i=args.index('-unmap') | |
54 unmapbam=args[i+1] | |
55 extract_mask=0 | |
56 if '-mm' not in args: | |
57 mm=1 | |
58 else: | |
59 i=args.index('-mm') | |
60 mm=int(args[i+1]) | |
61 #minimum mapping quality for reads as fusion evidences | |
62 if '-minmapq' not in args: | |
63 minmapq=30 | |
64 else: | |
65 i=args.index('-minmapq') | |
66 minmapq=int(args[i+1]) | |
67 | |
68 if '-junL' not in args: | |
69 sys.exit('-junL needed') | |
70 else: | |
71 i=args.index('-junL') | |
72 junL=int(args[i+1]) | |
73 | |
74 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
75 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
76 | |
77 if '-conf' in args: | |
78 i=args.index('-conf') | |
79 reffile=args[i+1] | |
80 if os.path.exists(reffile): | |
81 pass | |
82 else: | |
83 for pth in ref_search_path: | |
84 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
85 if os.path.exists(new_reffile): | |
86 reffile=new_reffile | |
87 break | |
88 else: | |
89 sys.exit('ERROR: ref file %s not found'%reffile) | |
90 else: | |
91 reffile='%s/conf.txt'%prada_path | |
92 if not os.path.exists(reffile): | |
93 sys.exit('ERROR: No default conf.txt found and none specified') | |
94 | |
95 #reference files | |
96 refdict=ioprada.read_conf(reffile) | |
97 ref_anno=refdict['--REF--']['ref_anno'] | |
98 ref_map=refdict['--REF--']['ref_map'] | |
99 ref_fasta=refdict['--REF--']['ref_fasta'] | |
100 featurefile=refdict['--REF--']['feature_file'] | |
101 | |
102 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path | |
103 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path | |
104 | |
105 print 'GUESS start: %s'%time.ctime() | |
106 print 'CMD: %s'%('\t'.join(args)) | |
107 | |
108 ################################################################################## | |
109 ##get gene position information | |
110 | |
111 gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B) | |
112 if gA is None: | |
113 sys.exit('%s not found'%Gene_A) | |
114 if gB is None: | |
115 sys.exit('%s not found'%Gene_B) | |
116 | |
117 #Generate unmapped reads. | |
118 if extract_mask: | |
119 cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) | |
120 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
121 while True: | |
122 if cmdout.poll() is None: | |
123 print 'Extracting unmapped reads...' | |
124 time.sleep(120) | |
125 pass | |
126 if cmdout.poll()==0: | |
127 print 'Extracted unmapped reads' | |
128 break | |
129 if cmdout.poll() is not None and cmdout.poll() != 0: | |
130 raise Exception('Error extracting unmapped reads') | |
131 | |
132 #Generate junction db | |
133 juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B) | |
134 cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) | |
135 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
136 while True: | |
137 if cmdout.poll() is None: | |
138 print 'Generating junction db. Waiting...' | |
139 time.sleep(20) | |
140 pass | |
141 if cmdout.poll()==0: | |
142 print 'Generated Junction DB.' | |
143 break | |
144 if cmdout.poll() is not None and cmdout.poll() != 0: | |
145 raise Exception('Error generated Junction DB.') | |
146 | |
147 #read into the sam file, get gene-strand information | |
148 print 'Finding discordant read pairs.' | |
149 samfile = pysam.Samfile(sourcefile, "rb" ) | |
150 | |
151 #mapping quality based read collection | |
152 #strand information considered | |
153 #mismatch filter considered | |
154 a_reads=[] | |
155 b_reads=[] | |
156 for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end): | |
157 if alignedread.mapq >= minmapq: | |
158 readastrd='-1' if alignedread.is_reverse else '1' | |
159 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] | |
160 if readastrd==gA.strand and mmf <= mm: | |
161 a_reads.append(alignedread) | |
162 | |
163 for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end): | |
164 if alignedread.mapq >= minmapq: | |
165 readbstrd='-1' if alignedread.is_reverse else '1' | |
166 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] | |
167 if readbstrd != gB.strand and mmf <= mm: | |
168 b_reads.append(alignedread) | |
169 samfile.close() | |
170 | |
171 ar_ids=[x.qname for x in a_reads] | |
172 br_ids=[x.qname for x in b_reads] | |
173 disc=list(set(ar_ids).intersection(set(br_ids))) #discordant pairs | |
174 | |
175 #detect read pair that one end map to one partner, yet the other does not align | |
176 print 'Determining potential junction spanning reads.' | |
177 rpaonly=[] #reads that only map to gene A -- going to map the other end to junctions | |
178 for rd in a_reads: | |
179 if rd.mate_is_unmapped: | |
180 rpaonly.append(rd) | |
181 rpbonly=[] #reads that only map to gene B -- going to map the other end to junctions | |
182 for rd in b_reads: | |
183 if rd.mate_is_unmapped: | |
184 rpbonly.append(rd) | |
185 rpaonly_names=[x.qname for x in rpaonly] | |
186 rpbonly_names=[x.qname for x in rpbonly] | |
187 | |
188 #find read sequences | |
189 print 'Extracting unmapped read sequences.' | |
190 print 'mate unmapped read for gene A:',len(rpaonly) | |
191 print 'mate unmapped read for gene B:',len(rpbonly) | |
192 samfile=pysam.Samfile(unmapbam,'rb') | |
193 taga='%s-%s_a'%(Gene_A,Gene_B) | |
194 tagb='%s-%s_b'%(Gene_A,Gene_B) | |
195 resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w') | |
196 resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w') | |
197 for item in samfile: | |
198 if item.qname in rpaonly_names: | |
199 resfq_a.write('@%s\n'%item.qname) | |
200 resfq_a.write('%s\n'%item.seq) | |
201 resfq_a.write('+\n') | |
202 resfq_a.write('%s\n'%item.qual) | |
203 if item.qname in rpbonly_names: | |
204 resfq_b.write('@%s\n'%item.qname) | |
205 resfq_b.write('%s\n'%item.seq) | |
206 resfq_b.write('+\n') | |
207 resfq_b.write('%s\n'%item.qual) | |
208 resfq_a.close() | |
209 resfq_b.close() | |
210 samfile.close() | |
211 | |
212 ##indexing junction db | |
213 print 'Aligning reads to junction db' | |
214 cmd='%s index %s/%s'%(bwa,outdir,juncfile) | |
215 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
216 while True: | |
217 if cmdout.poll() is None: | |
218 time.sleep(3) | |
219 pass | |
220 if cmdout.poll()==0: | |
221 print 'Junction DB indexed.' | |
222 break | |
223 if cmdout.poll() is not None and cmdout.poll() != 0: | |
224 raise Exception('Error building junction db index') | |
225 | |
226 ##align the unmapped reads to junction database | |
227 for rs in [taga,tagb]: | |
228 cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) | |
229 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
230 while True: | |
231 if cmdout.poll() is None: | |
232 time.sleep(5) | |
233 pass | |
234 if cmdout.poll()==0: | |
235 print 'Aligned unmapped reads group %s'%rs | |
236 break | |
237 if cmdout.poll() is not None and cmdout.poll() != 0: | |
238 raise Exception('Error aligning unmapped reads for group %s'%rs) | |
239 | |
240 cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) | |
241 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
242 while True: | |
243 if cmdout.poll() is None: | |
244 time.sleep(2) | |
245 pass | |
246 if cmdout.poll()==0: | |
247 print 'Converting to sam for group %s'%rs | |
248 break | |
249 if cmdout.poll() is not None and cmdout.poll() != 0: | |
250 raise Exception('Error converting to sam for group %s'%rs) | |
251 | |
252 #parse results | |
253 qualrd_a=[] | |
254 junc_a=[] | |
255 samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r') | |
256 for rd in samfile: | |
257 if not rd.is_unmapped and rd.is_reverse: | |
258 qualrd_a.append(rd) | |
259 junc_a.append(samfile.getrname(rd.tid)) | |
260 samfile.close() | |
261 qualrd_b=[] | |
262 junc_b=[] | |
263 samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r') | |
264 for rd in samfile: | |
265 if not rd.is_unmapped and not rd.is_reverse: | |
266 qualrd_b.append(rd) | |
267 junc_b.append(samfile.getrname(rd.tid)) | |
268 samfile.close() | |
269 | |
270 junc_span=[] | |
271 junc_span.extend(qualrd_a) | |
272 junc_span.extend(qualrd_b) | |
273 | |
274 junc_name=[] | |
275 junc_name.extend(junc_a) | |
276 junc_name.extend(junc_b) | |
277 | |
278 #Generate a summary report | |
279 sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w') | |
280 sumfile.write('%s\t%s\n'%(Gene_A,Gene_B)) | |
281 sumfile.write('\n') | |
282 sumfile.write('>discordant\n') | |
283 for rdname in disc: | |
284 ia=ar_ids.index(rdname) | |
285 ib=br_ids.index(rdname) | |
286 reada=a_reads[ia] | |
287 readb=b_reads[ib] | |
288 mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0] | |
289 mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0] | |
290 ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a) #pysam use 0-based coordinates | |
291 ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b) | |
292 sumfile.write('%s\n'%ss1) | |
293 sumfile.write('%s\n'%ss2) | |
294 | |
295 sumfile.write('\n') | |
296 sumfile.write('>spanning\n') | |
297 for i in range(len(junc_span)): | |
298 rd=junc_span[i] | |
299 jname=junc_name[i] | |
300 mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] | |
301 ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) | |
302 sumfile.write('%s\n'%ss) | |
303 | |
304 sumfile.write('\n') | |
305 sumfile.write('>junction\n') | |
306 juncol=[] | |
307 for item in set(junc_name): | |
308 nn=junc_name.count(item) | |
309 juncol.append([item,nn]) | |
310 juncol=sorted(juncol,key=lambda x:x[1],reverse=True) | |
311 for item in juncol: | |
312 sumfile.write('%s\t%s\n'%(item[0],item[1])) | |
313 | |
314 sumfile.write('\n') | |
315 sumfile.write('>summary\n') | |
316 sumfile.write('Number of Discordant Pairs = %d\n'%len(disc)) | |
317 sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) | |
318 sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) | |
319 | |
320 sumfile.close() | |
321 print 'Done: %s'%time.ctime() |