annotate pyPRADA_1.2/prada-guess-if @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #!/usr/bin/env python
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #GUESS-if is to find abnormal intragenic fusions.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #It is an extension of the GUESS-ft method.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 import pysam
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 import subprocess
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 import os
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 import time
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 import sys
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 import bioclass
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 import ioprada
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Raw command line; args[0] is the script name.
args = sys.argv

# Usage text shown for -h / -help or when no argument is given.
help_menu = '''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n
**Parameters**:
-conf the configure file. see prada-fusion -conf for details
-inputbam the input bam file
-mm number of mismatch allowed
-minmapq mininum mapping quality for reads to be used in fusion finding
-junL length of exons to be used for junctions. see prada-fusion
-outdir output directory
-unmap the numapped reads. useful if user need to run guess-ft multiple times
'''

# Print the usage and stop when help is requested or nothing was passed.
wants_help = len(args) == 1 or any(flag in args for flag in ('-h', '-help'))
if wants_help:
    print(help_menu)
    sys.exit(0)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 #########################################################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
def _option_value(flag):
    """Return the token that follows *flag* in ``args``, or None if *flag* is absent.

    Exits with an error message (instead of an IndexError traceback) when
    *flag* is the last token on the command line and therefore has no value.
    """
    if flag not in args:
        return None
    pos = args.index(flag) + 1
    if pos >= len(args):
        sys.exit('ERROR: option %s requires a value' % flag)
    return args[pos]


# The gene to inspect is the first positional argument.
target = args[1]

# Input BAM is mandatory.
sourcefile = _option_value('-inputbam')
if sourcefile is None:
    sys.exit('Input BAM needed')

# Output directory defaults to the current directory and is created on demand.
outdir = _option_value('-outdir')
if outdir is None:
    outdir = './'
if not os.path.exists(outdir):
    # makedirs also handles nested paths, which os.mkdir rejects
    os.makedirs(outdir)

# When -unmap is supplied the unmapped-read extraction step is skipped
# (extract_mask=0); otherwise the reads are extracted into outdir.
unmapbam = _option_value('-unmap')
if unmapbam is None:
    unmapbam = '%s/one.end.unmapped.bam' % outdir
    extract_mask = 1
else:
    extract_mask = 0

# Number of mismatches allowed (default 1).
mm = _option_value('-mm')
mm = 1 if mm is None else int(mm)

#minimum mapping quality for reads as fusion evidences (default 30)
minmapq = _option_value('-minmapq')
minmapq = 30 if minmapq is None else int(minmapq)

# Junction exon length is mandatory.
junL = _option_value('-junL')
if junL is None:
    sys.exit('-junL needed')
junL = int(junL)

prada_path = os.path.dirname(os.path.abspath(__file__))
#search path for ref file if not specified in command line
ref_search_path = [prada_path, os.getcwd()]

reffile = _option_value('-conf')
if reffile is None:
    # No -conf given: fall back to conf.txt next to this script.
    reffile = '%s/conf.txt' % prada_path
    if not os.path.exists(reffile):
        sys.exit('ERROR: No default conf.txt found and none specified')
elif not os.path.exists(reffile):
    # The path as given does not exist: look for a file of the same name
    # in each search directory, and give up only when none has it.
    for pth in ref_search_path:
        new_reffile = '%s/%s' % (pth, os.path.basename(reffile))
        if os.path.exists(new_reffile):
            reffile = new_reffile
            break
    else:
        sys.exit('ERROR: ref file %s not found' % reffile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#reference files: every entry comes from the '--REF--' section of the conf file
refdict = ioprada.read_conf(reffile)
ref_section = refdict['--REF--']
ref_anno, ref_map, ref_fasta, featurefile = [
    ref_section[key]
    for key in ('ref_anno', 'ref_map', 'ref_fasta', 'feature_file')
]

# External tools are expected under <prada_path>/tools.
tools_dir = '%s/tools' % prada_path
samtools = '%s/samtools-0.1.16/samtools' % tools_dir
bwa = '%s/bwa-0.5.7-mh/bwa' % tools_dir
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 #########################################################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Log the start time and the exact command line.
start_stamp = time.ctime()
print('GUESS-if start: %s' % start_stamp)
command_line = '\t'.join(args)
print('CMD: %s' % command_line)

##get gene position information
gobj = ioprada.read_feature_genes(featurefile, target)[0]
if gobj is None:
    sys.exit('%s not found' % target)

# NOTE: despite its name, gchr holds the gene's strand, not its chromosome.
gchr = gobj.strand
# True when the strand value is the string '-1'.
gchr_rev = (gchr == '-1')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#Generate unmapped reads (skipped when the user supplied -unmap).
if extract_mask:
    # NOTE(review): paths are interpolated unquoted into a shell command line;
    # file names with spaces or shell metacharacters will break it.
    cmd = '%s view -b -f 4 -F 8 %s > %s' % (samtools, sourcefile, unmapbam)
    print('Extracting unmapped reads...')
    cmdout = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                              shell=True, universal_newlines=True)
    # communicate() drains the pipe while waiting, so the child can never
    # block on a full pipe, and it returns as soon as the command finishes
    # instead of sleeping in fixed 120-second steps.
    cmdlog = cmdout.communicate()[0]
    if cmdout.returncode != 0:
        # Surface the tool's own diagnostics before failing.
        sys.stderr.write(cmdlog)
        raise Exception('Error extracting unmapped reads')
    print('Extracted unmapped reads')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#Generate junction db: the perl helper writes the junction FASTA to stdout,
#which the shell redirects into <outdir>/<target>.junction.fasta
juncfile = '%s.junction.fasta' % target
# NOTE(review): arguments are interpolated unquoted into a shell command line.
cmd = 'perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s' % (
    prada_path, target, target, ref_anno, ref_map, ref_fasta, junL, outdir, juncfile)
print('Generating junction db. Waiting...')
cmdout = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          shell=True, universal_newlines=True)
# communicate() drains the pipe while waiting, so the child can never block
# on a full pipe, and it returns as soon as the command finishes.
cmdlog = cmdout.communicate()[0]
if cmdout.returncode != 0:
    # Surface the tool's own diagnostics before failing.
    sys.stderr.write(cmdlog)
    raise Exception('Error generated Junction DB.')
print('Generated Junction DB.')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#scan BAM file for mapping reads.
# Reads overlapping the gene are kept when they pass the mapping-quality and
# mismatch (NM tag) thresholds, then split by whether their orientation
# matches the gene's strand flag.
samfile = pysam.Samfile(sourcefile, 'rb')
reads_se = []
reads_as = []
for aln in samfile.fetch(gobj.chr, gobj.start-1, gobj.end):
    if aln.mapq < minmapq:
        continue
    mismatches = [tag[1] for tag in aln.tags if tag[0] == 'NM'][0]
    if mismatches > mm:
        continue
    if aln.is_reverse == gchr_rev:
        bucket = reads_se  #sense
    else:
        bucket = reads_as  #anti-sense
    bucket.append(aln)
samfile.close()

# Keep only reads whose mate is unmapped, and remember their names.
seonly = [rd for rd in reads_se if rd.mate_is_unmapped]
asonly = [rd for rd in reads_as if rd.mate_is_unmapped]
seunmap = [rd.qname for rd in seonly]
asunmap = [rd.qname for rd in asonly]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#find read sequences
# Walk the unmapped-read BAM once and write, in FASTQ format, every read whose
# name matches a sense-strand (se) or antisense-strand (as) anchored read.
print('Extracting unmapped read sequences.')
print('mate unmapped reads for sense strand: %d' % len(seonly))
print('mate unmapped reads for antisense strand: %d' % len(asonly))

# Sets give O(1) membership tests; the BAM scanned below can be very large,
# so testing each record against plain lists would be quadratic.
se_names = set(seunmap)
as_names = set(asunmap)

samfile = pysam.Samfile(unmapbam, 'rb')
try:
    with open('%s/%s_se_unmap.fq' % (outdir, target), 'w') as resfq_a, \
         open('%s/%s_as_unmap.fq' % (outdir, target), 'w') as resfq_b:
        for item in samfile:
            qname = item.qname
            in_se = qname in se_names
            in_as = qname in as_names
            if not (in_se or in_as):
                continue
            record = '@%s\n%s\n+\n%s\n' % (qname, item.seq, item.qual)
            # A name present in both groups is written to both files.
            if in_se:
                resfq_a.write(record)
            if in_as:
                resfq_b.write(record)
finally:
    # Close the BAM even when writing fails.
    samfile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
##indexing junction db
print('Aligning reads to junction db')
# NOTE(review): paths are interpolated unquoted into a shell command line.
cmd = '%s index %s/%s' % (bwa, outdir, juncfile)
cmdout = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          shell=True, universal_newlines=True)
# communicate() drains the pipe while waiting, so the child can never block
# on a full pipe, and it returns as soon as the command finishes.
cmdlog = cmdout.communicate()[0]
if cmdout.returncode != 0:
    # Surface the tool's own diagnostics before failing.
    sys.stderr.write(cmdlog)
    raise Exception('Error building junction db index')
print('Junction DB indexed.')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
206
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# File-name stems of the sense (se) and antisense (as) read groups.
taga = '%s_se' % target
tagb = '%s_as' % target
for rs in [taga, tagb]:
    # Each group is aligned to the junction db (bwa aln) and the alignment is
    # then converted to SAM (bwa samse). Every step is a tuple of
    # (shell command, success message, failure message).
    # NOTE(review): paths are interpolated unquoted into shell command lines.
    steps = [
        ('%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai' % (
            bwa, mm, outdir, juncfile, outdir, rs, outdir, rs),
         'Aligned unmapped reads group %s' % rs,
         'Error aligning unmapped reads for group %s' % rs),
        ('%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam' % (
            bwa, outdir, juncfile, outdir, rs, outdir, rs, outdir, rs),
         'Converting to sam for group %s' % rs,
         'Error converting to sam for group %s' % rs),
    ]
    for cmd, done_msg, fail_msg in steps:
        cmdout = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                  shell=True, universal_newlines=True)
        # communicate() drains the pipe while waiting, so the child can never
        # block on a full pipe, and it returns as soon as the command ends.
        cmdlog = cmdout.communicate()[0]
        if cmdout.returncode != 0:
            # Surface the tool's own diagnostics before failing.
            sys.stderr.write(cmdlog)
            raise Exception(fail_msg)
        print(done_msg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
233
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Collect the reads that aligned to a junction, together with the junction
# (reference) name each one hit. Only reverse-strand hits are kept for the
# sense group and only forward-strand hits for the antisense group.
collected = []
for group_tag, want_reverse in ((taga, True), (tagb, False)):
    group_reads = []
    group_juncs = []
    samfile = pysam.Samfile('%s/%s_unmap.sam' % (outdir, group_tag), 'r')
    for rd in samfile:
        if rd.is_unmapped or bool(rd.is_reverse) != want_reverse:
            continue
        group_reads.append(rd)
        group_juncs.append(samfile.getrname(rd.tid))
    samfile.close()
    collected.append((group_reads, group_juncs))

qualrd_a, junc_a = collected[0]
qualrd_b, junc_b = collected[1]

# Parallel lists: junc_span[k] is a read, junc_name[k] the junction it hit.
junc_span = qualrd_a + qualrd_b
junc_name = junc_a + junc_b
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
258
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#Generate a summary report
# Count the reads per junction in one pass (list.count() per distinct name
# would be quadratic in the number of junction reads).
junc_counts = {}
for jname in junc_name:
    junc_counts[jname] = junc_counts.get(jname, 0) + 1
# Most-supported junctions first; ties are ordered by name so that the report
# is reproducible from run to run.
juncol = sorted(([jname, nn] for jname, nn in junc_counts.items()),
                key=lambda x: (-x[1], x[0]))

with open('%s/%s.GUESS-IF.summary.txt' % (outdir, target), 'w') as sumfile:
    sumfile.write('%s\n' % (target))
    sumfile.write('\n')

    # One line per junction-spanning read: read name, junction, mismatches.
    sumfile.write('>spanning\n')
    for rd, jname in zip(junc_span, junc_name):
        mm_j = [x[1] for x in rd.tags if x[0] == 'NM'][0]
        ss = '%s\t%s.mm%d' % (rd.qname, jname, mm_j)
        sumfile.write('%s\n' % ss)

    # One line per distinct junction with its supporting read count.
    sumfile.write('\n')
    sumfile.write('>junction\n')
    for item in juncol:
        sumfile.write('%s\t%s\n' % (item[0], item[1]))

    sumfile.write('\n')
    sumfile.write('>summary\n')
    sumfile.write('Number of Fusion Reads = %d\n' % len(junc_span))
    sumfile.write('Number of Distinct Junctions = %d\n' % len(junc_counts))

print('Done: %s' % time.ctime())
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
288