Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-guess-ft @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #GUESS-ft:General UsEr defined Supervised Search for fusion transcript. | |
| 4 #This program is to perform targeted search of gene fusions from RNAseq data. | |
| 5 #It uses samtools, pysam package. It requires a bam file and other reference files. | |
| 6 #Note that it is a less accurate tool than prada-fusion. | |
| 7 | |
| 8 import pysam | |
| 9 import subprocess | |
| 10 import os | |
| 11 import time | |
| 12 import sys | |
| 13 import bioclass | |
| 14 import ioprada | |
| 15 | |
| 16 args=sys.argv | |
| 17 | |
| 18 #Get all parameters | |
| 19 help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n | |
| 20 **Parameters**: | |
| 21 -conf the configure file. see prada-fusion -conf for details | |
| 22 -inputbam the input bam file | |
| 23 -mm number of mismatch allowed | |
| 24 -minmapq mininum mapping quality for reads to be used in fusion finding | |
| 25 -junL length of exons to be used for junctions. see prada-fusion | |
| 26 -outdir output directory | |
| 27 -unmap the numapped reads. useful if user need to run guess-ft multiple times | |
| 28 ''' | |
| 29 | |
| 30 if '-h' in args or '-help' in args or len(args)==1: | |
| 31 print help_menu | |
| 32 sys.exit(0) | |
| 33 | |
| 34 ################################################################################## | |
| 35 Gene_A=args[1] | |
| 36 Gene_B=args[2] | |
| 37 if '-inputbam' not in args: | |
| 38 sys.exit('Input BAM needed') | |
| 39 else: | |
| 40 i=args.index('-inputbam') | |
| 41 sourcefile=args[i+1] | |
| 42 if '-outdir' not in args: | |
| 43 outdir='./' | |
| 44 else: | |
| 45 i=args.index('-outdir') | |
| 46 outdir=args[i+1] | |
| 47 if not os.path.exists(outdir): | |
| 48 os.mkdir(outdir) | |
| 49 if '-unmap' not in args: #get unmapped.bam yourself | |
| 50 unmapbam='%s/one.end.unmapped.bam'%outdir | |
| 51 extract_mask=1 | |
| 52 else: | |
| 53 i=args.index('-unmap') | |
| 54 unmapbam=args[i+1] | |
| 55 extract_mask=0 | |
| 56 if '-mm' not in args: | |
| 57 mm=1 | |
| 58 else: | |
| 59 i=args.index('-mm') | |
| 60 mm=int(args[i+1]) | |
| 61 #minimum mapping quality for reads as fusion evidences | |
| 62 if '-minmapq' not in args: | |
| 63 minmapq=30 | |
| 64 else: | |
| 65 i=args.index('-minmapq') | |
| 66 minmapq=int(args[i+1]) | |
| 67 | |
| 68 if '-junL' not in args: | |
| 69 sys.exit('-junL needed') | |
| 70 else: | |
| 71 i=args.index('-junL') | |
| 72 junL=int(args[i+1]) | |
| 73 | |
| 74 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
| 75 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
| 76 | |
| 77 if '-conf' in args: | |
| 78 i=args.index('-conf') | |
| 79 reffile=args[i+1] | |
| 80 if os.path.exists(reffile): | |
| 81 pass | |
| 82 else: | |
| 83 for pth in ref_search_path: | |
| 84 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
| 85 if os.path.exists(new_reffile): | |
| 86 reffile=new_reffile | |
| 87 break | |
| 88 else: | |
| 89 sys.exit('ERROR: ref file %s not found'%reffile) | |
| 90 else: | |
| 91 reffile='%s/conf.txt'%prada_path | |
| 92 if not os.path.exists(reffile): | |
| 93 sys.exit('ERROR: No default conf.txt found and none specified') | |
| 94 | |
| 95 #reference files | |
| 96 refdict=ioprada.read_conf(reffile) | |
| 97 ref_anno=refdict['--REF--']['ref_anno'] | |
| 98 ref_map=refdict['--REF--']['ref_map'] | |
| 99 ref_fasta=refdict['--REF--']['ref_fasta'] | |
| 100 featurefile=refdict['--REF--']['feature_file'] | |
| 101 | |
| 102 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path | |
| 103 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path | |
| 104 | |
| 105 print 'GUESS start: %s'%time.ctime() | |
| 106 print 'CMD: %s'%('\t'.join(args)) | |
| 107 | |
| 108 ################################################################################## | |
| 109 ##get gene position information | |
| 110 | |
| 111 gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B) | |
| 112 if gA is None: | |
| 113 sys.exit('%s not found'%Gene_A) | |
| 114 if gB is None: | |
| 115 sys.exit('%s not found'%Gene_B) | |
| 116 | |
| 117 #Generate unmapped reads. | |
| 118 if extract_mask: | |
| 119 cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) | |
| 120 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 121 while True: | |
| 122 if cmdout.poll() is None: | |
| 123 print 'Extracting unmapped reads...' | |
| 124 time.sleep(120) | |
| 125 pass | |
| 126 if cmdout.poll()==0: | |
| 127 print 'Extracted unmapped reads' | |
| 128 break | |
| 129 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 130 raise Exception('Error extracting unmapped reads') | |
| 131 | |
| 132 #Generate junction db | |
| 133 juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B) | |
| 134 cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) | |
| 135 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 136 while True: | |
| 137 if cmdout.poll() is None: | |
| 138 print 'Generating junction db. Waiting...' | |
| 139 time.sleep(20) | |
| 140 pass | |
| 141 if cmdout.poll()==0: | |
| 142 print 'Generated Junction DB.' | |
| 143 break | |
| 144 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 145 raise Exception('Error generated Junction DB.') | |
| 146 | |
| 147 #read into the sam file, get gene-strand information | |
| 148 print 'Finding discordant read pairs.' | |
| 149 samfile = pysam.Samfile(sourcefile, "rb" ) | |
| 150 | |
| 151 #mapping quality based read collection | |
| 152 #strand information considered | |
| 153 #mismatch filter considered | |
| 154 a_reads=[] | |
| 155 b_reads=[] | |
| 156 for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end): | |
| 157 if alignedread.mapq >= minmapq: | |
| 158 readastrd='-1' if alignedread.is_reverse else '1' | |
| 159 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] | |
| 160 if readastrd==gA.strand and mmf <= mm: | |
| 161 a_reads.append(alignedread) | |
| 162 | |
| 163 for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end): | |
| 164 if alignedread.mapq >= minmapq: | |
| 165 readbstrd='-1' if alignedread.is_reverse else '1' | |
| 166 mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] | |
| 167 if readbstrd != gB.strand and mmf <= mm: | |
| 168 b_reads.append(alignedread) | |
| 169 samfile.close() | |
| 170 | |
| 171 ar_ids=[x.qname for x in a_reads] | |
| 172 br_ids=[x.qname for x in b_reads] | |
| 173 disc=list(set(ar_ids).intersection(set(br_ids))) #discordant pairs | |
| 174 | |
| 175 #detect read pair that one end map to one partner, yet the other does not align | |
| 176 print 'Determining potential junction spanning reads.' | |
| 177 rpaonly=[] #reads that only map to gene A -- going to map the other end to junctions | |
| 178 for rd in a_reads: | |
| 179 if rd.mate_is_unmapped: | |
| 180 rpaonly.append(rd) | |
| 181 rpbonly=[] #reads that only map to gene B -- going to map the other end to junctions | |
| 182 for rd in b_reads: | |
| 183 if rd.mate_is_unmapped: | |
| 184 rpbonly.append(rd) | |
| 185 rpaonly_names=[x.qname for x in rpaonly] | |
| 186 rpbonly_names=[x.qname for x in rpbonly] | |
| 187 | |
| 188 #find read sequences | |
| 189 print 'Extracting unmapped read sequences.' | |
| 190 print 'mate unmapped read for gene A:',len(rpaonly) | |
| 191 print 'mate unmapped read for gene B:',len(rpbonly) | |
| 192 samfile=pysam.Samfile(unmapbam,'rb') | |
| 193 taga='%s-%s_a'%(Gene_A,Gene_B) | |
| 194 tagb='%s-%s_b'%(Gene_A,Gene_B) | |
| 195 resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w') | |
| 196 resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w') | |
| 197 for item in samfile: | |
| 198 if item.qname in rpaonly_names: | |
| 199 resfq_a.write('@%s\n'%item.qname) | |
| 200 resfq_a.write('%s\n'%item.seq) | |
| 201 resfq_a.write('+\n') | |
| 202 resfq_a.write('%s\n'%item.qual) | |
| 203 if item.qname in rpbonly_names: | |
| 204 resfq_b.write('@%s\n'%item.qname) | |
| 205 resfq_b.write('%s\n'%item.seq) | |
| 206 resfq_b.write('+\n') | |
| 207 resfq_b.write('%s\n'%item.qual) | |
| 208 resfq_a.close() | |
| 209 resfq_b.close() | |
| 210 samfile.close() | |
| 211 | |
| 212 ##indexing junction db | |
| 213 print 'Aligning reads to junction db' | |
| 214 cmd='%s index %s/%s'%(bwa,outdir,juncfile) | |
| 215 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 216 while True: | |
| 217 if cmdout.poll() is None: | |
| 218 time.sleep(3) | |
| 219 pass | |
| 220 if cmdout.poll()==0: | |
| 221 print 'Junction DB indexed.' | |
| 222 break | |
| 223 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 224 raise Exception('Error building junction db index') | |
| 225 | |
| 226 ##align the unmapped reads to junction database | |
| 227 for rs in [taga,tagb]: | |
| 228 cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) | |
| 229 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 230 while True: | |
| 231 if cmdout.poll() is None: | |
| 232 time.sleep(5) | |
| 233 pass | |
| 234 if cmdout.poll()==0: | |
| 235 print 'Aligned unmapped reads group %s'%rs | |
| 236 break | |
| 237 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 238 raise Exception('Error aligning unmapped reads for group %s'%rs) | |
| 239 | |
| 240 cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) | |
| 241 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) | |
| 242 while True: | |
| 243 if cmdout.poll() is None: | |
| 244 time.sleep(2) | |
| 245 pass | |
| 246 if cmdout.poll()==0: | |
| 247 print 'Converting to sam for group %s'%rs | |
| 248 break | |
| 249 if cmdout.poll() is not None and cmdout.poll() != 0: | |
| 250 raise Exception('Error converting to sam for group %s'%rs) | |
| 251 | |
| 252 #parse results | |
| 253 qualrd_a=[] | |
| 254 junc_a=[] | |
| 255 samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r') | |
| 256 for rd in samfile: | |
| 257 if not rd.is_unmapped and rd.is_reverse: | |
| 258 qualrd_a.append(rd) | |
| 259 junc_a.append(samfile.getrname(rd.tid)) | |
| 260 samfile.close() | |
| 261 qualrd_b=[] | |
| 262 junc_b=[] | |
| 263 samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r') | |
| 264 for rd in samfile: | |
| 265 if not rd.is_unmapped and not rd.is_reverse: | |
| 266 qualrd_b.append(rd) | |
| 267 junc_b.append(samfile.getrname(rd.tid)) | |
| 268 samfile.close() | |
| 269 | |
| 270 junc_span=[] | |
| 271 junc_span.extend(qualrd_a) | |
| 272 junc_span.extend(qualrd_b) | |
| 273 | |
| 274 junc_name=[] | |
| 275 junc_name.extend(junc_a) | |
| 276 junc_name.extend(junc_b) | |
| 277 | |
| 278 #Generate a summary report | |
| 279 sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w') | |
| 280 sumfile.write('%s\t%s\n'%(Gene_A,Gene_B)) | |
| 281 sumfile.write('\n') | |
| 282 sumfile.write('>discordant\n') | |
| 283 for rdname in disc: | |
| 284 ia=ar_ids.index(rdname) | |
| 285 ib=br_ids.index(rdname) | |
| 286 reada=a_reads[ia] | |
| 287 readb=b_reads[ib] | |
| 288 mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0] | |
| 289 mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0] | |
| 290 ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a) #pysam use 0-based coordinates | |
| 291 ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b) | |
| 292 sumfile.write('%s\n'%ss1) | |
| 293 sumfile.write('%s\n'%ss2) | |
| 294 | |
| 295 sumfile.write('\n') | |
| 296 sumfile.write('>spanning\n') | |
| 297 for i in range(len(junc_span)): | |
| 298 rd=junc_span[i] | |
| 299 jname=junc_name[i] | |
| 300 mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] | |
| 301 ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) | |
| 302 sumfile.write('%s\n'%ss) | |
| 303 | |
| 304 sumfile.write('\n') | |
| 305 sumfile.write('>junction\n') | |
| 306 juncol=[] | |
| 307 for item in set(junc_name): | |
| 308 nn=junc_name.count(item) | |
| 309 juncol.append([item,nn]) | |
| 310 juncol=sorted(juncol,key=lambda x:x[1],reverse=True) | |
| 311 for item in juncol: | |
| 312 sumfile.write('%s\t%s\n'%(item[0],item[1])) | |
| 313 | |
| 314 sumfile.write('\n') | |
| 315 sumfile.write('>summary\n') | |
| 316 sumfile.write('Number of Discordant Pairs = %d\n'%len(disc)) | |
| 317 sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) | |
| 318 sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) | |
| 319 | |
| 320 sumfile.close() | |
| 321 print 'Done: %s'%time.ctime() |
