Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-fusion @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children | f17965495ec9 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #PRADA: Pipeline for RnAseq Data Analysis | |
| 4 #Fusion detection module, algorithm published by Michael Berger et al (Genome Res, 2010) at Broad Institute. | |
| 5 #Implemented by Siyuan Zheng, Wandaliz Torres-Garcia and Rahul Vegesna. | |
| 6 #Copy Right: The Univ of Texas MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology | |
| 7 #Contact: Roel Verhaak (rverhaak@mdanderson.org) | |
| 8 #Citation: to be added | |
| 9 #Tested with Python v2.6 and v2.7 | |
| 10 #The program requires NM tag and high quality mapping reads with mapping score more than -minmapq. | |
| 11 #Last modified: 04/11/2013 | |
| 12 | |
| 13 ###################################################################################### | |
| 14 import sys | |
| 15 import time | |
| 16 import os | |
| 17 import os.path | |
| 18 import subprocess | |
| 19 import operator | |
| 20 import pysam | |
| 21 import bioclass | |
| 22 import gfclass | |
| 23 import ioprada | |
| 24 import privutils | |
| 25 from Bio import SeqIO,Seq | |
| 26 | |
| 27 ###################################################################################### | |
| 28 args=sys.argv | |
| 29 | |
| 30 help_menu='''\nPipeline for RNAseq Data Analaysis - fusion detection (PRADA). | |
| 31 **Command**: | |
| 32 prada-fusion -bam XX.bam -conf xx.txt -tag XX -mm 1 -junL XX -minmapq 30 -outdir ./ | |
| 33 **Parameters**: | |
| 34 -h print help message | |
| 35 -bam input BAM file, must has a .bam suffix. BAM is the output from PRADA preprocess module. | |
| 36 -conf config file for references and parameters. Use conf.txt in py-PRADA installation folder if none specified. | |
| 37 -tag a tag to describe the sample, used to name output files. Default is ''. | |
| 38 -mm number of mismatches allowed in discordant pairs and fusion spanning reads.Default is 1. | |
| 39 -junL length of sequences taken from EACH side of exons when making hypothetical junctions. No default. | |
| 40 -minmapq minimum read mapping quality to be considered as fusion evidences. Default is 30. | |
| 41 -outdir output directory. | |
| 42 -v print version | |
| 43 ''' | |
| 44 | |
| 45 if '-h' in args or '-help' in args or len(args)==1: | |
| 46 print help_menu | |
| 47 sys.exit(0) | |
| 48 | |
| 49 if '-v' in args: | |
| 50 import version | |
| 51 print version.version | |
| 52 sys.exit(0) | |
| 53 | |
| 54 if '-bam' not in args: | |
| 55 sys.exit('ERROR: BAM file needed') | |
| 56 i=args.index('-bam') | |
| 57 bampath=args[i+1] | |
| 58 bampath=os.path.abspath(bampath) | |
| 59 bam=os.path.basename(bampath) | |
| 60 if bam[-4:] != '.bam': | |
| 61 sys.exit('ERROR: BAM file must have suffix .bam') | |
| 62 | |
| 63 #Mismatch allowed. This filter is applied at the very end of the pipeline. | |
| 64 #I strongly suggest 1. We also record how many junction spanning reads (JSRs) are perfect matched. | |
| 65 if '-mm' not in args: | |
| 66 mm=1 | |
| 67 else: | |
| 68 i=args.index('-mm') | |
| 69 mm=int(args[i+1]) | |
| 70 | |
| 71 #junL should be less than the read length in the dataset. I suggest it to be read_length*0.8 | |
| 72 if '-junL' not in args: | |
| 73 sys.exit('ERROR: junL must be specified') | |
| 74 i=args.index('-junL') | |
| 75 overlap=int(args[i+1]) | |
| 76 | |
| 77 #minimum mapping quality for reads as fusion evidences | |
| 78 if '-minmapq' not in args: | |
| 79 minmapq=30 | |
| 80 else: | |
| 81 i=args.index('-minmapq') | |
| 82 minmapq=int(args[i+1]) | |
| 83 | |
| 84 if '-outdir' not in args: | |
| 85 outpath=os.path.abspath('./') | |
| 86 else: | |
| 87 i=args.index('-outdir') | |
| 88 outpath=os.path.abspath(args[i+1]) | |
| 89 if os.path.exists(outpath): | |
| 90 print 'WARNING: outdir %s exists'%outpath | |
| 91 else: | |
| 92 os.mkdir(outpath) | |
| 93 | |
| 94 if '-tag' not in args: | |
| 95 docstring='prada' | |
| 96 else: | |
| 97 i=args.index('-tag') | |
| 98 docstring=args[i+1] | |
| 99 | |
| 100 #by default, search conf.txt in the prada folder | |
| 101 prada_path=os.path.dirname(os.path.abspath(__file__)) #path to find libraries and executives. | |
| 102 ref_search_path=[prada_path,os.getcwd()] #search path for conf file if not specified in command line | |
| 103 | |
| 104 if '-conf' in args: | |
| 105 i=args.index('-ref') | |
| 106 reffile=args[i+1] | |
| 107 if os.path.exists(reffile): | |
| 108 pass | |
| 109 else: | |
| 110 for pth in ref_search_path: | |
| 111 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
| 112 if os.path.exists(new_reffile): | |
| 113 reffile=new_reffile | |
| 114 break | |
| 115 else: | |
| 116 sys.exit('ERROR: conf file %s not found'%reffile) | |
| 117 else: | |
| 118 reffile='%s/conf.txt'%prada_path | |
| 119 if not os.path.exists(reffile): | |
| 120 sys.exit('ERROR: No default conf.txt found and none specified') | |
| 121 | |
| 122 #Now print all input parameters | |
| 123 print 'CMD: %s'%('\t'.join(args)) | |
| 124 | |
| 125 #reference files pointing to the annotation files. | |
| 126 refdict=ioprada.read_conf(reffile) | |
| 127 featurefile=refdict['--REF--']['feature_file'] | |
| 128 txseqfile=refdict['--REF--']['tx_seq_file'] | |
| 129 txcatfile=refdict['--REF--']['txcat_file'] | |
| 130 cdsfile=refdict['--REF--']['cds_file'] | |
| 131 | |
| 132 #underlying utilities, automatically detected | |
| 133 #these are customized tools. update is needed. | |
| 134 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path | |
| 135 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path | |
| 136 blastn='%s/tools/blastn'%prada_path | |
| 137 | |
| 138 ###################################################################################### | |
| 139 print 'step 0: loading gene annotations @ %s'%time.ctime() | |
| 140 #call functions in ioprada module // | |
| 141 txdb,genedb=ioprada.read_feature(featurefile,verbose=True) | |
| 142 tx_primary=ioprada.read_tx_cat(txcatfile) | |
| 143 tx_cds=ioprada.read_cds(cdsfile) | |
| 144 | |
| 145 ###################################################################################### | |
| 146 print 'step 1: finding discordant pairs @ %s'%time.ctime() | |
| 147 | |
| 148 #We sift through all exons of protein coding genes and get the mapping reads. | |
| 149 #Within, we exclude low mapping quality reads and PCR duplicates. For pairs that the two ends | |
| 150 #map to different genes, we all it a discordant pair. | |
| 151 #This is a step for finding all possible candidate fusions. | |
| 152 | |
| 153 samfile=pysam.Samfile(bampath,'rb') | |
| 154 | |
| 155 read1_ab={} | |
| 156 read2_ab={} | |
| 157 db1={} | |
| 158 db2={} | |
| 159 | |
| 160 i,N=0,len(genedb) | |
| 161 for gene in genedb: | |
| 162 i+=1 | |
| 163 if i%200==0: | |
| 164 print '%d/%d genes processed for discordant pairs'%(i,N) | |
| 165 g=genedb[gene] | |
| 166 exons=g.get_exons() | |
| 167 for e in exons.values(): | |
| 168 for rd in samfile.fetch(e.chr,e.start-1,e.end): | |
| 169 if rd.mapq < minmapq: | |
| 170 continue | |
| 171 if rd.is_duplicate: | |
| 172 continue | |
| 173 if rd.mate_is_unmapped: #at this point, only consider pairs | |
| 174 continue | |
| 175 if rd.rnext == rd.tid and rd.mpos <= g.end and rd.mpos >= g.start-1: #remove reads that fall into the same gene range | |
| 176 continue | |
| 177 if rd.is_read1: | |
| 178 if read1_ab.has_key(rd.qname): | |
| 179 read1_ab[rd.qname].add(gene) | |
| 180 else: | |
| 181 read1_ab[rd.qname]=set([gene]) | |
| 182 db1[rd.qname]=rd | |
| 183 if rd.is_read2: | |
| 184 if read2_ab.has_key(rd.qname): | |
| 185 read2_ab[rd.qname].add(gene) | |
| 186 else: | |
| 187 read2_ab[rd.qname]=set([gene]) | |
| 188 db2[rd.qname]=rd | |
| 189 | |
| 190 ##output the discordant pairs and determine the orientation of candidate fusions | |
| 191 discordant={} #catalogue all discordant pairs, using gene pairs as keys | |
| 192 outfile=open('%s/%s.discordant.txt'%(outpath,docstring),'w') | |
| 193 title=['read','gene1','gene1_chr','read1_pos','read1_mm','read1_strand','read1_orient','gene2',\ | |
| 194 'gene2_chr','read2_pos','read2_mm','read2_strand','read2_orient'] | |
| 195 outfile.write('%s\n'%('\t'.join(title))) | |
| 196 i=0 | |
| 197 for rdid in read1_ab: | |
| 198 i+=1 | |
| 199 if i%10000==0: | |
| 200 print '%d discordant pairs processed'%i | |
| 201 if not read2_ab.has_key(rdid): #skip if not all ends are catalogued | |
| 202 continue | |
| 203 g1set=read1_ab[rdid] #consider all combinations if a read maps to multiple genes | |
| 204 g2set=read2_ab[rdid] #consider all combinations if a read maps to multiple genes | |
| 205 r1,r2=db1[rdid],db2[rdid] | |
| 206 read1strd='-1' if r1.is_reverse else '1' | |
| 207 read2strd='-1' if r2.is_reverse else '1' | |
| 208 for g1 in g1set: | |
| 209 for g2 in g2set: | |
| 210 if g1==g2: #for some uncasual cases | |
| 211 continue | |
| 212 g1obj,g2obj=genedb[g1],genedb[g2] | |
| 213 read1orient='F' if read1strd == g1obj.strand else 'R' #read1 --> gene1 | |
| 214 read2orient='F' if read2strd == g2obj.strand else 'R' #read2 --> gene2 | |
| 215 fkey='' | |
| 216 if read1orient=='F' and read2orient=='R': ##scenario I, gene1-gene2 | |
| 217 fkey=g1+'_'+g2 | |
| 218 if read1orient=='R' and read2orient=='F': ##scenario II, gene2-gene1 | |
| 219 fkey=g2+'_'+g1 | |
| 220 if fkey: | |
| 221 if discordant.has_key(fkey): | |
| 222 discordant[fkey].update({rdid:'%s:%s'%(read1orient,read2orient)}) | |
| 223 else: | |
| 224 discordant[fkey]={rdid:'%s:%s'%(read1orient,read2orient)} | |
| 225 ##output | |
| 226 nm1=str([x[1] for x in r1.tags if x[0]=='NM'][0]) #output mismatch, but does not consider it at this point | |
| 227 nm2=str([x[1] for x in r2.tags if x[0]=='NM'][0]) | |
| 228 uvec=[rdid,g1,g1obj.chr,str(r1.pos+1),nm1,read1strd,read1orient,g2,g2obj.chr,str(r2.pos+1),nm2,read2strd,read2orient] | |
| 229 outfile.write('%s\n'%('\t'.join(uvec))) | |
| 230 outfile.close() | |
| 231 | |
| 232 ########################################################################## | |
| 233 print 'step 2: finding recurrent pairs (candidates) @ %s'%time.ctime() | |
| 234 | |
| 235 #step 2 finds all candidates that have at least 2 discordant pairs. Meanwhile, filter out potential | |
| 236 #read through events. read through is defined as reads with mapping position less than 1M, while meeting | |
| 237 #the strand expectation. | |
| 238 | |
| 239 guess=[] | |
| 240 outfile=open('%s/%s.recurrent.txt'%(outpath,docstring),'w') | |
| 241 title=['geneA','geneA_chr','geneB','geneB_chr','num_pairs','IDs'] | |
| 242 outfile.write('%s\n'%('\t'.join(title))) | |
| 243 for pp in discordant: | |
| 244 if len(discordant[pp]) < 2: #consider only "recurrent" (more than 1 pair support) cases | |
| 245 continue | |
| 246 gene1,gene2=pp.split('_') | |
| 247 g1obj,g2obj=genedb[gene1],genedb[gene2] | |
| 248 rdset=discordant[pp].keys() | |
| 249 #filter read-through | |
| 250 #readthrough is defined at read level, regardless of mapping genes | |
| 251 for rd in rdset: | |
| 252 r1,r2=db1[rd],db2[rd] | |
| 253 read1strd='-1' if r1.is_reverse else '1' | |
| 254 read2strd='-1' if r2.is_reverse else '1' | |
| 255 readthrough=False | |
| 256 if db1[rd].tid == db2[rd].tid and abs(db1[rd].pos - db2[rd].pos) <= 1000000: | |
| 257 if discordant[pp][rd]=='F:R': | |
| 258 if read1strd=='1' and read2strd=='-1' and db1[rd].pos < db2[rd].pos: | |
| 259 readthrough=True | |
| 260 if read1strd=='-1' and read2strd=='1' and db1[rd].pos > db2[rd].pos: | |
| 261 readthrough=True | |
| 262 if discordant[pp][rd]=='R:F': | |
| 263 if read2strd=='1' and read1strd=='-1' and db2[rd].pos < db1[rd].pos: | |
| 264 readthrough=True | |
| 265 if read2strd=='-1' and read1strd=='1' and db2[rd].pos > db1[rd].pos: | |
| 266 readthrough=True | |
| 267 if readthrough: | |
| 268 del discordant[pp][rd] #in-place deletion!!!! Change the discordant variable in place! | |
| 269 if len(discordant[pp]) < 2: #skip all that have less than 2 supporting discordant read pairs after readthrough filter | |
| 270 continue | |
| 271 guess.append(pp) | |
| 272 #output | |
| 273 uvec2=[gene1,g1obj.chr,gene2,g2obj.chr,str(len(discordant[pp])),'|'.join(discordant[pp])] | |
| 274 outfile.write('%s\n'%('\t'.join(uvec2))) | |
| 275 outfile.close() | |
| 276 | |
| 277 ########################################################################## | |
| 278 print 'step 3: finding potential junction spanning reads @ %s'%time.ctime() | |
| 279 | |
| 280 #For all candidates, find potential junction spanning reads (JSRs). A JSR is defined as a unmapped read but with the mate mapping | |
| 281 #to either F or R partner, with high mapping quality. Since the JSR is unmapped, it is not practical to consider PCR duplicate | |
| 282 #because they are not properly marked. | |
| 283 | |
| 284 Fpartners=set() | |
| 285 Rpartners=set() | |
| 286 for pp in guess: | |
| 287 gs=pp.split('_') | |
| 288 Fpartners.add(gs[0]) | |
| 289 Rpartners.add(gs[1]) | |
| 290 AllPartners=Fpartners|Rpartners | |
| 291 | |
| 292 samfile.reset() | |
| 293 posjun={} ##catalogue all JSRs, with track of the mate mapping genes and orientation. | |
| 294 i,N=0,len(AllPartners) | |
| 295 for gene in AllPartners: | |
| 296 i+=1 | |
| 297 if i%200==0: | |
| 298 print '%d/%d genes processed for potential junc reads'%(i,N) | |
| 299 g=genedb[gene] | |
| 300 exons=g.get_exons().values() | |
| 301 for e in exons: | |
| 302 for rd in samfile.fetch(e.chr,e.start-1,e.end): | |
| 303 if rd.mapq < minmapq: | |
| 304 continue | |
| 305 if not rd.mate_is_unmapped: | |
| 306 continue | |
| 307 readstrd='-1' if rd.is_reverse else '1' | |
| 308 orient='F' if readstrd == g.strand else 'R' #mapping info of mate read. JSR per se is unmapped in BAM | |
| 309 posjun[rd.qname]={'gene':gene,'orient':orient} | |
| 310 | |
| 311 samfile.reset() | |
| 312 outfile=open('%s/%s.pos_junc_unmapped.fastq'%(outpath,docstring),'w') | |
| 313 i,N=0,len(posjun) | |
| 314 for rd in samfile: | |
| 315 if rd.mate_is_unmapped: #since the read is potential jun spanning read, all mate map to A or B | |
| 316 continue | |
| 317 if rd.is_unmapped: | |
| 318 if posjun.has_key(rd.qname): | |
| 319 i+=1 | |
| 320 if i%10000==0: | |
| 321 print 'extracted %d/%d potential junc reads'%(i,N) | |
| 322 rdname='%s_prada_%s_prada_%s'%(rd.qname,posjun[rd.qname]['gene'],posjun[rd.qname]['orient']) #_prada_ as split | |
| 323 outfile.write('@%s\n'%rdname) | |
| 324 outfile.write('%s\n'%rd.seq) | |
| 325 outfile.write('+\n') | |
| 326 outfile.write('%s\n'%rd.qual) | |
| 327 outfile.close() | |
| 328 | |
| 329 ###################################################################################### | |
| 330 print 'step 4: building junction database @ %s'%time.ctime() | |
| 331 | |
| 332 #Make hypothetical junctions between candidate fusion partners. To improve speed, we make a big junction database comprising | |
| 333 #exons of all candidates, instead of by candidate individually. This also gives the possibility to assess the JSR mapping ambiguity | |
| 334 #across many junctions. It turned out very useful in filtering out false positives. | |
| 335 #Note that in PRADA transcript sequence file, all sequences are + strand sequences. For - strand transcript, one need to | |
| 336 #reverse complement the sequence to get the real transcript sequences. | |
| 337 | |
| 338 seqdb={} | |
| 339 for record in SeqIO.parse(txseqfile,'fasta'): | |
| 340 seqdb[record.name]=record | |
| 341 | |
| 342 outfile=open('%s/%s.junction.fasta'%(outpath,docstring),'w') | |
| 343 i,N=0,len(guess) | |
| 344 for pp in guess: | |
| 345 i+=1 | |
| 346 if i%100==0: | |
| 347 print 'building junction for %d/%d pairs'%(i,N) | |
| 348 gene1,gene2=pp.split('_') | |
| 349 g1obj,g2obj=genedb[gene1],genedb[gene2] | |
| 350 eset1=g1obj.get_exons() #unique exons in gene 1 | |
| 351 eset2=g2obj.get_exons() #unique exons in gene 2 | |
| 352 #collect unique junctions | |
| 353 juncseqdict={} #save junction sequences | |
| 354 for e1 in eset1.values(): | |
| 355 for e2 in eset2.values(): | |
| 356 e1_jun_name='%s:%s:%s'%(gene1,e1.chr,e1.end) if e1.strand=='1' else '%s:%s:%s'%(gene1,e1.chr,e1.start) | |
| 357 e2_jun_name='%s:%s:%s'%(gene2,e2.chr,e2.start) if e2.strand=='1' else '%s:%s:%s'%(gene2,e2.chr,e2.end) | |
| 358 jun_name=e1_jun_name+'_'+e2_jun_name | |
| 359 tx1,tx2=txdb[e1.transcript],txdb[e2.transcript] | |
| 360 max_a=tx1.exon_relative_pos()[e1.name][1] | |
| 361 min_a=0 if max_a - overlap < 0 else max_a - overlap | |
| 362 min_b=tx2.exon_relative_pos()[e2.name][0]-1 | |
| 363 max_b=tx2.length if min_b + overlap > tx2.length else min_b + overlap | |
| 364 #reverse complementary when necessary | |
| 365 try: | |
| 366 tx1seq=seqdb[tx1.name].seq.tostring() if tx1.strand=='1' else seqdb[tx1.name].reverse_complement().seq.tostring() | |
| 367 tx2seq=seqdb[tx2.name].seq.tostring() if tx2.strand=='1' else seqdb[tx2.name].reverse_complement().seq.tostring() | |
| 368 except KeyError: #in case transcript not found in sequence file | |
| 369 continue | |
| 370 jun_seq=tx1seq[min_a:max_a]+tx2seq[min_b:max_b] | |
| 371 juncseqdict[jun_name]=jun_seq | |
| 372 for junc in juncseqdict: | |
| 373 outfile.write('>%s\n'%junc) | |
| 374 outfile.write('%s\n'%juncseqdict[junc]) | |
| 375 outfile.close() | |
| 376 samfile.close() | |
| 377 | |
| 378 #for memory efficiecy, del seqdb | |
| 379 del seqdb | |
| 380 | |
| 381 ######################################################################################## | |
| 382 print 'step 5: aligning potential junction reads to junction database @ %s'%time.ctime() | |
| 383 | |
| 384 #Mapping potential JSRs to hypothetical junction database. | |
| 385 #Allow 4 mismatches at the beginning. | |
| 386 | |
| 387 idx_cmd='%s index %s/%s.junction.fasta'%(bwa,outpath,docstring) | |
| 388 os.system(idx_cmd) | |
| 389 aln_cmd='%s aln -n 4 -R 100 %s/%s.junction.fasta %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sai'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring) | |
| 390 os.system(aln_cmd) | |
| 391 samse_cmd='%s samse -n 1000 -s %s/%s.junction.fasta %s/%s.juncmap.sai %s/%s.pos_junc_unmapped.fastq > %s/%s.juncmap.sam'%(bwa,outpath,docstring,outpath,docstring,outpath,docstring,outpath,docstring) | |
| 392 os.system(samse_cmd) | |
| 393 sam2bam_cmd='%s view -bS %s/%s.juncmap.sam -o %s/%s.juncmap.bam'%(samtools,outpath,docstring,outpath,docstring) | |
| 394 os.system(sam2bam_cmd) | |
| 395 | |
| 396 jsam=pysam.Samfile('%s/%s.juncmap.bam'%(outpath,docstring),'rb') | |
| 397 #get the junction name directory | |
| 398 junctions=jsam.references | |
| 399 junname=dict(zip(range(0,len(junctions)),junctions)) #this is essential for quick speed. | |
| 400 junc_align={} | |
| 401 | |
| 402 #go through the BAM file for meaningful (meeting fusion orientation etc) reads | |
| 403 strd_right_reads={} | |
| 404 rdb={} #collect all junction spanning reads | |
| 405 i=0 | |
| 406 for rd in jsam: | |
| 407 i+=1 | |
| 408 if i%100000==0: | |
| 409 print '%d junction alignments parsed'%i | |
| 410 if rd.is_unmapped: | |
| 411 continue | |
| 412 read,mate_gene,mate_orient=rd.qname.split('_prada_') | |
| 413 junc=junname[rd.tid] | |
| 414 tmp=junc.split('_') | |
| 415 gene1,gene2=[x.split(':')[0] for x in tmp] | |
| 416 if gene1==mate_gene: | |
| 417 if mate_orient=='F': | |
| 418 if rd.is_reverse: | |
| 419 if strd_right_reads.has_key(rd.qname): | |
| 420 strd_right_reads[rd.qname]+=1 #count how many times the read maps | |
| 421 else: | |
| 422 strd_right_reads[rd.qname]=1 | |
| 423 rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones | |
| 424 elif gene2==mate_gene: | |
| 425 if mate_orient == 'R': | |
| 426 if not rd.is_reverse: | |
| 427 if strd_right_reads.has_key(rd.qname): | |
| 428 strd_right_reads[rd.qname]+=1 | |
| 429 else: | |
| 430 strd_right_reads[rd.qname]=1 | |
| 431 rdb[rd.qname]={'read':rd,'gene1':gene1,'gene2':gene2,'junc':junc} #will overwrite, but it is OK since we only look at unique ones | |
| 432 | |
| 433 #find uniquely mapped reads and their gene pairs | |
| 434 junc_map={} #a dictionary from junction to mapping reads | |
| 435 for rdname in strd_right_reads: | |
| 436 if strd_right_reads[rdname] > 1: #remove non-unique junction spanning reads | |
| 437 continue | |
| 438 infodict=rdb[rdname] | |
| 439 pp=infodict['gene1']+'_'+infodict['gene2'] | |
| 440 if junc_map.has_key(pp): | |
| 441 junc_map[pp].add(rdname) | |
| 442 else: | |
| 443 junc_map[pp]=set([rdname]) | |
| 444 | |
| 445 ######################################################################################## | |
| 446 print 'step 6: summarizing fusion evidences @ %s'%time.ctime() | |
| 447 | |
| 448 #Now, time to apply mismatch filter and summarize the results | |
| 449 #Candidate fusions --> guess | |
| 450 #Discordant pairs --> discordant, db1, db2 | |
| 451 #Junction reads --> junc_map, rdb | |
| 452 #Gene info --> genedb | |
| 453 | |
| 454 outfile_s=open('%s/%s.fus.candidates.txt'%(outpath,docstring),'w') | |
| 455 outfile_d=open('%s/%s.fus.evidences.txt'%(outpath,docstring),'w') | |
| 456 | |
| 457 title=['Gene_A','Gene_B','A_chr','B_chr','A_strand','B_strand','Discordant_n','JSR_n','perfectJSR_n','Junc_n','Position_Consist','Junction'] | |
| 458 outfile_s.write('%s\n'%('\t'.join(title))) | |
| 459 | |
| 460 for pp in junc_map: #all pairs with junc reads | |
| 461 gene1,gene2=pp.split('_') | |
| 462 g1obj,g2obj=genedb[gene1],genedb[gene2] | |
| 463 fus_disc=[] #collecting discordant pairs | |
| 464 for rdname in discordant[pp]: | |
| 465 #arrange read1/read2 into F/R so it will be easier for GeneFusion obj to handle | |
| 466 r1,r2=db1[rdname],db2[rdname] | |
| 467 orient=discordant[pp][rdname] | |
| 468 if orient=='F:R': | |
| 469 fus_disc.append((r1,r2)) | |
| 470 elif orient=='R:F': | |
| 471 fus_disc.append((r2,r1)) | |
| 472 fus_jsr=[] | |
| 473 if junc_map.has_key(pp): | |
| 474 for rdname in junc_map[pp]: | |
| 475 r=rdb[rdname]['read'] | |
| 476 junc=rdb[rdname]['junc'] | |
| 477 jsr=gfclass.JSR(r,junc) | |
| 478 fus_jsr.append(jsr) | |
| 479 gf=gfclass.GeneFusion(gene1,gene2,fus_disc,fus_jsr) | |
| 480 gf_new=gf.update(mm=mm) ##apply the mismatch parameter, default is 1 | |
| 481 #output the results | |
| 482 disc_n=str(len(gf_new.discordantpairs)) | |
| 483 junctions=sorted(gf_new.get_junction_freq(),key=operator.itemgetter(1),reverse=True) #sort junc by # of JSRs | |
| 484 junc_n=str(len(junctions)) | |
| 485 junc_str='|'.join([','.join([x[0],str(x[1])]) for x in junctions]) | |
| 486 jsr_n=str(len(gf_new.fusionreads)) | |
| 487 pjsr_n=str(len(gf_new.get_perfect_JSR())) | |
| 488 pos_consist=gf_new.positioncheck() | |
| 489 svec=[gene1,gene2,g1obj.chr,g2obj.chr,g1obj.strand,g2obj.strand,disc_n,jsr_n,pjsr_n,junc_n,pos_consist,junc_str] | |
| 490 outfile_s.write('%s\n'%('\t'.join(svec))) | |
| 491 outfile_d.write('@@\t%s,%s,%s\t%s,%s,%s\n'%(gene1,g1obj.chr,g1obj.strand,gene2,g2obj.chr,g2obj.strand)) | |
| 492 outfile_d.write('\n') | |
| 493 outfile_d.write('>discordant\n') | |
| 494 for rp in gf_new.discordantpairs: | |
| 495 rf,rr=rp | |
| 496 nm1=[x[1] for x in rf.tags if x[0]=='NM'][0] | |
| 497 nm2=[x[1] for x in rr.tags if x[0]=='NM'][0] | |
| 498 outfile_d.write('%s\tF\t%s.%s.mm%d\n'%(rf.qname,gene1,rf.pos+1,nm1)) ##0-based coordinates | |
| 499 outfile_d.write('%s\tR\t%s.%s.mm%d\n'%(rr.qname,gene2,rr.pos+1,nm2)) ##0-based coordinates | |
| 500 outfile_d.write('\n') | |
| 501 outfile_d.write('>spanning\n') | |
| 502 for jsr in gf_new.fusionreads: | |
| 503 r=jsr.read | |
| 504 nm=[x[1] for x in r.tags if x[0]=='NM'][0] | |
| 505 outfile_d.write('%s\t%s.mm%d\n'%(r.qname,jsr.junction,nm)) | |
| 506 outfile_d.write('\n') | |
| 507 outfile_d.write('>junction\n') | |
| 508 for junc_info in junctions: | |
| 509 outfile_d.write('%s\t%d\n'%(junc_info[0],junc_info[1])) | |
| 510 outfile_d.write('\n') | |
| 511 outfile_d.write('>summary\n') | |
| 512 outfile_d.write('Number of Discordant Pairs = %s\n'%disc_n) | |
| 513 outfile_d.write('Number of Fusion Reads = %s\n'%jsr_n) | |
| 514 outfile_d.write('Number of Perfect Fusion Reads = %s\n'%pjsr_n) | |
| 515 outfile_d.write('Number of Distinct Junctions = %s\n'%junc_n) | |
| 516 outfile_d.write('Position Consistency = %s\n'%pos_consist) | |
| 517 outfile_d.write('\n') | |
| 518 | |
| 519 outfile_s.close() | |
| 520 outfile_d.close() | |
| 521 | |
| 522 ######################################################################################## | |
| 523 print 'step 7: generating fusion lists @ %s'%time.ctime() | |
| 524 | |
| 525 #For convenience, filter the lists to candidates with | |
| 526 # 1) at least 2 discordant pairs | |
| 527 # 2) at least 1 perfect JSR | |
| 528 #meanwhile, calculate sequence similarity for each pair | |
| 529 #user may need to manually filter the lists per this measure. | |
| 530 | |
| 531 #The following code is a copy of prada-homology | |
| 532 outfile_o=open('%s/%s.fus.summary.txt'%(outpath,docstring),'w') | |
| 533 ifname='%s/%s.fus.candidates.txt'%(outpath,docstring) | |
| 534 if not os.path.exists(ifname): | |
| 535 sys.exit('ERROR: %s was not found'%ifname) | |
| 536 | |
| 537 blastseq_tmp_dir='%s/blast_tmp/'%outpath | |
| 538 if not os.path.exists(blastseq_tmp_dir): | |
| 539 os.mkdir(blastseq_tmp_dir) | |
| 540 | |
| 541 flists=[] | |
| 542 infile=open(ifname) | |
| 543 iN=0 | |
| 544 for line in open(ifname): | |
| 545 info=line.strip().split('\t') | |
| 546 if iN==0: | |
| 547 iN+=1 #skip title | |
| 548 flists.append(info) | |
| 549 continue | |
| 550 else: | |
| 551 if int(info[6])>=2 and int(info[8])>=1 and info[10] in ['PARTIALLY','YES']: | |
| 552 flists.append(info) | |
| 553 infile.close() | |
| 554 | |
| 555 if len(flists)==1: #if no candidate passes the filters | |
| 556 outfile_o.write('%s\n'%'\t'.join(flists[0])) | |
| 557 outfile_o.close() | |
| 558 print 'step done @ %s'%time.ctime() | |
| 559 sys.exit(0) | |
| 560 | |
| 561 candidates={} | |
| 562 for line in flists[1:]: | |
| 563 geneA,geneB=line[0],line[1] | |
| 564 key='%s_%s'%(geneA,geneB) | |
| 565 candidates[key]='' | |
| 566 | |
| 567 selecttranscript={} | |
| 568 for gene in genedb: | |
| 569 txs=genedb[gene].transcript | |
| 570 stx=txs[0] | |
| 571 initlen=stx.length | |
| 572 for tx in txs: | |
| 573 if tx.length >= initlen: | |
| 574 stx=tx | |
| 575 initlen=stx.length | |
| 576 selecttranscript[gene]=stx.name | |
| 577 | |
| 578 allpartners=set() | |
| 579 for item in candidates: | |
| 580 sset=set(item.split('_')) | |
| 581 allpartners=allpartners.union(sset) | |
| 582 | |
| 583 presenttxs=[] #tx that is present in our annotation | |
| 584 absent=[] #tx that is not in our annotation | |
| 585 for gene in allpartners: | |
| 586 if selecttranscript.has_key(gene): | |
| 587 presenttxs.append(selecttranscript[gene]) | |
| 588 else: | |
| 589 absent.append(gene) | |
| 590 | |
| 591 for seq_record in SeqIO.parse(txseqfile,'fasta'): | |
| 592 sid=seq_record.id | |
| 593 seq=seq_record.seq | |
| 594 if sid in presenttxs: | |
| 595 g=txdb[sid].gene | |
| 596 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w') | |
| 597 SeqIO.write(seq_record,fastafile,'fasta') | |
| 598 fastafile.close() | |
| 599 | |
| 600 for gp in candidates: | |
| 601 geneA,geneB=gp.split('_') | |
| 602 if geneA in absent or geneB in absent: | |
| 603 candidates[gp]=['NA']*4 | |
| 604 else: | |
| 605 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA) | |
| 606 gaobj=SeqIO.parse(gaseq,'fasta').next() | |
| 607 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB) | |
| 608 gbobj=SeqIO.parse(gbseq,'fasta').next() | |
| 609 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq)) | |
| 610 a=privutils.seqblast(gaseq,gbseq,blastn) | |
| 611 if a==None: | |
| 612 candidates[gp]=['NA','NA','100','0'] | |
| 613 else: | |
| 614 candidates[gp]=a | |
| 615 | |
| 616 header=flists[0][:] | |
| 617 header.extend(['Identity','Align_Len','Evalue','BitScore']) | |
| 618 outfile_o.write('%s\n'%('\t'.join(header))) | |
| 619 | |
| 620 for info in flists[1:]: | |
| 621 geneA,geneB=info[0],info[1] | |
| 622 key='%s_%s'%(geneA,geneB) | |
| 623 vv=candidates[key] | |
| 624 row=info[:] | |
| 625 row.extend(vv) | |
| 626 outfile_o.write('%s\n'%('\t'.join(row))) | |
| 627 outfile_o.close() | |
| 628 | |
| 629 ######################################################################################## | |
| 630 print 'step done @ %s'%time.ctime() |
