Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/prada-guess-ft @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/prada-guess-ft Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,321 @@ +#!/usr/bin/env python + +#GUESS-ft:General UsEr defined Supervised Search for fusion transcript. +#This program is to perform targeted search of gene fusions from RNAseq data. +#It uses samtools, pysam package. It requires a bam file and other reference files. +#Note that it is a less accurate tool than prada-fusion. + +import pysam +import subprocess +import os +import time +import sys +import bioclass +import ioprada + +args=sys.argv + +#Get all parameters +help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n + **Parameters**: + -conf the configure file. see prada-fusion -conf for details + -inputbam the input bam file + -mm number of mismatch allowed + -minmapq mininum mapping quality for reads to be used in fusion finding + -junL length of exons to be used for junctions. see prada-fusion + -outdir output directory + -unmap the numapped reads. useful if user need to run guess-ft multiple times +''' + +if '-h' in args or '-help' in args or len(args)==1: + print help_menu + sys.exit(0) + +################################################################################## +Gene_A=args[1] +Gene_B=args[2] +if '-inputbam' not in args: + sys.exit('Input BAM needed') +else: + i=args.index('-inputbam') + sourcefile=args[i+1] +if '-outdir' not in args: + outdir='./' +else: + i=args.index('-outdir') + outdir=args[i+1] + if not os.path.exists(outdir): + os.mkdir(outdir) +if '-unmap' not in args: #get unmapped.bam yourself + unmapbam='%s/one.end.unmapped.bam'%outdir + extract_mask=1 +else: + i=args.index('-unmap') + unmapbam=args[i+1] + extract_mask=0 +if '-mm' not in args: + mm=1 +else: + i=args.index('-mm') + mm=int(args[i+1]) +#minimum mapping quality for reads as fusion evidences +if '-minmapq' not in args: + minmapq=30 +else: + i=args.index('-minmapq') + minmapq=int(args[i+1]) + +if '-junL' not in args: + sys.exit('-junL needed') +else: + i=args.index('-junL') + junL=int(args[i+1]) + +prada_path=os.path.dirname(os.path.abspath(__file__)) #### +ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line + +if '-conf' in args: + i=args.index('-conf') + reffile=args[i+1] + if os.path.exists(reffile): + pass + else: + for pth in ref_search_path: + new_reffile='%s/%s'%(pth, os.path.basename(reffile)) + if os.path.exists(new_reffile): + reffile=new_reffile + break + else: + sys.exit('ERROR: ref file %s not found'%reffile) +else: + reffile='%s/conf.txt'%prada_path + if not os.path.exists(reffile): + sys.exit('ERROR: No default conf.txt found and none specified') + +#reference files +refdict=ioprada.read_conf(reffile) +ref_anno=refdict['--REF--']['ref_anno'] +ref_map=refdict['--REF--']['ref_map'] +ref_fasta=refdict['--REF--']['ref_fasta'] +featurefile=refdict['--REF--']['feature_file'] + +samtools='%s/tools/samtools-0.1.16/samtools'%prada_path +bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path + +print 'GUESS start: %s'%time.ctime() +print 'CMD: %s'%('\t'.join(args)) + +################################################################################## +##get gene position information + +gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B) +if gA is None: + sys.exit('%s not found'%Gene_A) +if gB is None: + sys.exit('%s not found'%Gene_B) + +#Generate unmapped reads. +if extract_mask: + cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + while True: + if cmdout.poll() is None: + print 'Extracting unmapped reads...' + time.sleep(120) + pass + if cmdout.poll()==0: + print 'Extracted unmapped reads' + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error extracting unmapped reads') + +#Generate junction db +juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B) +cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) +cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) +while True: + if cmdout.poll() is None: + print 'Generating junction db. Waiting...' + time.sleep(20) + pass + if cmdout.poll()==0: + print 'Generated Junction DB.' + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error generated Junction DB.') + +#read into the sam file, get gene-strand information +print 'Finding discordant read pairs.' +samfile = pysam.Samfile(sourcefile, "rb" ) + +#mapping quality based read collection +#strand information considered +#mismatch filter considered +a_reads=[] +b_reads=[] +for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end): + if alignedread.mapq >= minmapq: + readastrd='-1' if alignedread.is_reverse else '1' + mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] + if readastrd==gA.strand and mmf <= mm: + a_reads.append(alignedread) + +for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end): + if alignedread.mapq >= minmapq: + readbstrd='-1' if alignedread.is_reverse else '1' + mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] + if readbstrd != gB.strand and mmf <= mm: + b_reads.append(alignedread) +samfile.close() + +ar_ids=[x.qname for x in a_reads] +br_ids=[x.qname for x in b_reads] +disc=list(set(ar_ids).intersection(set(br_ids))) #discordant pairs + +#detect read pair that one end map to one partner, yet the other does not align +print 'Determining potential junction spanning reads.' +rpaonly=[] #reads that only map to gene A -- going to map the other end to junctions +for rd in a_reads: + if rd.mate_is_unmapped: + rpaonly.append(rd) +rpbonly=[] #reads that only map to gene B -- going to map the other end to junctions +for rd in b_reads: + if rd.mate_is_unmapped: + rpbonly.append(rd) +rpaonly_names=[x.qname for x in rpaonly] +rpbonly_names=[x.qname for x in rpbonly] + +#find read sequences +print 'Extracting unmapped read sequences.' +print 'mate unmapped read for gene A:',len(rpaonly) +print 'mate unmapped read for gene B:',len(rpbonly) +samfile=pysam.Samfile(unmapbam,'rb') +taga='%s-%s_a'%(Gene_A,Gene_B) +tagb='%s-%s_b'%(Gene_A,Gene_B) +resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w') +resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w') +for item in samfile: + if item.qname in rpaonly_names: + resfq_a.write('@%s\n'%item.qname) + resfq_a.write('%s\n'%item.seq) + resfq_a.write('+\n') + resfq_a.write('%s\n'%item.qual) + if item.qname in rpbonly_names: + resfq_b.write('@%s\n'%item.qname) + resfq_b.write('%s\n'%item.seq) + resfq_b.write('+\n') + resfq_b.write('%s\n'%item.qual) +resfq_a.close() +resfq_b.close() +samfile.close() + +##indexing junction db +print 'Aligning reads to junction db' +cmd='%s index %s/%s'%(bwa,outdir,juncfile) +cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) +while True: + if cmdout.poll() is None: + time.sleep(3) + pass + if cmdout.poll()==0: + print 'Junction DB indexed.' + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error building junction db index') + +##align the unmapped reads to junction database +for rs in [taga,tagb]: + cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + while True: + if cmdout.poll() is None: + time.sleep(5) + pass + if cmdout.poll()==0: + print 'Aligned unmapped reads group %s'%rs + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error aligning unmapped reads for group %s'%rs) + + cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + while True: + if cmdout.poll() is None: + time.sleep(2) + pass + if cmdout.poll()==0: + print 'Converting to sam for group %s'%rs + break + if cmdout.poll() is not None and cmdout.poll() != 0: + raise Exception('Error converting to sam for group %s'%rs) + +#parse results +qualrd_a=[] +junc_a=[] +samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r') +for rd in samfile: + if not rd.is_unmapped and rd.is_reverse: + qualrd_a.append(rd) + junc_a.append(samfile.getrname(rd.tid)) +samfile.close() +qualrd_b=[] +junc_b=[] +samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r') +for rd in samfile: + if not rd.is_unmapped and not rd.is_reverse: + qualrd_b.append(rd) + junc_b.append(samfile.getrname(rd.tid)) +samfile.close() + +junc_span=[] +junc_span.extend(qualrd_a) +junc_span.extend(qualrd_b) + +junc_name=[] +junc_name.extend(junc_a) +junc_name.extend(junc_b) + +#Generate a summary report +sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w') +sumfile.write('%s\t%s\n'%(Gene_A,Gene_B)) +sumfile.write('\n') +sumfile.write('>discordant\n') +for rdname in disc: + ia=ar_ids.index(rdname) + ib=br_ids.index(rdname) + reada=a_reads[ia] + readb=b_reads[ib] + mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0] + mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0] + ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a) #pysam use 0-based coordinates + ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b) + sumfile.write('%s\n'%ss1) + sumfile.write('%s\n'%ss2) + +sumfile.write('\n') +sumfile.write('>spanning\n') +for i in range(len(junc_span)): + rd=junc_span[i] + jname=junc_name[i] + mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] + ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) + sumfile.write('%s\n'%ss) + +sumfile.write('\n') +sumfile.write('>junction\n') +juncol=[] +for item in set(junc_name): + nn=junc_name.count(item) + juncol.append([item,nn]) +juncol=sorted(juncol,key=lambda x:x[1],reverse=True) +for item in juncol: + sumfile.write('%s\t%s\n'%(item[0],item[1])) + +sumfile.write('\n') +sumfile.write('>summary\n') +sumfile.write('Number of Discordant Pairs = %d\n'%len(disc)) +sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) +sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) + +sumfile.close() +print 'Done: %s'%time.ctime()