diff pyPRADA_1.2/prada-guess-ft @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/prada-guess-ft	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,321 @@
+#!/usr/bin/env python
+
+#GUESS-ft:General UsEr defined Supervised Search for fusion transcript.
+#This program is to perform targeted search of gene fusions from RNAseq data.
+#It uses samtools, pysam package. It requires a bam file and other reference files.
+#Note that it is a less accurate tool than prada-fusion.
+
+import pysam
+import subprocess
+import os
+import time
+import sys
+import bioclass
+import ioprada
+
+args=sys.argv
+
+#Get all parameters
+help_menu='''\nUsage: prada-guess-ft GeneA GeneB -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir X -unmap X\n
+    **Parameters**:
+        -conf		the configure file. see prada-fusion -conf for details
+        -inputbam	the input bam file
+        -mm		number of mismatch allowed
+        -minmapq	mininum mapping quality for reads to be used in fusion finding
+        -junL		length of exons to be used for junctions. see prada-fusion
+        -outdir		output directory
+        -unmap		the numapped reads. useful if user need to run guess-ft multiple times
+'''
+
+if '-h' in args or '-help' in args or len(args)==1:
+    print help_menu 
+    sys.exit(0)
+
+##################################################################################
+Gene_A=args[1]
+Gene_B=args[2]
+if '-inputbam' not in args:
+    sys.exit('Input BAM needed')
+else:
+    i=args.index('-inputbam')
+    sourcefile=args[i+1]
+if '-outdir' not in args:
+    outdir='./'
+else:
+    i=args.index('-outdir')
+    outdir=args[i+1]
+    if not os.path.exists(outdir):
+	    os.mkdir(outdir)
+if '-unmap' not in args:    #get unmapped.bam yourself
+	unmapbam='%s/one.end.unmapped.bam'%outdir
+	extract_mask=1
+else:
+	i=args.index('-unmap')
+	unmapbam=args[i+1]
+	extract_mask=0
+if '-mm' not in args:
+    mm=1
+else:
+    i=args.index('-mm')
+    mm=int(args[i+1])
+#minimum mapping quality for reads as fusion evidences
+if '-minmapq' not in args:
+    minmapq=30
+else:
+    i=args.index('-minmapq')
+    minmapq=int(args[i+1])
+
+if '-junL' not in args:
+    sys.exit('-junL needed')
+else:
+    i=args.index('-junL')
+    junL=int(args[i+1])
+
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+
+if '-conf' in args:
+    i=args.index('-conf')
+    reffile=args[i+1]
+    if os.path.exists(reffile):
+        pass
+    else:
+        for pth in ref_search_path:
+            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+            if os.path.exists(new_reffile):
+                reffile=new_reffile
+                break
+        else:
+            sys.exit('ERROR: ref file %s not found'%reffile)
+else:
+    reffile='%s/conf.txt'%prada_path
+    if not os.path.exists(reffile):
+        sys.exit('ERROR: No default conf.txt found and none specified')
+
+#reference files
+refdict=ioprada.read_conf(reffile)
+ref_anno=refdict['--REF--']['ref_anno']
+ref_map=refdict['--REF--']['ref_map']
+ref_fasta=refdict['--REF--']['ref_fasta']
+featurefile=refdict['--REF--']['feature_file']
+
+samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
+bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
+
+print 'GUESS start: %s'%time.ctime()
+print 'CMD: %s'%('\t'.join(args))
+
+##################################################################################
+##get gene position information
+
+gA,gB=ioprada.read_feature_genes(featurefile,Gene_A,Gene_B)
+if gA is None:
+    sys.exit('%s not found'%Gene_A)
+if gB is None: 
+    sys.exit('%s not found'%Gene_B)
+
+#Generate unmapped reads.
+if extract_mask:
+	cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam)
+	cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+	while True:
+		if cmdout.poll() is None:
+			print 'Extracting unmapped reads...'
+			time.sleep(120)
+			pass
+		if cmdout.poll()==0:
+			print 'Extracted unmapped reads'
+			break
+		if cmdout.poll() is not None and cmdout.poll() != 0:
+			raise Exception('Error extracting unmapped reads')
+
+#Generate junction db
+juncfile='%s_%s.junction.fasta'%(Gene_A,Gene_B)
+cmd='perl %s/make_exon_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,Gene_A,Gene_B,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile)
+cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+while True:
+	if cmdout.poll() is None:
+		print 'Generating junction db. Waiting...'
+		time.sleep(20)
+		pass
+	if cmdout.poll()==0:
+		print 'Generated Junction DB.'
+		break
+	if cmdout.poll() is not None and cmdout.poll() != 0:
+		raise Exception('Error generated Junction DB.')
+
+#read into the sam file, get gene-strand information
+print 'Finding discordant read pairs.'
+samfile = pysam.Samfile(sourcefile, "rb" )
+
+#mapping quality based read collection
+#strand information considered
+#mismatch filter considered
+a_reads=[]
+b_reads=[]
+for alignedread in samfile.fetch(gA.chr,gA.start-1,gA.end):
+	if alignedread.mapq >= minmapq:
+		readastrd='-1' if alignedread.is_reverse else '1'
+		mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
+		if readastrd==gA.strand and mmf <= mm:
+			a_reads.append(alignedread)
+
+for alignedread in samfile.fetch(gB.chr,gB.start-1,gB.end):
+	if alignedread.mapq >= minmapq:
+		readbstrd='-1' if alignedread.is_reverse else '1'
+		mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
+		if readbstrd != gB.strand and mmf <= mm:
+			b_reads.append(alignedread)
+samfile.close()
+
+ar_ids=[x.qname for x in a_reads]
+br_ids=[x.qname for x in b_reads]
+disc=list(set(ar_ids).intersection(set(br_ids)))   #discordant pairs
+
+#detect read pair that one end map to one partner, yet the other does not align
+print 'Determining potential junction spanning reads.'
+rpaonly=[]    #reads that only map to gene A -- going to map the other end to junctions
+for rd in a_reads:
+	if rd.mate_is_unmapped:
+		rpaonly.append(rd)
+rpbonly=[]    #reads that only map to gene B -- going to map the other end to junctions
+for rd in b_reads:
+    if rd.mate_is_unmapped:
+        rpbonly.append(rd)
+rpaonly_names=[x.qname for x in rpaonly]
+rpbonly_names=[x.qname for x in rpbonly]
+
+#find read sequences
+print 'Extracting unmapped read sequences.'
+print 'mate unmapped read for gene A:',len(rpaonly)
+print 'mate unmapped read for gene B:',len(rpbonly) 
+samfile=pysam.Samfile(unmapbam,'rb')
+taga='%s-%s_a'%(Gene_A,Gene_B)
+tagb='%s-%s_b'%(Gene_A,Gene_B)
+resfq_a=open('%s/unmapreads_%s.fq'%(outdir,taga),'w')
+resfq_b=open('%s/unmapreads_%s.fq'%(outdir,tagb),'w')
+for item in samfile:
+	if item.qname in rpaonly_names:
+		resfq_a.write('@%s\n'%item.qname)
+		resfq_a.write('%s\n'%item.seq)
+		resfq_a.write('+\n')
+		resfq_a.write('%s\n'%item.qual)
+	if item.qname in rpbonly_names:
+		resfq_b.write('@%s\n'%item.qname)
+		resfq_b.write('%s\n'%item.seq)
+		resfq_b.write('+\n')
+		resfq_b.write('%s\n'%item.qual)
+resfq_a.close()
+resfq_b.close()
+samfile.close()
+
+##indexing junction db
+print 'Aligning reads to junction db'
+cmd='%s index %s/%s'%(bwa,outdir,juncfile)
+cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+while True:
+	if cmdout.poll() is None:
+		time.sleep(3)
+		pass
+	if cmdout.poll()==0:
+		print 'Junction DB indexed.'
+		break
+	if cmdout.poll() is not None and cmdout.poll() != 0:
+		raise Exception('Error building junction db index')
+
+##align the unmapped reads to junction database
+for rs in [taga,tagb]:
+	cmd='%s aln -n %d -R 100 %s/%s %s/unmapreads_%s.fq > %s/%s.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs)
+	cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+	while True:
+		if cmdout.poll() is None:
+			time.sleep(5)
+			pass
+		if cmdout.poll()==0:
+			print 'Aligned unmapped reads group %s'%rs
+			break
+		if cmdout.poll() is not None and cmdout.poll() != 0:
+			raise Exception('Error aligning unmapped reads for group %s'%rs)
+
+	cmd='%s samse -n 1000 %s/%s %s/%s.sai %s/unmapreads_%s.fq > %s/%s.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs)
+	cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+	while True:
+		if cmdout.poll() is None:
+			time.sleep(2)
+			pass
+		if cmdout.poll()==0:
+			print 'Converting to sam for group %s'%rs
+			break
+		if cmdout.poll() is not None and cmdout.poll() != 0:
+			raise Exception('Error converting to sam for group %s'%rs)
+
+#parse results
+qualrd_a=[]
+junc_a=[]
+samfile=pysam.Samfile('%s/%s.sam'%(outdir,taga),'r')
+for rd in samfile:
+	if not rd.is_unmapped and rd.is_reverse:
+		qualrd_a.append(rd)
+		junc_a.append(samfile.getrname(rd.tid))
+samfile.close()
+qualrd_b=[]
+junc_b=[]
+samfile=pysam.Samfile('%s/%s.sam'%(outdir,tagb),'r')
+for rd in samfile:
+    if not rd.is_unmapped and not rd.is_reverse:
+		qualrd_b.append(rd)
+		junc_b.append(samfile.getrname(rd.tid))
+samfile.close()
+
+junc_span=[]
+junc_span.extend(qualrd_a)
+junc_span.extend(qualrd_b)
+
+junc_name=[]
+junc_name.extend(junc_a)
+junc_name.extend(junc_b)
+
+#Generate a summary report
+sumfile=open('%s/%s_%s.GUESS.summary.txt'%(outdir,Gene_A,Gene_B),'w')
+sumfile.write('%s\t%s\n'%(Gene_A,Gene_B))
+sumfile.write('\n')
+sumfile.write('>discordant\n')
+for rdname in disc:
+	ia=ar_ids.index(rdname)
+	ib=br_ids.index(rdname)
+	reada=a_reads[ia]
+	readb=b_reads[ib]
+	mm_a=[x[1] for x in reada.tags if x[0]=='NM'][0]
+	mm_b=[x[1] for x in readb.tags if x[0]=='NM'][0]
+	ss1='%s\tF\t%s.%d.mm%d'%(reada.qname,Gene_A,reada.pos+1,mm_a)    #pysam use 0-based coordinates
+	ss2='%s\tR\t%s.%d.mm%d'%(readb.qname,Gene_B,readb.pos+1,mm_b)	
+	sumfile.write('%s\n'%ss1)
+	sumfile.write('%s\n'%ss2)
+
+sumfile.write('\n')
+sumfile.write('>spanning\n')
+for i in range(len(junc_span)):
+	rd=junc_span[i]
+	jname=junc_name[i]
+	mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0]
+	ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j)
+	sumfile.write('%s\n'%ss)
+
+sumfile.write('\n')
+sumfile.write('>junction\n')
+juncol=[]
+for item in set(junc_name):
+	nn=junc_name.count(item)
+	juncol.append([item,nn])
+juncol=sorted(juncol,key=lambda x:x[1],reverse=True)
+for item in juncol:
+	sumfile.write('%s\t%s\n'%(item[0],item[1]))
+
+sumfile.write('\n')
+sumfile.write('>summary\n')
+sumfile.write('Number of Discordant Pairs = %d\n'%len(disc))
+sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span))
+sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name)))
+
+sumfile.close()
+print 'Done: %s'%time.ctime()