diff pyPRADA_1.2/prada-guess-if @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/prada-guess-if	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,288 @@
+#!/usr/bin/env python
+
+#GUESS-if is to find abnormal intragenic fusions. 
+#It is an extension of the GUESS-ft method. 
+
+import pysam
+import subprocess
+import os
+import time
+import sys
+import bioclass
+import ioprada
+
+args=sys.argv
+
+#Get all parameters
+help_menu='''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n
+    **Parameters**:
+        -conf        the configure file. see prada-fusion -conf for details
+        -inputbam   the input bam file
+        -mm         number of mismatch allowed
+        -minmapq    mininum mapping quality for reads to be used in fusion finding
+        -junL       length of exons to be used for junctions. see prada-fusion
+        -outdir     output directory
+        -unmap      the numapped reads. useful if user need to run guess-ft multiple times
+'''
+
+if '-h' in args or '-help' in args or len(args)==1:
+    print help_menu
+    sys.exit(0)
+
+#########################################################################
+target=args[1]
+if '-inputbam' not in args:
+    sys.exit('Input BAM needed')
+else:
+    i=args.index('-inputbam')
+    sourcefile=args[i+1]
+if '-outdir' not in args:
+    outdir='./'
+else:
+    i=args.index('-outdir')
+    outdir=args[i+1]
+    if not os.path.exists(outdir):
+        os.mkdir(outdir)
+if '-unmap' not in args:
+    unmapbam='%s/one.end.unmapped.bam'%outdir
+    extract_mask=1
+else:
+    i=args.index('-unmap')
+    unmapbam=args[i+1]
+    extract_mask=0
+
+if '-mm' not in args:
+    mm=1
+else:
+    i=args.index('-mm')
+    mm=int(args[i+1])
+
+#minimum mapping quality for reads as fusion evidences
+if '-minmapq' not in args:
+    minmapq=30
+else:
+    i=args.index('-minmapq')
+    minmapq=int(args[i+1])
+
+if '-junL' not in args:
+    sys.exit('-junL needed')
+else:
+    i=args.index('-junL')
+    junL=int(args[i+1])
+
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+
+if '-conf' in args:
+    i=args.index('-conf')
+    reffile=args[i+1]
+    if os.path.exists(reffile):
+        pass
+    else:
+        for pth in ref_search_path:
+            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+            if os.path.exists(new_reffile):
+                reffile=new_reffile
+                break
+        else:
+            sys.exit('ERROR: ref file %s not found'%reffile)
+else:
+    reffile='%s/conf.txt'%prada_path
+    if not os.path.exists(reffile):
+        sys.exit('ERROR: No default conf.txt found and none specified')
+
+#reference files
+refdict=ioprada.read_conf(reffile)
+ref_anno=refdict['--REF--']['ref_anno']
+ref_map=refdict['--REF--']['ref_map']
+ref_fasta=refdict['--REF--']['ref_fasta']
+featurefile=refdict['--REF--']['feature_file']
+
+samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
+bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
+
+#########################################################################
+print 'GUESS-if start: %s'%time.ctime()
+print 'CMD: %s'%('\t'.join(args))
+
+##get gene position information
+gobj=ioprada.read_feature_genes(featurefile,target)[0]
+if gobj is None:
+    sys.exit('%s not found'%target)
+
+gchr=gobj.strand
+gchr_rev=True if gchr=='-1' else False
+
+#Generate unmapped reads.
+if extract_mask:
+    cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam)
+    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+    while True:
+        if cmdout.poll() is None:
+            print 'Extracting unmapped reads...'
+            time.sleep(120)
+            pass
+        if cmdout.poll()==0:
+            print 'Extracted unmapped reads'
+            break
+        if cmdout.poll() is not None and cmdout.poll() != 0:
+            raise Exception('Error extracting unmapped reads')
+
+
+#Generate junction db
+juncfile='%s.junction.fasta'%target
+cmd='perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,target,target,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile)
+cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+while True:
+    if cmdout.poll() is None:
+        print 'Generating junction db. Waiting...'
+        time.sleep(20)
+        pass
+    if cmdout.poll()==0:
+        print 'Generated Junction DB.'
+        break
+    if cmdout.poll() is not None and cmdout.poll() != 0:
+        raise Exception('Error generated Junction DB.')
+
+#scan BAM file for mapping reads.
+samfile=pysam.Samfile(sourcefile,'rb')
+reads_se=[]
+reads_as=[]
+for alignedread in samfile.fetch(gobj.chr,gobj.start-1,gobj.end):
+    if alignedread.mapq >= minmapq:
+        mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
+        if mmf <= mm:
+            if alignedread.is_reverse==gchr_rev:   #sense
+                reads_se.append(alignedread)
+            else:                                  #anti-sense
+                reads_as.append(alignedread)
+samfile.close()
+
+seonly,asonly=[],[]
+for rd in reads_se:
+    if rd.mate_is_unmapped:
+        seonly.append(rd)
+for rd in reads_as:
+    if rd.mate_is_unmapped:
+        asonly.append(rd) 
+seunmap=[x.qname for x in seonly]
+asunmap=[x.qname for x in asonly]
+
+#find read sequences
+print 'Extracting unmapped read sequences.'
+print 'mate unmapped reads for sense strand:',len(seonly)
+print 'mate unmapped reads for antisense strand:',len(asonly)
+samfile=pysam.Samfile(unmapbam,'rb')
+resfq_a=open('%s/%s_se_unmap.fq'%(outdir,target),'w')
+resfq_b=open('%s/%s_as_unmap.fq'%(outdir,target),'w')
+for item in samfile:
+    if item.qname in seunmap:
+        resfq_a.write('@%s\n'%item.qname)
+        resfq_a.write('%s\n'%item.seq)
+        resfq_a.write('+\n')
+        resfq_a.write('%s\n'%item.qual)
+    if item.qname in asunmap:
+        resfq_b.write('@%s\n'%item.qname)
+        resfq_b.write('%s\n'%item.seq)
+        resfq_b.write('+\n')
+        resfq_b.write('%s\n'%item.qual)
+resfq_a.close()
+resfq_b.close()
+samfile.close()
+
+##indexing junction db
+print 'Aligning reads to junction db'
+cmd='%s index %s/%s'%(bwa,outdir,juncfile)
+cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+while True:
+    if cmdout.poll() is None:
+        time.sleep(3)
+        pass
+    if cmdout.poll()==0:
+        print 'Junction DB indexed.'
+        break
+    if cmdout.poll() is not None and cmdout.poll() != 0:
+        raise Exception('Error building junction db index')
+
+taga='%s_se'%target
+tagb='%s_as'%target
+for rs in [taga,tagb]:
+    cmd='%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs)
+    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+    while True:
+        if cmdout.poll() is None:
+            time.sleep(5)
+            pass
+        if cmdout.poll()==0:
+            print 'Aligned unmapped reads group %s'%rs
+            break
+        if cmdout.poll() is not None and cmdout.poll() != 0:
+            raise Exception('Error aligning unmapped reads for group %s'%rs)
+
+    cmd='%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs)
+    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+    while True:
+        if cmdout.poll() is None:
+            time.sleep(2)
+            pass
+        if cmdout.poll()==0:
+            print 'Converting to sam for group %s'%rs
+            break
+        if cmdout.poll() is not None and cmdout.poll() != 0:
+            raise Exception('Error converting to sam for group %s'%rs)
+
+qualrd_a=[]
+junc_a=[]
+samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,taga),'r')
+for rd in samfile:
+    if not rd.is_unmapped and rd.is_reverse:
+        qualrd_a.append(rd)
+        junc_a.append(samfile.getrname(rd.tid))
+samfile.close()
+qualrd_b=[]
+junc_b=[]
+samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,tagb),'r')
+for rd in samfile:
+    if not rd.is_unmapped and not rd.is_reverse:
+        qualrd_b.append(rd)
+        junc_b.append(samfile.getrname(rd.tid))
+samfile.close()
+
+junc_span=[]
+junc_span.extend(qualrd_a)
+junc_span.extend(qualrd_b)
+
+junc_name=[]
+junc_name.extend(junc_a)
+junc_name.extend(junc_b)
+
+#Generate a summary report
+sumfile=open('%s/%s.GUESS-IF.summary.txt'%(outdir,target),'w')
+sumfile.write('%s\n'%(target))
+sumfile.write('\n')
+sumfile.write('>spanning\n')
+for i in range(len(junc_span)):
+    rd=junc_span[i]
+    jname=junc_name[i]
+    mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0]
+    ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j)
+    sumfile.write('%s\n'%ss)
+
+sumfile.write('\n')
+sumfile.write('>junction\n')
+juncol=[]
+for item in set(junc_name):
+    nn=junc_name.count(item)
+    juncol.append([item,nn])
+juncol=sorted(juncol,key=lambda x:x[1],reverse=True)
+for item in juncol:
+    sumfile.write('%s\t%s\n'%(item[0],item[1]))
+
+sumfile.write('\n')
+sumfile.write('>summary\n')
+sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span))
+sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name)))
+
+sumfile.close()
+print 'Done: %s'%time.ctime()
+