view pyPRADA_1.2/prada-guess-if @ 1:03815b87eb65 draft

Uploaded
author siyuan
date Fri, 21 Feb 2014 18:08:37 -0500
parents acc2ca1a3ba4
children
line wrap: on
line source

#!/usr/bin/env python

#GUESS-if is to find abnormal intragenic fusions. 
#It is an extension of the GUESS-ft method. 

import pysam
import subprocess
import os
import time
import sys
import bioclass
import ioprada

args=sys.argv

#Get all parameters
help_menu='''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n
    **Parameters**:
        -conf        the configure file. see prada-fusion -conf for details
        -inputbam   the input bam file
        -mm         number of mismatch allowed
        -minmapq    mininum mapping quality for reads to be used in fusion finding
        -junL       length of exons to be used for junctions. see prada-fusion
        -outdir     output directory
        -unmap      the numapped reads. useful if user need to run guess-ft multiple times
'''

if '-h' in args or '-help' in args or len(args)==1:
    print help_menu
    sys.exit(0)

#########################################################################
target=args[1]
if '-inputbam' not in args:
    sys.exit('Input BAM needed')
else:
    i=args.index('-inputbam')
    sourcefile=args[i+1]
if '-outdir' not in args:
    outdir='./'
else:
    i=args.index('-outdir')
    outdir=args[i+1]
    if not os.path.exists(outdir):
        os.mkdir(outdir)
if '-unmap' not in args:
    unmapbam='%s/one.end.unmapped.bam'%outdir
    extract_mask=1
else:
    i=args.index('-unmap')
    unmapbam=args[i+1]
    extract_mask=0

if '-mm' not in args:
    mm=1
else:
    i=args.index('-mm')
    mm=int(args[i+1])

#minimum mapping quality for reads as fusion evidences
if '-minmapq' not in args:
    minmapq=30
else:
    i=args.index('-minmapq')
    minmapq=int(args[i+1])

if '-junL' not in args:
    sys.exit('-junL needed')
else:
    i=args.index('-junL')
    junL=int(args[i+1])

prada_path=os.path.dirname(os.path.abspath(__file__))   ####
ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line

if '-conf' in args:
    i=args.index('-conf')
    reffile=args[i+1]
    if os.path.exists(reffile):
        pass
    else:
        for pth in ref_search_path:
            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
            if os.path.exists(new_reffile):
                reffile=new_reffile
                break
        else:
            sys.exit('ERROR: ref file %s not found'%reffile)
else:
    reffile='%s/conf.txt'%prada_path
    if not os.path.exists(reffile):
        sys.exit('ERROR: No default conf.txt found and none specified')

#reference files
refdict=ioprada.read_conf(reffile)
ref_anno=refdict['--REF--']['ref_anno']
ref_map=refdict['--REF--']['ref_map']
ref_fasta=refdict['--REF--']['ref_fasta']
featurefile=refdict['--REF--']['feature_file']

samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path

#########################################################################
print 'GUESS-if start: %s'%time.ctime()
print 'CMD: %s'%('\t'.join(args))

##get gene position information
gobj=ioprada.read_feature_genes(featurefile,target)[0]
if gobj is None:
    sys.exit('%s not found'%target)

gchr=gobj.strand
gchr_rev=True if gchr=='-1' else False

#Generate unmapped reads.
if extract_mask:
    cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam)
    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
    while True:
        if cmdout.poll() is None:
            print 'Extracting unmapped reads...'
            time.sleep(120)
            pass
        if cmdout.poll()==0:
            print 'Extracted unmapped reads'
            break
        if cmdout.poll() is not None and cmdout.poll() != 0:
            raise Exception('Error extracting unmapped reads')


#Generate junction db
juncfile='%s.junction.fasta'%target
cmd='perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,target,target,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile)
cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
while True:
    if cmdout.poll() is None:
        print 'Generating junction db. Waiting...'
        time.sleep(20)
        pass
    if cmdout.poll()==0:
        print 'Generated Junction DB.'
        break
    if cmdout.poll() is not None and cmdout.poll() != 0:
        raise Exception('Error generated Junction DB.')

#scan BAM file for mapping reads.
samfile=pysam.Samfile(sourcefile,'rb')
reads_se=[]
reads_as=[]
for alignedread in samfile.fetch(gobj.chr,gobj.start-1,gobj.end):
    if alignedread.mapq >= minmapq:
        mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0]
        if mmf <= mm:
            if alignedread.is_reverse==gchr_rev:   #sense
                reads_se.append(alignedread)
            else:                                  #anti-sense
                reads_as.append(alignedread)
samfile.close()

seonly,asonly=[],[]
for rd in reads_se:
    if rd.mate_is_unmapped:
        seonly.append(rd)
for rd in reads_as:
    if rd.mate_is_unmapped:
        asonly.append(rd) 
seunmap=[x.qname for x in seonly]
asunmap=[x.qname for x in asonly]

#find read sequences
print 'Extracting unmapped read sequences.'
print 'mate unmapped reads for sense strand:',len(seonly)
print 'mate unmapped reads for antisense strand:',len(asonly)
samfile=pysam.Samfile(unmapbam,'rb')
resfq_a=open('%s/%s_se_unmap.fq'%(outdir,target),'w')
resfq_b=open('%s/%s_as_unmap.fq'%(outdir,target),'w')
for item in samfile:
    if item.qname in seunmap:
        resfq_a.write('@%s\n'%item.qname)
        resfq_a.write('%s\n'%item.seq)
        resfq_a.write('+\n')
        resfq_a.write('%s\n'%item.qual)
    if item.qname in asunmap:
        resfq_b.write('@%s\n'%item.qname)
        resfq_b.write('%s\n'%item.seq)
        resfq_b.write('+\n')
        resfq_b.write('%s\n'%item.qual)
resfq_a.close()
resfq_b.close()
samfile.close()

##indexing junction db
print 'Aligning reads to junction db'
cmd='%s index %s/%s'%(bwa,outdir,juncfile)
cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
while True:
    if cmdout.poll() is None:
        time.sleep(3)
        pass
    if cmdout.poll()==0:
        print 'Junction DB indexed.'
        break
    if cmdout.poll() is not None and cmdout.poll() != 0:
        raise Exception('Error building junction db index')

taga='%s_se'%target
tagb='%s_as'%target
for rs in [taga,tagb]:
    cmd='%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs)
    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
    while True:
        if cmdout.poll() is None:
            time.sleep(5)
            pass
        if cmdout.poll()==0:
            print 'Aligned unmapped reads group %s'%rs
            break
        if cmdout.poll() is not None and cmdout.poll() != 0:
            raise Exception('Error aligning unmapped reads for group %s'%rs)

    cmd='%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs)
    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
    while True:
        if cmdout.poll() is None:
            time.sleep(2)
            pass
        if cmdout.poll()==0:
            print 'Converting to sam for group %s'%rs
            break
        if cmdout.poll() is not None and cmdout.poll() != 0:
            raise Exception('Error converting to sam for group %s'%rs)

qualrd_a=[]
junc_a=[]
samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,taga),'r')
for rd in samfile:
    if not rd.is_unmapped and rd.is_reverse:
        qualrd_a.append(rd)
        junc_a.append(samfile.getrname(rd.tid))
samfile.close()
qualrd_b=[]
junc_b=[]
samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,tagb),'r')
for rd in samfile:
    if not rd.is_unmapped and not rd.is_reverse:
        qualrd_b.append(rd)
        junc_b.append(samfile.getrname(rd.tid))
samfile.close()

junc_span=[]
junc_span.extend(qualrd_a)
junc_span.extend(qualrd_b)

junc_name=[]
junc_name.extend(junc_a)
junc_name.extend(junc_b)

#Generate a summary report
sumfile=open('%s/%s.GUESS-IF.summary.txt'%(outdir,target),'w')
sumfile.write('%s\n'%(target))
sumfile.write('\n')
sumfile.write('>spanning\n')
for i in range(len(junc_span)):
    rd=junc_span[i]
    jname=junc_name[i]
    mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0]
    ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j)
    sumfile.write('%s\n'%ss)

sumfile.write('\n')
sumfile.write('>junction\n')
juncol=[]
for item in set(junc_name):
    nn=junc_name.count(item)
    juncol.append([item,nn])
juncol=sorted(juncol,key=lambda x:x[1],reverse=True)
for item in juncol:
    sumfile.write('%s\t%s\n'%(item[0],item[1]))

sumfile.write('\n')
sumfile.write('>summary\n')
sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span))
sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name)))

sumfile.close()
print 'Done: %s'%time.ctime()