view pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python

#This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes.
#In practice, it finds longest transcripts and uses blastn to compare the sequences. 
#It is used to filter out fusion false positives like family genes etc. 
#A temp folder is generated to save transcript fasta files. 

import os
import re
import sys
import time
import subprocess
import bioclass
from Bio import SeqIO
import ioprada

######################################################################################
help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX
**parameters**
    -conf	see prada-fusion -conf for details
    -i		a tab/space delimited two-column file --- gene1 gene2
    -o		output file
    -tmpdir	tmporary directory that saves fasta files
'''

args=sys.argv

if '-h' in args or '-help' in args or len(args)==1:
    print help_menu
    sys.exit(0)

if '-i' not in sys.argv:
    sys.exit('Input file needed')
else:
    i=sys.argv.index('-i')
    datafilename=sys.argv[i+1]
if '-o' not in sys.argv:
    sys.exit('Output file needed')
else:
    i=sys.argv.index('-o')
    outfilename=sys.argv[i+1]
if '-tmpdir' not in sys.argv:
    sys.exit('TMP dir needed')
else:
    i=sys.argv.index('-tmpdir')
    blastseq_tmp_dir=sys.argv[i+1]
if os.path.exists(blastseq_tmp_dir):
    print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir
else:
    cmd='mkdir %s'%blastseq_tmp_dir
    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)

prada_path=os.path.dirname(os.path.abspath(__file__))   ####
ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line

if '-conf' in args:
    i=args.index('-conf')
    reffile=args[i+1]
    if os.path.exists(reffile):
        pass
    else:
        for pth in ref_search_path:
            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
            if os.path.exists(new_reffile):
                reffile=new_reffile
                break
        else:
            sys.exit('ERROR: conf file %s not found'%reffile)
else:
    reffile='%s/conf.txt'%prada_path
    if not os.path.exists(reffile):
        sys.exit('ERROR: No default conf.txt found and none specified')

#Now print all input parameters
print 'CMD: %s'%('\t'.join(args))

#reference files
refdict=ioprada.read_conf(reffile)
featurefile=refdict['--REF--']['feature_file']
refdb=refdict['--REF--']['tx_seq_file']

blastn='%s/tools/blastn'%prada_path

#################Here We Go#############################################
##Read information from raw file
print 'Collecting gene/Transcript info ...'
#call functions in ioprada module //
txdb,genedb=ioprada.read_feature(featurefile,verbose=False)

##################################################################
##Get all candidate gene pairs
infile=open(datafilename)
candidates={}
for line in infile:
    if line.strip():
        info=line.split()
        geneA,geneB=info[0],info[1]
        key='%s_%s'%(geneA,geneB)
        candidates[key]=''
infile.close()
print '%d unique gene pairs collected'%len(candidates)

##Select the transcript for each gene
selecttranscript={}
for gene in genedb:
    txs=genedb[gene].transcript
    stx=txs[0]
    initlen=stx.length
    for tx in txs:
        if tx.length >= initlen:
            stx=tx
            initlen=stx.length
    selecttranscript[gene]=stx.name
print 'Selected longest transcript for each gene.'

##Get all needed sequences
allpartners=set()
for item in candidates:
    sset=set(item.split('_'))
    allpartners=allpartners.union(sset)

####################################################################
def seqblast(seqA,seqB):
    '''seqA,seqB:fasta files'''
    global blastn
    cmdstr='%s -task=blastn  -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
    cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
    result=cmdout.stdout.read()
    best_align=result.split('\n')[0]
    if best_align=='':
        return None
    else:
        info=best_align.split('\t')
        identity=info[2].strip()
        align_len=info[3].strip()
        evalue=info[10].strip()
        bit_score=info[11].strip()
        return [identity,align_len,evalue,bit_score]

presenttxs=[]    #tx that is present in our annotation
absent=[]     #tx that is not in our annotation
for gene in allpartners:
    if selecttranscript.has_key(gene):
        presenttxs.append(selecttranscript[gene])
    else:
        absent.append(gene)

for seq_record in SeqIO.parse(refdb,'fasta'):
    sid=seq_record.id
    seq=seq_record.seq
    if sid in presenttxs:
        g=txdb[sid].gene
        fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
        SeqIO.write(seq_record,fastafile,'fasta')
        fastafile.close()

for gp in candidates:
    geneA,geneB=gp.split('_')
    if geneA in absent or geneB in absent:
        candidates[gp]=['NA']*6
    else:
        gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
        gaobj=SeqIO.parse(gaseq,'fasta').next()
        gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
        gbobj=SeqIO.parse(gbseq,'fasta').next()
        ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
        print 'Comparing %s %s'%(geneA,geneB)
        a=seqblast(gaseq,gbseq)
        if a==None:
            candidates[gp]=[ga_len,gb_len,'NA','NA','100','0']
        else:
            a[0:0]=[ga_len,gb_len]
            candidates[gp]=a


infile=open(datafilename)
outfile=open(outfilename,'w')
header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore']
outfile.write('%s\n'%('\t'.join(header)))
for line in infile:
    if line.strip():
        info=line.split()
        geneA,geneB=info[0],info[1]
        key='%s_%s'%(geneA,geneB)
        vv=candidates[key]
        outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv)))
outfile.close()
infile.close()

print 'Homolgy test done!'