diff pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/prada-homology	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+
+#This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes.
+#In practice, it finds longest transcripts and uses blastn to compare the sequences. 
+#It is used to filter out fusion false positives like family genes etc. 
+#A temp folder is generated to save transcript fasta files. 
+
+import os
+import re
+import sys
+import time
+import subprocess
+import bioclass
+from Bio import SeqIO
+import ioprada
+
+######################################################################################
+help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX
+**parameters**
+    -conf	see prada-fusion -conf for details
+    -i		a tab/space delimited two-column file --- gene1 gene2
+    -o		output file
+    -tmpdir	tmporary directory that saves fasta files
+'''
+
+args=sys.argv
+
+if '-h' in args or '-help' in args or len(args)==1:
+    print help_menu
+    sys.exit(0)
+
+if '-i' not in sys.argv:
+    sys.exit('Input file needed')
+else:
+    i=sys.argv.index('-i')
+    datafilename=sys.argv[i+1]
+if '-o' not in sys.argv:
+    sys.exit('Output file needed')
+else:
+    i=sys.argv.index('-o')
+    outfilename=sys.argv[i+1]
+if '-tmpdir' not in sys.argv:
+    sys.exit('TMP dir needed')
+else:
+    i=sys.argv.index('-tmpdir')
+    blastseq_tmp_dir=sys.argv[i+1]
+if os.path.exists(blastseq_tmp_dir):
+    print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir
+else:
+    cmd='mkdir %s'%blastseq_tmp_dir
+    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+
+if '-conf' in args:
+    i=args.index('-conf')
+    reffile=args[i+1]
+    if os.path.exists(reffile):
+        pass
+    else:
+        for pth in ref_search_path:
+            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+            if os.path.exists(new_reffile):
+                reffile=new_reffile
+                break
+        else:
+            sys.exit('ERROR: conf file %s not found'%reffile)
+else:
+    reffile='%s/conf.txt'%prada_path
+    if not os.path.exists(reffile):
+        sys.exit('ERROR: No default conf.txt found and none specified')
+
+#Now print all input parameters
+print 'CMD: %s'%('\t'.join(args))
+
+#reference files
+refdict=ioprada.read_conf(reffile)
+featurefile=refdict['--REF--']['feature_file']
+refdb=refdict['--REF--']['tx_seq_file']
+
+blastn='%s/tools/blastn'%prada_path
+
+#################Here We Go#############################################
+##Read information from raw file
+print 'Collecting gene/Transcript info ...'
+#call functions in ioprada module //
+txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
+
+##################################################################
+##Get all candidate gene pairs
+infile=open(datafilename)
+candidates={}
+for line in infile:
+    if line.strip():
+        info=line.split()
+        geneA,geneB=info[0],info[1]
+        key='%s_%s'%(geneA,geneB)
+        candidates[key]=''
+infile.close()
+print '%d unique gene pairs collected'%len(candidates)
+
+##Select the transcript for each gene
+selecttranscript={}
+for gene in genedb:
+    txs=genedb[gene].transcript
+    stx=txs[0]
+    initlen=stx.length
+    for tx in txs:
+        if tx.length >= initlen:
+            stx=tx
+            initlen=stx.length
+    selecttranscript[gene]=stx.name
+print 'Selected longest transcript for each gene.'
+
+##Get all needed sequences
+allpartners=set()
+for item in candidates:
+    sset=set(item.split('_'))
+    allpartners=allpartners.union(sset)
+
+####################################################################
+def seqblast(seqA,seqB):
+    '''seqA,seqB:fasta files'''
+    global blastn
+    cmdstr='%s -task=blastn  -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
+    cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+    result=cmdout.stdout.read()
+    best_align=result.split('\n')[0]
+    if best_align=='':
+        return None
+    else:
+        info=best_align.split('\t')
+        identity=info[2].strip()
+        align_len=info[3].strip()
+        evalue=info[10].strip()
+        bit_score=info[11].strip()
+        return [identity,align_len,evalue,bit_score]
+
+presenttxs=[]    #tx that is present in our annotation
+absent=[]     #tx that is not in our annotation
+for gene in allpartners:
+    if selecttranscript.has_key(gene):
+        presenttxs.append(selecttranscript[gene])
+    else:
+        absent.append(gene)
+
+for seq_record in SeqIO.parse(refdb,'fasta'):
+    sid=seq_record.id
+    seq=seq_record.seq
+    if sid in presenttxs:
+        g=txdb[sid].gene
+        fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
+        SeqIO.write(seq_record,fastafile,'fasta')
+        fastafile.close()
+
+for gp in candidates:
+    geneA,geneB=gp.split('_')
+    if geneA in absent or geneB in absent:
+        candidates[gp]=['NA']*6
+    else:
+        gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
+        gaobj=SeqIO.parse(gaseq,'fasta').next()
+        gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
+        gbobj=SeqIO.parse(gbseq,'fasta').next()
+        ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
+        print 'Comparing %s %s'%(geneA,geneB)
+        a=seqblast(gaseq,gbseq)
+        if a==None:
+            candidates[gp]=[ga_len,gb_len,'NA','NA','100','0']
+        else:
+            a[0:0]=[ga_len,gb_len]
+            candidates[gp]=a
+
+
+infile=open(datafilename)
+outfile=open(outfilename,'w')
+header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore']
+outfile.write('%s\n'%('\t'.join(header)))
+for line in infile:
+    if line.strip():
+        info=line.split()
+        geneA,geneB=info[0],info[1]
+        key='%s_%s'%(geneA,geneB)
+        vv=candidates[key]
+        outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv)))
+outfile.close()
+infile.close()
+
+print 'Homolgy test done!'