prada: pyPRADA_1.2/prada-homology comparison

comparison pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4

Uploaded

author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:acc2ca1a3ba4
+#!/usr/bin/env python
+#This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes.
+#In practice, it finds longest transcripts and uses blastn to compare the sequences.
+#It is used to filter out fusion false positives like family genes etc.
+#A temp folder is generated to save transcript fasta files.
+import os
+import re
+import sys
+import time
+import subprocess
+import bioclass
+from Bio import SeqIO
+import ioprada
+######################################################################################
+help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX
+**parameters**
+-conf	see prada-fusion -conf for details
+-i		a tab/space delimited two-column file --- gene1 gene2
+-o		output file
+-tmpdir	tmporary directory that saves fasta files
+'''
+args=sys.argv
+if '-h' in args or '-help' in args or len(args)==1:
+print help_menu
+sys.exit(0)
+if '-i' not in sys.argv:
+sys.exit('Input file needed')
+else:
+i=sys.argv.index('-i')
+datafilename=sys.argv[i+1]
+if '-o' not in sys.argv:
+sys.exit('Output file needed')
+else:
+i=sys.argv.index('-o')
+outfilename=sys.argv[i+1]
+if '-tmpdir' not in sys.argv:
+sys.exit('TMP dir needed')
+else:
+i=sys.argv.index('-tmpdir')
+blastseq_tmp_dir=sys.argv[i+1]
+if os.path.exists(blastseq_tmp_dir):
+print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir
+else:
+cmd='mkdir %s'%blastseq_tmp_dir
+cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+if '-conf' in args:
+i=args.index('-conf')
+reffile=args[i+1]
+if os.path.exists(reffile):
+pass
+else:
+for pth in ref_search_path:
+new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+if os.path.exists(new_reffile):
+reffile=new_reffile
+break
+else:
+sys.exit('ERROR: conf file %s not found'%reffile)
+else:
+reffile='%s/conf.txt'%prada_path
+if not os.path.exists(reffile):
+sys.exit('ERROR: No default conf.txt found and none specified')
+#Now print all input parameters
+print 'CMD: %s'%('\t'.join(args))
+#reference files
+refdict=ioprada.read_conf(reffile)
+featurefile=refdict['--REF--']['feature_file']
+refdb=refdict['--REF--']['tx_seq_file']
+blastn='%s/tools/blastn'%prada_path
+#################Here We Go#############################################
+##Read information from raw file
+print 'Collecting gene/Transcript info ...'
+#call functions in ioprada module //
+txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
+##################################################################
+##Get all candidate gene pairs
+infile=open(datafilename)
+candidates={}
+for line in infile:
+if line.strip():
+info=line.split()
+geneA,geneB=info[0],info[1]
+key='%s_%s'%(geneA,geneB)
+candidates[key]=''
+infile.close()
+print '%d unique gene pairs collected'%len(candidates)
+##Select the transcript for each gene
+selecttranscript={}
+for gene in genedb:
+txs=genedb[gene].transcript
+stx=txs[0]
+initlen=stx.length
+for tx in txs:
+if tx.length >= initlen:
+stx=tx
+initlen=stx.length
+selecttranscript[gene]=stx.name
+print 'Selected longest transcript for each gene.'
+##Get all needed sequences
+allpartners=set()
+for item in candidates:
+sset=set(item.split('_'))
+allpartners=allpartners.union(sset)
+####################################################################
+def seqblast(seqA,seqB):
+'''seqA,seqB:fasta files'''
+global blastn
+cmdstr='%s -task=blastn  -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
+cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+result=cmdout.stdout.read()
+best_align=result.split('\n')[0]
+if best_align=='':
+return None
+else:
+info=best_align.split('\t')
+identity=info[2].strip()
+align_len=info[3].strip()
+evalue=info[10].strip()
+bit_score=info[11].strip()
+return [identity,align_len,evalue,bit_score]
+presenttxs=[]    #tx that is present in our annotation
+absent=[]     #tx that is not in our annotation
+for gene in allpartners:
+if selecttranscript.has_key(gene):
+presenttxs.append(selecttranscript[gene])
+else:
+absent.append(gene)
+for seq_record in SeqIO.parse(refdb,'fasta'):
+sid=seq_record.id
+seq=seq_record.seq
+if sid in presenttxs:
+g=txdb[sid].gene
+fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
+SeqIO.write(seq_record,fastafile,'fasta')
+fastafile.close()
+for gp in candidates:
+geneA,geneB=gp.split('_')
+if geneA in absent or geneB in absent:
+candidates[gp]=['NA']*6
+else:
+gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
+gaobj=SeqIO.parse(gaseq,'fasta').next()
+gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
+gbobj=SeqIO.parse(gbseq,'fasta').next()
+ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
+print 'Comparing %s %s'%(geneA,geneB)
+a=seqblast(gaseq,gbseq)
+if a==None:
+candidates[gp]=[ga_len,gb_len,'NA','NA','100','0']
+else:
+a[0:0]=[ga_len,gb_len]
+candidates[gp]=a
+infile=open(datafilename)
+outfile=open(outfilename,'w')
+header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore']
+outfile.write('%s\n'%('\t'.join(header)))
+for line in infile:
+if line.strip():
+info=line.split()
+geneA,geneB=info[0],info[1]
+key='%s_%s'%(geneA,geneB)
+vv=candidates[key]
+outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv)))
+outfile.close()
+infile.close()
+print 'Homolgy test done!'

Mercurial > repos > siyuan > prada

comparison pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4