Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/prada-homology Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,190 @@ +#!/usr/bin/env python + +#This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes. +#In practice, it finds longest transcripts and uses blastn to compare the sequences. +#It is used to filter out fusion false positives like family genes etc. +#A temp folder is generated to save transcript fasta files. + +import os +import re +import sys +import time +import subprocess +import bioclass +from Bio import SeqIO +import ioprada + +###################################################################################### +help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX +**parameters** + -conf see prada-fusion -conf for details + -i a tab/space delimited two-column file --- gene1 gene2 + -o output file + -tmpdir tmporary directory that saves fasta files +''' + +args=sys.argv + +if '-h' in args or '-help' in args or len(args)==1: + print help_menu + sys.exit(0) + +if '-i' not in sys.argv: + sys.exit('Input file needed') +else: + i=sys.argv.index('-i') + datafilename=sys.argv[i+1] +if '-o' not in sys.argv: + sys.exit('Output file needed') +else: + i=sys.argv.index('-o') + outfilename=sys.argv[i+1] +if '-tmpdir' not in sys.argv: + sys.exit('TMP dir needed') +else: + i=sys.argv.index('-tmpdir') + blastseq_tmp_dir=sys.argv[i+1] +if os.path.exists(blastseq_tmp_dir): + print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir +else: + cmd='mkdir %s'%blastseq_tmp_dir + cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) + +prada_path=os.path.dirname(os.path.abspath(__file__)) #### +ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line + +if '-conf' in args: + i=args.index('-conf') + reffile=args[i+1] + if os.path.exists(reffile): + pass + else: + for pth in ref_search_path: + new_reffile='%s/%s'%(pth, os.path.basename(reffile)) + if os.path.exists(new_reffile): + reffile=new_reffile + break + else: + sys.exit('ERROR: conf file %s not found'%reffile) +else: + reffile='%s/conf.txt'%prada_path + if not os.path.exists(reffile): + sys.exit('ERROR: No default conf.txt found and none specified') + +#Now print all input parameters +print 'CMD: %s'%('\t'.join(args)) + +#reference files +refdict=ioprada.read_conf(reffile) +featurefile=refdict['--REF--']['feature_file'] +refdb=refdict['--REF--']['tx_seq_file'] + +blastn='%s/tools/blastn'%prada_path + +#################Here We Go############################################# +##Read information from raw file +print 'Collecting gene/Transcript info ...' +#call functions in ioprada module // +txdb,genedb=ioprada.read_feature(featurefile,verbose=False) + +################################################################## +##Get all candidate gene pairs +infile=open(datafilename) +candidates={} +for line in infile: + if line.strip(): + info=line.split() + geneA,geneB=info[0],info[1] + key='%s_%s'%(geneA,geneB) + candidates[key]='' +infile.close() +print '%d unique gene pairs collected'%len(candidates) + +##Select the transcript for each gene +selecttranscript={} +for gene in genedb: + txs=genedb[gene].transcript + stx=txs[0] + initlen=stx.length + for tx in txs: + if tx.length >= initlen: + stx=tx + initlen=stx.length + selecttranscript[gene]=stx.name +print 'Selected longest transcript for each gene.' + +##Get all needed sequences +allpartners=set() +for item in candidates: + sset=set(item.split('_')) + allpartners=allpartners.union(sset) + +#################################################################### +def seqblast(seqA,seqB): + '''seqA,seqB:fasta files''' + global blastn + cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB) + cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + result=cmdout.stdout.read() + best_align=result.split('\n')[0] + if best_align=='': + return None + else: + info=best_align.split('\t') + identity=info[2].strip() + align_len=info[3].strip() + evalue=info[10].strip() + bit_score=info[11].strip() + return [identity,align_len,evalue,bit_score] + +presenttxs=[] #tx that is present in our annotation +absent=[] #tx that is not in our annotation +for gene in allpartners: + if selecttranscript.has_key(gene): + presenttxs.append(selecttranscript[gene]) + else: + absent.append(gene) + +for seq_record in SeqIO.parse(refdb,'fasta'): + sid=seq_record.id + seq=seq_record.seq + if sid in presenttxs: + g=txdb[sid].gene + fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w') + SeqIO.write(seq_record,fastafile,'fasta') + fastafile.close() + +for gp in candidates: + geneA,geneB=gp.split('_') + if geneA in absent or geneB in absent: + candidates[gp]=['NA']*6 + else: + gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA) + gaobj=SeqIO.parse(gaseq,'fasta').next() + gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB) + gbobj=SeqIO.parse(gbseq,'fasta').next() + ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq)) + print 'Comparing %s %s'%(geneA,geneB) + a=seqblast(gaseq,gbseq) + if a==None: + candidates[gp]=[ga_len,gb_len,'NA','NA','100','0'] + else: + a[0:0]=[ga_len,gb_len] + candidates[gp]=a + + +infile=open(datafilename) +outfile=open(outfilename,'w') +header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore'] +outfile.write('%s\n'%('\t'.join(header))) +for line in infile: + if line.strip(): + info=line.split() + geneA,geneB=info[0],info[1] + key='%s_%s'%(geneA,geneB) + vv=candidates[key] + outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv))) +outfile.close() +infile.close() + +print 'Homolgy test done!'