Mercurial > repos > siyuan > prada
view pyPRADA_1.2/prada-homology @ 1:03815b87eb65 draft
Uploaded
author | siyuan |
---|---|
date | Fri, 21 Feb 2014 18:08:37 -0500 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#!/usr/bin/env python #This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes. #In practice, it finds longest transcripts and uses blastn to compare the sequences. #It is used to filter out fusion false positives like family genes etc. #A temp folder is generated to save transcript fasta files. import os import re import sys import time import subprocess import bioclass from Bio import SeqIO import ioprada ###################################################################################### help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX **parameters** -conf see prada-fusion -conf for details -i a tab/space delimited two-column file --- gene1 gene2 -o output file -tmpdir tmporary directory that saves fasta files ''' args=sys.argv if '-h' in args or '-help' in args or len(args)==1: print help_menu sys.exit(0) if '-i' not in sys.argv: sys.exit('Input file needed') else: i=sys.argv.index('-i') datafilename=sys.argv[i+1] if '-o' not in sys.argv: sys.exit('Output file needed') else: i=sys.argv.index('-o') outfilename=sys.argv[i+1] if '-tmpdir' not in sys.argv: sys.exit('TMP dir needed') else: i=sys.argv.index('-tmpdir') blastseq_tmp_dir=sys.argv[i+1] if os.path.exists(blastseq_tmp_dir): print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir else: cmd='mkdir %s'%blastseq_tmp_dir cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) prada_path=os.path.dirname(os.path.abspath(__file__)) #### ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line if '-conf' in args: i=args.index('-conf') reffile=args[i+1] if os.path.exists(reffile): pass else: for pth in ref_search_path: new_reffile='%s/%s'%(pth, os.path.basename(reffile)) if os.path.exists(new_reffile): reffile=new_reffile break else: sys.exit('ERROR: conf file %s not found'%reffile) else: reffile='%s/conf.txt'%prada_path if not os.path.exists(reffile): sys.exit('ERROR: No default conf.txt found and none specified') #Now print all input parameters print 'CMD: %s'%('\t'.join(args)) #reference files refdict=ioprada.read_conf(reffile) featurefile=refdict['--REF--']['feature_file'] refdb=refdict['--REF--']['tx_seq_file'] blastn='%s/tools/blastn'%prada_path #################Here We Go############################################# ##Read information from raw file print 'Collecting gene/Transcript info ...' #call functions in ioprada module // txdb,genedb=ioprada.read_feature(featurefile,verbose=False) ################################################################## ##Get all candidate gene pairs infile=open(datafilename) candidates={} for line in infile: if line.strip(): info=line.split() geneA,geneB=info[0],info[1] key='%s_%s'%(geneA,geneB) candidates[key]='' infile.close() print '%d unique gene pairs collected'%len(candidates) ##Select the transcript for each gene selecttranscript={} for gene in genedb: txs=genedb[gene].transcript stx=txs[0] initlen=stx.length for tx in txs: if tx.length >= initlen: stx=tx initlen=stx.length selecttranscript[gene]=stx.name print 'Selected longest transcript for each gene.' ##Get all needed sequences allpartners=set() for item in candidates: sset=set(item.split('_')) allpartners=allpartners.union(sset) #################################################################### def seqblast(seqA,seqB): '''seqA,seqB:fasta files''' global blastn cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB) cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT) result=cmdout.stdout.read() best_align=result.split('\n')[0] if best_align=='': return None else: info=best_align.split('\t') identity=info[2].strip() align_len=info[3].strip() evalue=info[10].strip() bit_score=info[11].strip() return [identity,align_len,evalue,bit_score] presenttxs=[] #tx that is present in our annotation absent=[] #tx that is not in our annotation for gene in allpartners: if selecttranscript.has_key(gene): presenttxs.append(selecttranscript[gene]) else: absent.append(gene) for seq_record in SeqIO.parse(refdb,'fasta'): sid=seq_record.id seq=seq_record.seq if sid in presenttxs: g=txdb[sid].gene fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w') SeqIO.write(seq_record,fastafile,'fasta') fastafile.close() for gp in candidates: geneA,geneB=gp.split('_') if geneA in absent or geneB in absent: candidates[gp]=['NA']*6 else: gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA) gaobj=SeqIO.parse(gaseq,'fasta').next() gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB) gbobj=SeqIO.parse(gbseq,'fasta').next() ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq)) print 'Comparing %s %s'%(geneA,geneB) a=seqblast(gaseq,gbseq) if a==None: candidates[gp]=[ga_len,gb_len,'NA','NA','100','0'] else: a[0:0]=[ga_len,gb_len] candidates[gp]=a infile=open(datafilename) outfile=open(outfilename,'w') header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore'] outfile.write('%s\n'%('\t'.join(header))) for line in infile: if line.strip(): info=line.split() geneA,geneB=info[0],info[1] key='%s_%s'%(geneA,geneB) vv=candidates[key] outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv))) outfile.close() infile.close() print 'Homolgy test done!'