0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 #This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes.
|
|
4 #In practice, it finds longest transcripts and uses blastn to compare the sequences.
|
|
5 #It is used to filter out fusion false positives like family genes etc.
|
|
6 #A temp folder is generated to save transcript fasta files.
|
|
7
|
|
8 import os
|
|
9 import re
|
|
10 import sys
|
|
11 import time
|
|
12 import subprocess
|
|
13 import bioclass
|
|
14 from Bio import SeqIO
|
|
15 import ioprada
|
|
16
|
|
17 ######################################################################################
|
|
18 help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX
|
|
19 **parameters**
|
|
20 -conf see prada-fusion -conf for details
|
|
21 -i a tab/space delimited two-column file --- gene1 gene2
|
|
22 -o output file
|
|
23 -tmpdir tmporary directory that saves fasta files
|
|
24 '''
|
|
25
|
|
26 args=sys.argv
|
|
27
|
|
28 if '-h' in args or '-help' in args or len(args)==1:
|
|
29 print help_menu
|
|
30 sys.exit(0)
|
|
31
|
|
32 if '-i' not in sys.argv:
|
|
33 sys.exit('Input file needed')
|
|
34 else:
|
|
35 i=sys.argv.index('-i')
|
|
36 datafilename=sys.argv[i+1]
|
|
37 if '-o' not in sys.argv:
|
|
38 sys.exit('Output file needed')
|
|
39 else:
|
|
40 i=sys.argv.index('-o')
|
|
41 outfilename=sys.argv[i+1]
|
|
42 if '-tmpdir' not in sys.argv:
|
|
43 sys.exit('TMP dir needed')
|
|
44 else:
|
|
45 i=sys.argv.index('-tmpdir')
|
|
46 blastseq_tmp_dir=sys.argv[i+1]
|
|
47 if os.path.exists(blastseq_tmp_dir):
|
|
48 print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir
|
|
49 else:
|
|
50 cmd='mkdir %s'%blastseq_tmp_dir
|
|
51 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
|
|
52
|
|
53 prada_path=os.path.dirname(os.path.abspath(__file__)) ####
|
|
54 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line
|
|
55
|
|
56 if '-conf' in args:
|
|
57 i=args.index('-conf')
|
|
58 reffile=args[i+1]
|
|
59 if os.path.exists(reffile):
|
|
60 pass
|
|
61 else:
|
|
62 for pth in ref_search_path:
|
|
63 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
|
|
64 if os.path.exists(new_reffile):
|
|
65 reffile=new_reffile
|
|
66 break
|
|
67 else:
|
|
68 sys.exit('ERROR: conf file %s not found'%reffile)
|
|
69 else:
|
|
70 reffile='%s/conf.txt'%prada_path
|
|
71 if not os.path.exists(reffile):
|
|
72 sys.exit('ERROR: No default conf.txt found and none specified')
|
|
73
|
|
74 #Now print all input parameters
|
|
75 print 'CMD: %s'%('\t'.join(args))
|
|
76
|
|
77 #reference files
|
|
78 refdict=ioprada.read_conf(reffile)
|
|
79 featurefile=refdict['--REF--']['feature_file']
|
|
80 refdb=refdict['--REF--']['tx_seq_file']
|
|
81
|
|
82 blastn='%s/tools/blastn'%prada_path
|
|
83
|
|
84 #################Here We Go#############################################
|
|
85 ##Read information from raw file
|
|
86 print 'Collecting gene/Transcript info ...'
|
|
87 #call functions in ioprada module //
|
|
88 txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
|
|
89
|
|
90 ##################################################################
|
|
91 ##Get all candidate gene pairs
|
|
92 infile=open(datafilename)
|
|
93 candidates={}
|
|
94 for line in infile:
|
|
95 if line.strip():
|
|
96 info=line.split()
|
|
97 geneA,geneB=info[0],info[1]
|
|
98 key='%s_%s'%(geneA,geneB)
|
|
99 candidates[key]=''
|
|
100 infile.close()
|
|
101 print '%d unique gene pairs collected'%len(candidates)
|
|
102
|
|
103 ##Select the transcript for each gene
|
|
104 selecttranscript={}
|
|
105 for gene in genedb:
|
|
106 txs=genedb[gene].transcript
|
|
107 stx=txs[0]
|
|
108 initlen=stx.length
|
|
109 for tx in txs:
|
|
110 if tx.length >= initlen:
|
|
111 stx=tx
|
|
112 initlen=stx.length
|
|
113 selecttranscript[gene]=stx.name
|
|
114 print 'Selected longest transcript for each gene.'
|
|
115
|
|
116 ##Get all needed sequences
|
|
117 allpartners=set()
|
|
118 for item in candidates:
|
|
119 sset=set(item.split('_'))
|
|
120 allpartners=allpartners.union(sset)
|
|
121
|
|
122 ####################################################################
|
|
123 def seqblast(seqA,seqB):
|
|
124 '''seqA,seqB:fasta files'''
|
|
125 global blastn
|
|
126 cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
|
|
127 cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
|
|
128 result=cmdout.stdout.read()
|
|
129 best_align=result.split('\n')[0]
|
|
130 if best_align=='':
|
|
131 return None
|
|
132 else:
|
|
133 info=best_align.split('\t')
|
|
134 identity=info[2].strip()
|
|
135 align_len=info[3].strip()
|
|
136 evalue=info[10].strip()
|
|
137 bit_score=info[11].strip()
|
|
138 return [identity,align_len,evalue,bit_score]
|
|
139
|
|
140 presenttxs=[] #tx that is present in our annotation
|
|
141 absent=[] #tx that is not in our annotation
|
|
142 for gene in allpartners:
|
|
143 if selecttranscript.has_key(gene):
|
|
144 presenttxs.append(selecttranscript[gene])
|
|
145 else:
|
|
146 absent.append(gene)
|
|
147
|
|
148 for seq_record in SeqIO.parse(refdb,'fasta'):
|
|
149 sid=seq_record.id
|
|
150 seq=seq_record.seq
|
|
151 if sid in presenttxs:
|
|
152 g=txdb[sid].gene
|
|
153 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
|
|
154 SeqIO.write(seq_record,fastafile,'fasta')
|
|
155 fastafile.close()
|
|
156
|
|
157 for gp in candidates:
|
|
158 geneA,geneB=gp.split('_')
|
|
159 if geneA in absent or geneB in absent:
|
|
160 candidates[gp]=['NA']*6
|
|
161 else:
|
|
162 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
|
|
163 gaobj=SeqIO.parse(gaseq,'fasta').next()
|
|
164 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
|
|
165 gbobj=SeqIO.parse(gbseq,'fasta').next()
|
|
166 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
|
|
167 print 'Comparing %s %s'%(geneA,geneB)
|
|
168 a=seqblast(gaseq,gbseq)
|
|
169 if a==None:
|
|
170 candidates[gp]=[ga_len,gb_len,'NA','NA','100','0']
|
|
171 else:
|
|
172 a[0:0]=[ga_len,gb_len]
|
|
173 candidates[gp]=a
|
|
174
|
|
175
|
|
176 infile=open(datafilename)
|
|
177 outfile=open(outfilename,'w')
|
|
178 header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore']
|
|
179 outfile.write('%s\n'%('\t'.join(header)))
|
|
180 for line in infile:
|
|
181 if line.strip():
|
|
182 info=line.split()
|
|
183 geneA,geneB=info[0],info[1]
|
|
184 key='%s_%s'%(geneA,geneB)
|
|
185 vv=candidates[key]
|
|
186 outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv)))
|
|
187 outfile.close()
|
|
188 infile.close()
|
|
189
|
|
190 print 'Homolgy test done!'
|