annotate pyPRADA_1.2/prada-homology @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #!/usr/bin/env python
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #In practice, it finds longest transcripts and uses blastn to compare the sequences.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 #It is used to filter out fusion false positives like family genes etc.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 #A temp folder is generated to save transcript fasta files.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 import os
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 import re
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 import sys
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 import time
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 import subprocess
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 import bioclass
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 from Bio import SeqIO
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 import ioprada
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 ######################################################################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 **parameters**
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 -conf see prada-fusion -conf for details
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 -i a tab/space delimited two-column file --- gene1 gene2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 -o output file
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 -tmpdir tmporary directory that saves fasta files
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 '''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 args=sys.argv
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 if '-h' in args or '-help' in args or len(args)==1:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 print help_menu
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 sys.exit(0)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 if '-i' not in sys.argv:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 sys.exit('Input file needed')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 i=sys.argv.index('-i')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 datafilename=sys.argv[i+1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 if '-o' not in sys.argv:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 sys.exit('Output file needed')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 i=sys.argv.index('-o')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 outfilename=sys.argv[i+1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 if '-tmpdir' not in sys.argv:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 sys.exit('TMP dir needed')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 i=sys.argv.index('-tmpdir')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 blastseq_tmp_dir=sys.argv[i+1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 if os.path.exists(blastseq_tmp_dir):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 cmd='mkdir %s'%blastseq_tmp_dir
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 prada_path=os.path.dirname(os.path.abspath(__file__)) ####
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 if '-conf' in args:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 i=args.index('-conf')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 reffile=args[i+1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 if os.path.exists(reffile):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 pass
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 for pth in ref_search_path:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 if os.path.exists(new_reffile):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 reffile=new_reffile
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 break
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 sys.exit('ERROR: conf file %s not found'%reffile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 reffile='%s/conf.txt'%prada_path
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 if not os.path.exists(reffile):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 sys.exit('ERROR: No default conf.txt found and none specified')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 #Now print all input parameters
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 print 'CMD: %s'%('\t'.join(args))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 #reference files
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 refdict=ioprada.read_conf(reffile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 featurefile=refdict['--REF--']['feature_file']
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 refdb=refdict['--REF--']['tx_seq_file']
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 blastn='%s/tools/blastn'%prada_path
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 #################Here We Go#############################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 ##Read information from raw file
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 print 'Collecting gene/Transcript info ...'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 #call functions in ioprada module //
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 ##################################################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 ##Get all candidate gene pairs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 infile=open(datafilename)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 candidates={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 if line.strip():
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 info=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 geneA,geneB=info[0],info[1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 key='%s_%s'%(geneA,geneB)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 candidates[key]=''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 print '%d unique gene pairs collected'%len(candidates)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 ##Select the transcript for each gene
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 selecttranscript={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 for gene in genedb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 txs=genedb[gene].transcript
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 stx=txs[0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 initlen=stx.length
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 for tx in txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 if tx.length >= initlen:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 stx=tx
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 initlen=stx.length
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 selecttranscript[gene]=stx.name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 print 'Selected longest transcript for each gene.'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 ##Get all needed sequences
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 allpartners=set()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 for item in candidates:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 sset=set(item.split('_'))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 allpartners=allpartners.union(sset)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 ####################################################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 def seqblast(seqA,seqB):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 '''seqA,seqB:fasta files'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 global blastn
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 result=cmdout.stdout.read()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 best_align=result.split('\n')[0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 if best_align=='':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 return None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 info=best_align.split('\t')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 identity=info[2].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 align_len=info[3].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 evalue=info[10].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 bit_score=info[11].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 return [identity,align_len,evalue,bit_score]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 presenttxs=[] #tx that is present in our annotation
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 absent=[] #tx that is not in our annotation
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 for gene in allpartners:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 if selecttranscript.has_key(gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 presenttxs.append(selecttranscript[gene])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 absent.append(gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 for seq_record in SeqIO.parse(refdb,'fasta'):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 sid=seq_record.id
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 seq=seq_record.seq
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 if sid in presenttxs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 g=txdb[sid].gene
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 SeqIO.write(seq_record,fastafile,'fasta')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 fastafile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 for gp in candidates:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 geneA,geneB=gp.split('_')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 if geneA in absent or geneB in absent:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 candidates[gp]=['NA']*6
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 gaobj=SeqIO.parse(gaseq,'fasta').next()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 gbobj=SeqIO.parse(gbseq,'fasta').next()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 print 'Comparing %s %s'%(geneA,geneB)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 a=seqblast(gaseq,gbseq)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 if a==None:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 candidates[gp]=[ga_len,gb_len,'NA','NA','100','0']
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 a[0:0]=[ga_len,gb_len]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 candidates[gp]=a
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 infile=open(datafilename)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 outfile=open(outfilename,'w')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore']
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 outfile.write('%s\n'%('\t'.join(header)))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 if line.strip():
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 info=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 geneA,geneB=info[0],info[1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 key='%s_%s'%(geneA,geneB)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 vv=candidates[key]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv)))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 outfile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 print 'Homolgy test done!'