comparison pyPRADA_1.2/prada-homology @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #!/usr/bin/env python
2
3 #This script is part of the pyPRADA package. It is used to calculate sequence similarity of genes.
4 #In practice, it finds longest transcripts and uses blastn to compare the sequences.
5 #It is used to filter out fusion false positives like family genes etc.
6 #A temp folder is generated to save transcript fasta files.
7
8 import os
9 import re
10 import sys
11 import time
12 import subprocess
13 import bioclass
14 from Bio import SeqIO
15 import ioprada
16
17 ######################################################################################
18 help_menu='''\nUsage: prada-homology -conf xx.txt -i inputfile -o outputfile -tmpdir XXX
19 **parameters**
20 -conf see prada-fusion -conf for details
21 -i a tab/space delimited two-column file --- gene1 gene2
22 -o output file
23 -tmpdir tmporary directory that saves fasta files
24 '''
25
26 args=sys.argv
27
28 if '-h' in args or '-help' in args or len(args)==1:
29 print help_menu
30 sys.exit(0)
31
32 if '-i' not in sys.argv:
33 sys.exit('Input file needed')
34 else:
35 i=sys.argv.index('-i')
36 datafilename=sys.argv[i+1]
37 if '-o' not in sys.argv:
38 sys.exit('Output file needed')
39 else:
40 i=sys.argv.index('-o')
41 outfilename=sys.argv[i+1]
42 if '-tmpdir' not in sys.argv:
43 sys.exit('TMP dir needed')
44 else:
45 i=sys.argv.index('-tmpdir')
46 blastseq_tmp_dir=sys.argv[i+1]
47 if os.path.exists(blastseq_tmp_dir):
48 print 'Warning: tmpdir %s exists!'%blastseq_tmp_dir
49 else:
50 cmd='mkdir %s'%blastseq_tmp_dir
51 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True)
52
53 prada_path=os.path.dirname(os.path.abspath(__file__)) ####
54 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line
55
56 if '-conf' in args:
57 i=args.index('-conf')
58 reffile=args[i+1]
59 if os.path.exists(reffile):
60 pass
61 else:
62 for pth in ref_search_path:
63 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
64 if os.path.exists(new_reffile):
65 reffile=new_reffile
66 break
67 else:
68 sys.exit('ERROR: conf file %s not found'%reffile)
69 else:
70 reffile='%s/conf.txt'%prada_path
71 if not os.path.exists(reffile):
72 sys.exit('ERROR: No default conf.txt found and none specified')
73
74 #Now print all input parameters
75 print 'CMD: %s'%('\t'.join(args))
76
77 #reference files
78 refdict=ioprada.read_conf(reffile)
79 featurefile=refdict['--REF--']['feature_file']
80 refdb=refdict['--REF--']['tx_seq_file']
81
82 blastn='%s/tools/blastn'%prada_path
83
84 #################Here We Go#############################################
85 ##Read information from raw file
86 print 'Collecting gene/Transcript info ...'
87 #call functions in ioprada module //
88 txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
89
90 ##################################################################
91 ##Get all candidate gene pairs
92 infile=open(datafilename)
93 candidates={}
94 for line in infile:
95 if line.strip():
96 info=line.split()
97 geneA,geneB=info[0],info[1]
98 key='%s_%s'%(geneA,geneB)
99 candidates[key]=''
100 infile.close()
101 print '%d unique gene pairs collected'%len(candidates)
102
103 ##Select the transcript for each gene
104 selecttranscript={}
105 for gene in genedb:
106 txs=genedb[gene].transcript
107 stx=txs[0]
108 initlen=stx.length
109 for tx in txs:
110 if tx.length >= initlen:
111 stx=tx
112 initlen=stx.length
113 selecttranscript[gene]=stx.name
114 print 'Selected longest transcript for each gene.'
115
116 ##Get all needed sequences
117 allpartners=set()
118 for item in candidates:
119 sset=set(item.split('_'))
120 allpartners=allpartners.union(sset)
121
122 ####################################################################
123 def seqblast(seqA,seqB):
124 '''seqA,seqB:fasta files'''
125 global blastn
126 cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
127 cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
128 result=cmdout.stdout.read()
129 best_align=result.split('\n')[0]
130 if best_align=='':
131 return None
132 else:
133 info=best_align.split('\t')
134 identity=info[2].strip()
135 align_len=info[3].strip()
136 evalue=info[10].strip()
137 bit_score=info[11].strip()
138 return [identity,align_len,evalue,bit_score]
139
140 presenttxs=[] #tx that is present in our annotation
141 absent=[] #tx that is not in our annotation
142 for gene in allpartners:
143 if selecttranscript.has_key(gene):
144 presenttxs.append(selecttranscript[gene])
145 else:
146 absent.append(gene)
147
148 for seq_record in SeqIO.parse(refdb,'fasta'):
149 sid=seq_record.id
150 seq=seq_record.seq
151 if sid in presenttxs:
152 g=txdb[sid].gene
153 fastafile=open('%s/%s.fasta'%(blastseq_tmp_dir,g),'w')
154 SeqIO.write(seq_record,fastafile,'fasta')
155 fastafile.close()
156
157 for gp in candidates:
158 geneA,geneB=gp.split('_')
159 if geneA in absent or geneB in absent:
160 candidates[gp]=['NA']*6
161 else:
162 gaseq='%s/%s.fasta'%(blastseq_tmp_dir,geneA)
163 gaobj=SeqIO.parse(gaseq,'fasta').next()
164 gbseq='%s/%s.fasta'%(blastseq_tmp_dir,geneB)
165 gbobj=SeqIO.parse(gbseq,'fasta').next()
166 ga_len,gb_len=str(len(gaobj.seq)),str(len(gbobj.seq))
167 print 'Comparing %s %s'%(geneA,geneB)
168 a=seqblast(gaseq,gbseq)
169 if a==None:
170 candidates[gp]=[ga_len,gb_len,'NA','NA','100','0']
171 else:
172 a[0:0]=[ga_len,gb_len]
173 candidates[gp]=a
174
175
176 infile=open(datafilename)
177 outfile=open(outfilename,'w')
178 header=['#Gene1','Gene2','Transcript1_Len','Transcript2_Len','Identity','Align_Len','Evalue','BitScore']
179 outfile.write('%s\n'%('\t'.join(header)))
180 for line in infile:
181 if line.strip():
182 info=line.split()
183 geneA,geneB=info[0],info[1]
184 key='%s_%s'%(geneA,geneB)
185 vv=candidates[key]
186 outfile.write('%s\t%s\t%s\n'%(geneA,geneB,'\t'.join(vv)))
187 outfile.close()
188 infile.close()
189
190 print 'Homolgy test done!'