5
|
1 ################################################################################
|
|
2 # This program will read in a SAINT 'list.txt' file and the interactions from
|
|
3 # the consensus path db database and return all the interactions that we saw in
|
|
4 # our experiment in a format suitable for cytoscape. This allows us to filter
|
|
5 # before getting PPIs so that it doesn't affect our SAINT score or include
|
|
6 # interactions that don't score well
|
|
7 ################################################################################
|
|
8 # Copyright (C) Brent Kuenzi.
|
|
9 # Permission is granted to copy, distribute and/or modify this document
|
|
10 # under the terms of the GNU Free Documentation License, Version 1.3
|
|
11 # or any later version published by the Free Software Foundation;
|
|
12 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
|
|
13 # A copy of the license is included in the section entitled "GNU
|
|
14 # Free Documentation License".
|
|
15 ################################################################################
|
|
16 ## REQUIRED INPUT ##
|
|
17
|
|
18 # 1) listfile: SAINTexpress output
|
|
19 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1)
|
|
20 # 3) Int_conf: Confidence of PPI from CPDB to include
|
|
21 # - low: no filtering
|
|
22 # - medium: >0.5
|
|
23 # - high: >0.7
|
|
24 # - very high: >0.9
|
|
25 # 4) Species: Human, Yeast, or Mouse
|
|
26 ################################################################################
|
|
27
|
|
28
|
|
29 import urllib2
|
|
30 import itertools
|
|
31 import sys
|
|
32 import os
|
|
33
|
|
34
|
|
35 listfile = sys.argv[1]
|
|
36 SAINT_cutoff = sys.argv[2]
|
|
37 Int_conf = sys.argv[3]
|
|
38 Species = sys.argv[4]
|
|
39 cyto_file = sys.argv[5]
|
|
40 db_path = sys.argv[6]
|
|
41
|
|
42 class ReturnValue1(object):
|
|
43 def __init__(self, uniprot_acc, gene, swissprot):
|
|
44 self.up = uniprot_acc
|
|
45 self.gn = gene
|
|
46 self.sp = swissprot
|
|
47 class ReturnValue2(object):
|
|
48 def __init__(self, getdata, getproteins, getheader):
|
|
49 self.data = getdata
|
|
50 self.proteins = getproteins
|
|
51 self.header = getheader
|
|
52
|
|
53
|
|
54 def main(listfile, SAINT_cutoff, Int_conf, Species):
|
|
55 cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
|
|
56
|
|
57
|
|
58 def readtab(infile):
|
|
59 with open(infile, 'r') as file_to_read:
|
|
60 # Read in tab-delim text.
|
|
61 output = []
|
|
62 for line in file_to_read:
|
|
63 line = line.strip()
|
|
64 temp = line.split('\t')
|
|
65 output.append(temp)
|
|
66 return output
|
|
67
|
|
68
|
|
69 def read_listfile(listfile):
|
|
70 # Get data, proteins and header from scaffold output
|
|
71 dupes = readtab(listfile)
|
|
72 header = dupes[0]
|
|
73 prot_start = header.index("PreyGene")-1
|
|
74 data = dupes[1:]
|
|
75 # Cut off blank line and END OF FILE.
|
|
76 proteins = []
|
|
77 for protein in data:
|
|
78 proteins.append(protein[prot_start])
|
|
79 return ReturnValue2(data, proteins, header)
|
|
80
|
|
81
|
|
82 def get_info(uniprot_accession_in):
|
|
83 # Get aa lengths and gene name.
|
9
|
84 print uniprot_accession_in
|
5
|
85 error = open('error proteins.txt', 'a+')
|
|
86 i = 0
|
|
87 while i == 0:
|
|
88 try:
|
|
89 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in
|
|
90 + ".fasta")
|
|
91 break
|
|
92 except urllib2.HTTPError, err:
|
|
93 i = i + 1
|
|
94 if i == 50:
|
|
95 sys.exit("More than 50 errors. Check your file or try again later.")
|
|
96 if err.code == 404:
|
|
97 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
|
|
98 seqlength = 'NA'
|
|
99 genename = 'NA'
|
8
|
100 return ReturnValue1(seqlength, genename, genename)
|
5
|
101 elif err.code == 302:
|
|
102 sys.exit("Request timed out. Check connection and try again.")
|
|
103 else:
|
|
104 sys.exit("Uniprot had some other error")
|
|
105 lines = data.readlines()
|
|
106 header = lines[0]
|
|
107 lst = header.split('|')
|
|
108 lst2 = lst[2].split(' ')
|
|
109 swissprot = lst2[0]
|
|
110 uniprot_acc = lst[1]
|
|
111 if lines == []:
|
|
112 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
|
|
113 error.close
|
|
114 uniprot_acc = 'NA'
|
|
115 genename = 'NA'
|
|
116 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
117 if lines != []:
|
|
118 seqlength = 0
|
|
119 header = lines[0]
|
|
120 if 'GN=' in header:
|
|
121 lst = header.split('GN=')
|
|
122 lst2 = lst[1].split(' ')
|
|
123 genename = lst2[0]
|
|
124 error.close
|
|
125 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
126 if 'GN=' not in header:
|
|
127 genename = 'NA'
|
|
128 error.close
|
|
129 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
130
|
|
131
|
|
132 def dd_network(listfile, SAINTscore, CPDB_filter):
|
|
133 # Filter by SS and CPDB.
|
|
134 data = read_listfile(listfile).data
|
|
135 # Change to filtered list.
|
|
136 SS = (read_listfile(listfile).header).index("SaintScore")
|
9
|
137 uni_ids = (read_listfile(listfile).header).index("PreyGene") - 1
|
5
|
138 filt_data = []
|
|
139 for i in data:
|
|
140 if i[SS] >= SAINTscore:
|
|
141 filt_data.append(i)
|
|
142 accessions = []
|
|
143 for i in filt_data:
|
9
|
144 accessions.append(get_info(i[uni_ids]).sp)
|
5
|
145 GO = []
|
|
146 for i in CPDB[2:]:
|
|
147 if i[3] >= CPDB_filter:
|
|
148 # Filter interaction confidence.
|
|
149 GO.append(i[2])
|
|
150 # All known interactions.
|
|
151 GO2 = []
|
|
152 for i in GO:
|
|
153 GO2.append(i.split(','))
|
|
154 # Make interactions list friendly.
|
|
155 unfiltered_network = {}
|
|
156 for i in accessions:
|
|
157 interactions = []
|
|
158 for j in GO2:
|
|
159 if i in j:
|
|
160 # Find the interactions.
|
|
161 if j not in interactions:
|
|
162 # Dont add duplicate interactions.
|
|
163 interactions.append(j)
|
|
164 merged = list(itertools.chain(*interactions))
|
|
165 # Flatten list of lists.
|
|
166 unfiltered_network[i] = merged
|
|
167 # Assign all possible interactions to protein in a dictionary.
|
|
168 dd_network = {}
|
|
169 # Data dependent network.
|
|
170 for i in unfiltered_network:
|
|
171 temp = []
|
|
172 for j in unfiltered_network[i]:
|
|
173 if j in accessions:
|
|
174 if j not in temp:
|
|
175 if j != i:
|
|
176 temp.append(j)
|
|
177 dd_network[i] = temp
|
|
178 return dd_network
|
|
179
|
|
180
|
|
181 def cytoscape(dd_network, listfile, SAINTscore):
|
|
182 with open('network.sif', 'wt') as y:
|
|
183 data = read_listfile(listfile).data
|
|
184 SS = (read_listfile(listfile).header).index("SaintScore")
|
9
|
185 uni_ids = (read_listfile(listfile).header).index("PreyGene") - 1
|
5
|
186 filt_data = []
|
|
187 for i in data:
|
|
188 if i[SS] >= SAINTscore:
|
9
|
189 filt_data.append(get_info(i[uni_ids]).sp)
|
5
|
190 for i in filt_data:
|
|
191 if dd_network[i] != []:
|
|
192 lst = []
|
|
193 for j in dd_network[i]:
|
|
194 lst.append(j)
|
|
195 for j in lst:
|
|
196 y.write(i+'\t'+'pp'+'\t' + j+'\n')
|
|
197
|
|
198
|
|
199 if Species == "Human":
|
|
200 CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
|
|
201 if Species == "Yeast":
|
|
202 CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
|
|
203 if Species == "Mouse":
|
|
204 CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt')
|
|
205 if __name__ == '__main__':
|
|
206 main(listfile, SAINT_cutoff, Int_conf, Species)
|
|
207 os.rename('network.sif', str(cyto_file))
|