annotate ProteinInteractions.py @ 10:c414a56f5874 draft default tip

Uploaded
author bornea
date Mon, 18 Apr 2016 10:41:54 -0400
parents dd0b4c901d0a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
1 ################################################################################
5935066604a7 Uploaded
bornea
parents:
diff changeset
2 # This program will read in a SAINT 'list.txt' file and the interactions from
5935066604a7 Uploaded
bornea
parents:
diff changeset
3 # the consensus path db database and return all the interactions that we saw in
5935066604a7 Uploaded
bornea
parents:
diff changeset
4 # our experiment in a format suitable for cytoscape. This allows us to filter
5935066604a7 Uploaded
bornea
parents:
diff changeset
5 # before getting PPIs so that it doesn't affect our SAINT score or include
5935066604a7 Uploaded
bornea
parents:
diff changeset
6 # interactions that don't score well
5935066604a7 Uploaded
bornea
parents:
diff changeset
7 ################################################################################
5935066604a7 Uploaded
bornea
parents:
diff changeset
8 # Copyright (C) Brent Kuenzi.
5935066604a7 Uploaded
bornea
parents:
diff changeset
9 # Permission is granted to copy, distribute and/or modify this document
5935066604a7 Uploaded
bornea
parents:
diff changeset
10 # under the terms of the GNU Free Documentation License, Version 1.3
5935066604a7 Uploaded
bornea
parents:
diff changeset
11 # or any later version published by the Free Software Foundation;
5935066604a7 Uploaded
bornea
parents:
diff changeset
12 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
5935066604a7 Uploaded
bornea
parents:
diff changeset
13 # A copy of the license is included in the section entitled "GNU
5935066604a7 Uploaded
bornea
parents:
diff changeset
14 # Free Documentation License".
5935066604a7 Uploaded
bornea
parents:
diff changeset
15 ################################################################################
5935066604a7 Uploaded
bornea
parents:
diff changeset
16 ## REQUIRED INPUT ##
5935066604a7 Uploaded
bornea
parents:
diff changeset
17
5935066604a7 Uploaded
bornea
parents:
diff changeset
18 # 1) listfile: SAINTexpress output
5935066604a7 Uploaded
bornea
parents:
diff changeset
19 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1)
5935066604a7 Uploaded
bornea
parents:
diff changeset
20 # 3) Int_conf: Confidence of PPI from CPDB to include
5935066604a7 Uploaded
bornea
parents:
diff changeset
21 # - low: no filtering
5935066604a7 Uploaded
bornea
parents:
diff changeset
22 # - medium: >0.5
5935066604a7 Uploaded
bornea
parents:
diff changeset
23 # - high: >0.7
5935066604a7 Uploaded
bornea
parents:
diff changeset
24 # - very high: >0.9
5935066604a7 Uploaded
bornea
parents:
diff changeset
25 # 4) Species: Human, Yeast, or Mouse
5935066604a7 Uploaded
bornea
parents:
diff changeset
26 ################################################################################
5935066604a7 Uploaded
bornea
parents:
diff changeset
27
5935066604a7 Uploaded
bornea
parents:
diff changeset
28
5935066604a7 Uploaded
bornea
parents:
diff changeset
29 import urllib2
5935066604a7 Uploaded
bornea
parents:
diff changeset
30 import itertools
5935066604a7 Uploaded
bornea
parents:
diff changeset
31 import sys
5935066604a7 Uploaded
bornea
parents:
diff changeset
32 import os
5935066604a7 Uploaded
bornea
parents:
diff changeset
33
5935066604a7 Uploaded
bornea
parents:
diff changeset
34
5935066604a7 Uploaded
bornea
parents:
diff changeset
35 listfile = sys.argv[1]
5935066604a7 Uploaded
bornea
parents:
diff changeset
36 SAINT_cutoff = sys.argv[2]
5935066604a7 Uploaded
bornea
parents:
diff changeset
37 Int_conf = sys.argv[3]
5935066604a7 Uploaded
bornea
parents:
diff changeset
38 Species = sys.argv[4]
5935066604a7 Uploaded
bornea
parents:
diff changeset
39 cyto_file = sys.argv[5]
5935066604a7 Uploaded
bornea
parents:
diff changeset
40 db_path = sys.argv[6]
5935066604a7 Uploaded
bornea
parents:
diff changeset
41
5935066604a7 Uploaded
bornea
parents:
diff changeset
42 class ReturnValue1(object):
5935066604a7 Uploaded
bornea
parents:
diff changeset
43 def __init__(self, uniprot_acc, gene, swissprot):
5935066604a7 Uploaded
bornea
parents:
diff changeset
44 self.up = uniprot_acc
5935066604a7 Uploaded
bornea
parents:
diff changeset
45 self.gn = gene
5935066604a7 Uploaded
bornea
parents:
diff changeset
46 self.sp = swissprot
5935066604a7 Uploaded
bornea
parents:
diff changeset
47 class ReturnValue2(object):
5935066604a7 Uploaded
bornea
parents:
diff changeset
48 def __init__(self, getdata, getproteins, getheader):
5935066604a7 Uploaded
bornea
parents:
diff changeset
49 self.data = getdata
5935066604a7 Uploaded
bornea
parents:
diff changeset
50 self.proteins = getproteins
5935066604a7 Uploaded
bornea
parents:
diff changeset
51 self.header = getheader
5935066604a7 Uploaded
bornea
parents:
diff changeset
52
5935066604a7 Uploaded
bornea
parents:
diff changeset
53
5935066604a7 Uploaded
bornea
parents:
diff changeset
54 def main(listfile, SAINT_cutoff, Int_conf, Species):
5935066604a7 Uploaded
bornea
parents:
diff changeset
55 cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
5935066604a7 Uploaded
bornea
parents:
diff changeset
56
5935066604a7 Uploaded
bornea
parents:
diff changeset
57
5935066604a7 Uploaded
bornea
parents:
diff changeset
58 def readtab(infile):
5935066604a7 Uploaded
bornea
parents:
diff changeset
59 with open(infile, 'r') as file_to_read:
5935066604a7 Uploaded
bornea
parents:
diff changeset
60 # Read in tab-delim text.
5935066604a7 Uploaded
bornea
parents:
diff changeset
61 output = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
62 for line in file_to_read:
5935066604a7 Uploaded
bornea
parents:
diff changeset
63 line = line.strip()
5935066604a7 Uploaded
bornea
parents:
diff changeset
64 temp = line.split('\t')
5935066604a7 Uploaded
bornea
parents:
diff changeset
65 output.append(temp)
5935066604a7 Uploaded
bornea
parents:
diff changeset
66 return output
5935066604a7 Uploaded
bornea
parents:
diff changeset
67
5935066604a7 Uploaded
bornea
parents:
diff changeset
68
5935066604a7 Uploaded
bornea
parents:
diff changeset
69 def read_listfile(listfile):
5935066604a7 Uploaded
bornea
parents:
diff changeset
70 # Get data, proteins and header from scaffold output
5935066604a7 Uploaded
bornea
parents:
diff changeset
71 dupes = readtab(listfile)
5935066604a7 Uploaded
bornea
parents:
diff changeset
72 header = dupes[0]
5935066604a7 Uploaded
bornea
parents:
diff changeset
73 prot_start = header.index("PreyGene")-1
5935066604a7 Uploaded
bornea
parents:
diff changeset
74 data = dupes[1:]
5935066604a7 Uploaded
bornea
parents:
diff changeset
75 # Cut off blank line and END OF FILE.
5935066604a7 Uploaded
bornea
parents:
diff changeset
76 proteins = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
77 for protein in data:
5935066604a7 Uploaded
bornea
parents:
diff changeset
78 proteins.append(protein[prot_start])
5935066604a7 Uploaded
bornea
parents:
diff changeset
79 return ReturnValue2(data, proteins, header)
5935066604a7 Uploaded
bornea
parents:
diff changeset
80
5935066604a7 Uploaded
bornea
parents:
diff changeset
81
5935066604a7 Uploaded
bornea
parents:
diff changeset
82 def get_info(uniprot_accession_in):
5935066604a7 Uploaded
bornea
parents:
diff changeset
83 # Get aa lengths and gene name.
9
dd0b4c901d0a Uploaded
bornea
parents: 8
diff changeset
84 print uniprot_accession_in
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
85 error = open('error proteins.txt', 'a+')
5935066604a7 Uploaded
bornea
parents:
diff changeset
86 i = 0
5935066604a7 Uploaded
bornea
parents:
diff changeset
87 while i == 0:
5935066604a7 Uploaded
bornea
parents:
diff changeset
88 try:
5935066604a7 Uploaded
bornea
parents:
diff changeset
89 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in
5935066604a7 Uploaded
bornea
parents:
diff changeset
90 + ".fasta")
5935066604a7 Uploaded
bornea
parents:
diff changeset
91 break
5935066604a7 Uploaded
bornea
parents:
diff changeset
92 except urllib2.HTTPError, err:
5935066604a7 Uploaded
bornea
parents:
diff changeset
93 i = i + 1
5935066604a7 Uploaded
bornea
parents:
diff changeset
94 if i == 50:
5935066604a7 Uploaded
bornea
parents:
diff changeset
95 sys.exit("More than 50 errors. Check your file or try again later.")
5935066604a7 Uploaded
bornea
parents:
diff changeset
96 if err.code == 404:
5935066604a7 Uploaded
bornea
parents:
diff changeset
97 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
5935066604a7 Uploaded
bornea
parents:
diff changeset
98 seqlength = 'NA'
5935066604a7 Uploaded
bornea
parents:
diff changeset
99 genename = 'NA'
8
355ad1ce474b Uploaded
bornea
parents: 5
diff changeset
100 return ReturnValue1(seqlength, genename, genename)
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
101 elif err.code == 302:
5935066604a7 Uploaded
bornea
parents:
diff changeset
102 sys.exit("Request timed out. Check connection and try again.")
5935066604a7 Uploaded
bornea
parents:
diff changeset
103 else:
5935066604a7 Uploaded
bornea
parents:
diff changeset
104 sys.exit("Uniprot had some other error")
5935066604a7 Uploaded
bornea
parents:
diff changeset
105 lines = data.readlines()
5935066604a7 Uploaded
bornea
parents:
diff changeset
106 header = lines[0]
5935066604a7 Uploaded
bornea
parents:
diff changeset
107 lst = header.split('|')
5935066604a7 Uploaded
bornea
parents:
diff changeset
108 lst2 = lst[2].split(' ')
5935066604a7 Uploaded
bornea
parents:
diff changeset
109 swissprot = lst2[0]
5935066604a7 Uploaded
bornea
parents:
diff changeset
110 uniprot_acc = lst[1]
5935066604a7 Uploaded
bornea
parents:
diff changeset
111 if lines == []:
5935066604a7 Uploaded
bornea
parents:
diff changeset
112 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
5935066604a7 Uploaded
bornea
parents:
diff changeset
113 error.close
5935066604a7 Uploaded
bornea
parents:
diff changeset
114 uniprot_acc = 'NA'
5935066604a7 Uploaded
bornea
parents:
diff changeset
115 genename = 'NA'
5935066604a7 Uploaded
bornea
parents:
diff changeset
116 return ReturnValue1(uniprot_acc, genename, swissprot)
5935066604a7 Uploaded
bornea
parents:
diff changeset
117 if lines != []:
5935066604a7 Uploaded
bornea
parents:
diff changeset
118 seqlength = 0
5935066604a7 Uploaded
bornea
parents:
diff changeset
119 header = lines[0]
5935066604a7 Uploaded
bornea
parents:
diff changeset
120 if 'GN=' in header:
5935066604a7 Uploaded
bornea
parents:
diff changeset
121 lst = header.split('GN=')
5935066604a7 Uploaded
bornea
parents:
diff changeset
122 lst2 = lst[1].split(' ')
5935066604a7 Uploaded
bornea
parents:
diff changeset
123 genename = lst2[0]
5935066604a7 Uploaded
bornea
parents:
diff changeset
124 error.close
5935066604a7 Uploaded
bornea
parents:
diff changeset
125 return ReturnValue1(uniprot_acc, genename, swissprot)
5935066604a7 Uploaded
bornea
parents:
diff changeset
126 if 'GN=' not in header:
5935066604a7 Uploaded
bornea
parents:
diff changeset
127 genename = 'NA'
5935066604a7 Uploaded
bornea
parents:
diff changeset
128 error.close
5935066604a7 Uploaded
bornea
parents:
diff changeset
129 return ReturnValue1(uniprot_acc, genename, swissprot)
5935066604a7 Uploaded
bornea
parents:
diff changeset
130
5935066604a7 Uploaded
bornea
parents:
diff changeset
131
5935066604a7 Uploaded
bornea
parents:
diff changeset
132 def dd_network(listfile, SAINTscore, CPDB_filter):
5935066604a7 Uploaded
bornea
parents:
diff changeset
133 # Filter by SS and CPDB.
5935066604a7 Uploaded
bornea
parents:
diff changeset
134 data = read_listfile(listfile).data
5935066604a7 Uploaded
bornea
parents:
diff changeset
135 # Change to filtered list.
5935066604a7 Uploaded
bornea
parents:
diff changeset
136 SS = (read_listfile(listfile).header).index("SaintScore")
9
dd0b4c901d0a Uploaded
bornea
parents: 8
diff changeset
137 uni_ids = (read_listfile(listfile).header).index("PreyGene") - 1
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
138 filt_data = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
139 for i in data:
5935066604a7 Uploaded
bornea
parents:
diff changeset
140 if i[SS] >= SAINTscore:
5935066604a7 Uploaded
bornea
parents:
diff changeset
141 filt_data.append(i)
5935066604a7 Uploaded
bornea
parents:
diff changeset
142 accessions = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
143 for i in filt_data:
9
dd0b4c901d0a Uploaded
bornea
parents: 8
diff changeset
144 accessions.append(get_info(i[uni_ids]).sp)
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
145 GO = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
146 for i in CPDB[2:]:
5935066604a7 Uploaded
bornea
parents:
diff changeset
147 if i[3] >= CPDB_filter:
5935066604a7 Uploaded
bornea
parents:
diff changeset
148 # Filter interaction confidence.
5935066604a7 Uploaded
bornea
parents:
diff changeset
149 GO.append(i[2])
5935066604a7 Uploaded
bornea
parents:
diff changeset
150 # All known interactions.
5935066604a7 Uploaded
bornea
parents:
diff changeset
151 GO2 = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
152 for i in GO:
5935066604a7 Uploaded
bornea
parents:
diff changeset
153 GO2.append(i.split(','))
5935066604a7 Uploaded
bornea
parents:
diff changeset
154 # Make interactions list friendly.
5935066604a7 Uploaded
bornea
parents:
diff changeset
155 unfiltered_network = {}
5935066604a7 Uploaded
bornea
parents:
diff changeset
156 for i in accessions:
5935066604a7 Uploaded
bornea
parents:
diff changeset
157 interactions = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
158 for j in GO2:
5935066604a7 Uploaded
bornea
parents:
diff changeset
159 if i in j:
5935066604a7 Uploaded
bornea
parents:
diff changeset
160 # Find the interactions.
5935066604a7 Uploaded
bornea
parents:
diff changeset
161 if j not in interactions:
5935066604a7 Uploaded
bornea
parents:
diff changeset
162 # Dont add duplicate interactions.
5935066604a7 Uploaded
bornea
parents:
diff changeset
163 interactions.append(j)
5935066604a7 Uploaded
bornea
parents:
diff changeset
164 merged = list(itertools.chain(*interactions))
5935066604a7 Uploaded
bornea
parents:
diff changeset
165 # Flatten list of lists.
5935066604a7 Uploaded
bornea
parents:
diff changeset
166 unfiltered_network[i] = merged
5935066604a7 Uploaded
bornea
parents:
diff changeset
167 # Assign all possible interactions to protein in a dictionary.
5935066604a7 Uploaded
bornea
parents:
diff changeset
168 dd_network = {}
5935066604a7 Uploaded
bornea
parents:
diff changeset
169 # Data dependent network.
5935066604a7 Uploaded
bornea
parents:
diff changeset
170 for i in unfiltered_network:
5935066604a7 Uploaded
bornea
parents:
diff changeset
171 temp = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
172 for j in unfiltered_network[i]:
5935066604a7 Uploaded
bornea
parents:
diff changeset
173 if j in accessions:
5935066604a7 Uploaded
bornea
parents:
diff changeset
174 if j not in temp:
5935066604a7 Uploaded
bornea
parents:
diff changeset
175 if j != i:
5935066604a7 Uploaded
bornea
parents:
diff changeset
176 temp.append(j)
5935066604a7 Uploaded
bornea
parents:
diff changeset
177 dd_network[i] = temp
5935066604a7 Uploaded
bornea
parents:
diff changeset
178 return dd_network
5935066604a7 Uploaded
bornea
parents:
diff changeset
179
5935066604a7 Uploaded
bornea
parents:
diff changeset
180
5935066604a7 Uploaded
bornea
parents:
diff changeset
181 def cytoscape(dd_network, listfile, SAINTscore):
5935066604a7 Uploaded
bornea
parents:
diff changeset
182 with open('network.sif', 'wt') as y:
5935066604a7 Uploaded
bornea
parents:
diff changeset
183 data = read_listfile(listfile).data
5935066604a7 Uploaded
bornea
parents:
diff changeset
184 SS = (read_listfile(listfile).header).index("SaintScore")
9
dd0b4c901d0a Uploaded
bornea
parents: 8
diff changeset
185 uni_ids = (read_listfile(listfile).header).index("PreyGene") - 1
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
186 filt_data = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
187 for i in data:
5935066604a7 Uploaded
bornea
parents:
diff changeset
188 if i[SS] >= SAINTscore:
9
dd0b4c901d0a Uploaded
bornea
parents: 8
diff changeset
189 filt_data.append(get_info(i[uni_ids]).sp)
5
5935066604a7 Uploaded
bornea
parents:
diff changeset
190 for i in filt_data:
5935066604a7 Uploaded
bornea
parents:
diff changeset
191 if dd_network[i] != []:
5935066604a7 Uploaded
bornea
parents:
diff changeset
192 lst = []
5935066604a7 Uploaded
bornea
parents:
diff changeset
193 for j in dd_network[i]:
5935066604a7 Uploaded
bornea
parents:
diff changeset
194 lst.append(j)
5935066604a7 Uploaded
bornea
parents:
diff changeset
195 for j in lst:
5935066604a7 Uploaded
bornea
parents:
diff changeset
196 y.write(i+'\t'+'pp'+'\t' + j+'\n')
5935066604a7 Uploaded
bornea
parents:
diff changeset
197
5935066604a7 Uploaded
bornea
parents:
diff changeset
198
5935066604a7 Uploaded
bornea
parents:
diff changeset
199 if Species == "Human":
5935066604a7 Uploaded
bornea
parents:
diff changeset
200 CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
5935066604a7 Uploaded
bornea
parents:
diff changeset
201 if Species == "Yeast":
5935066604a7 Uploaded
bornea
parents:
diff changeset
202 CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
5935066604a7 Uploaded
bornea
parents:
diff changeset
203 if Species == "Mouse":
5935066604a7 Uploaded
bornea
parents:
diff changeset
204 CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt')
5935066604a7 Uploaded
bornea
parents:
diff changeset
205 if __name__ == '__main__':
5935066604a7 Uploaded
bornea
parents:
diff changeset
206 main(listfile, SAINT_cutoff, Int_conf, Species)
5935066604a7 Uploaded
bornea
parents:
diff changeset
207 os.rename('network.sif', str(cyto_file))