comparison ProteinInteractions.py @ 5:5935066604a7 draft

Uploaded
author bornea
date Tue, 15 Mar 2016 15:36:27 -0400
parents
children 355ad1ce474b
comparison
equal deleted inserted replaced
4:1c89c5b8fadf 5:5935066604a7
1 ################################################################################
2 # This program will read in a SAINT 'list.txt' file and the interactions from
3 # the consensus path db database and return all the interactions that we saw in
4 # our experiment in a format suitable for cytoscape. This allows us to filter
5 # before getting PPIs so that it doesn't affect our SAINT score or include
6 # interactions that don't score well
7 ################################################################################
8 # Copyright (C) Brent Kuenzi.
9 # Permission is granted to copy, distribute and/or modify this document
10 # under the terms of the GNU Free Documentation License, Version 1.3
11 # or any later version published by the Free Software Foundation;
12 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
13 # A copy of the license is included in the section entitled "GNU
14 # Free Documentation License".
15 ################################################################################
16 ## REQUIRED INPUT ##
17
18 # 1) listfile: SAINTexpress output
19 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1)
20 # 3) Int_conf: Confidence of PPI from CPDB to include
21 # - low: no filtering
22 # - medium: >0.5
23 # - high: >0.7
24 # - very high: >0.9
25 # 4) Species: Human, Yeast, or Mouse
26 ################################################################################
27
28
29 import urllib2
30 import itertools
31 import sys
32 import os
33
34
35 listfile = sys.argv[1]
36 SAINT_cutoff = sys.argv[2]
37 Int_conf = sys.argv[3]
38 Species = sys.argv[4]
39 cyto_file = sys.argv[5]
40 db_path = sys.argv[6]
41
42 class ReturnValue1(object):
43 def __init__(self, uniprot_acc, gene, swissprot):
44 self.up = uniprot_acc
45 self.gn = gene
46 self.sp = swissprot
47 class ReturnValue2(object):
48 def __init__(self, getdata, getproteins, getheader):
49 self.data = getdata
50 self.proteins = getproteins
51 self.header = getheader
52
53
54 def main(listfile, SAINT_cutoff, Int_conf, Species):
55 cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
56
57
58 def readtab(infile):
59 with open(infile, 'r') as file_to_read:
60 # Read in tab-delim text.
61 output = []
62 for line in file_to_read:
63 line = line.strip()
64 temp = line.split('\t')
65 output.append(temp)
66 return output
67
68
69 def read_listfile(listfile):
70 # Get data, proteins and header from scaffold output
71 dupes = readtab(listfile)
72 header = dupes[0]
73 prot_start = header.index("PreyGene")-1
74 data = dupes[1:]
75 # Cut off blank line and END OF FILE.
76 proteins = []
77 for protein in data:
78 proteins.append(protein[prot_start])
79 return ReturnValue2(data, proteins, header)
80
81
82 def get_info(uniprot_accession_in):
83 # Get aa lengths and gene name.
84 error = open('error proteins.txt', 'a+')
85 i = 0
86 while i == 0:
87 try:
88 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in
89 + ".fasta")
90 break
91 except urllib2.HTTPError, err:
92 i = i + 1
93 if i == 50:
94 sys.exit("More than 50 errors. Check your file or try again later.")
95 if err.code == 404:
96 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
97 seqlength = 'NA'
98 genename = 'NA'
99 return ReturnValue1(seqlength, genename)
100 elif err.code == 302:
101 sys.exit("Request timed out. Check connection and try again.")
102 else:
103 sys.exit("Uniprot had some other error")
104 lines = data.readlines()
105 header = lines[0]
106 lst = header.split('|')
107 lst2 = lst[2].split(' ')
108 swissprot = lst2[0]
109 uniprot_acc = lst[1]
110 if lines == []:
111 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
112 error.close
113 uniprot_acc = 'NA'
114 genename = 'NA'
115 return ReturnValue1(uniprot_acc, genename, swissprot)
116 if lines != []:
117 seqlength = 0
118 header = lines[0]
119 if 'GN=' in header:
120 lst = header.split('GN=')
121 lst2 = lst[1].split(' ')
122 genename = lst2[0]
123 error.close
124 return ReturnValue1(uniprot_acc, genename, swissprot)
125 if 'GN=' not in header:
126 genename = 'NA'
127 error.close
128 return ReturnValue1(uniprot_acc, genename, swissprot)
129
130
131 def dd_network(listfile, SAINTscore, CPDB_filter):
132 # Filter by SS and CPDB.
133 data = read_listfile(listfile).data
134 # Change to filtered list.
135 SS = (read_listfile(listfile).header).index("SaintScore")
136 filt_data = []
137 for i in data:
138 if i[SS] >= SAINTscore:
139 filt_data.append(i)
140 accessions = []
141 for i in filt_data:
142 accessions.append(get_info(i[1]).sp)
143 GO = []
144 for i in CPDB[2:]:
145 if i[3] >= CPDB_filter:
146 # Filter interaction confidence.
147 GO.append(i[2])
148 # All known interactions.
149 GO2 = []
150 for i in GO:
151 GO2.append(i.split(','))
152 # Make interactions list friendly.
153 unfiltered_network = {}
154 for i in accessions:
155 interactions = []
156 for j in GO2:
157 if i in j:
158 # Find the interactions.
159 if j not in interactions:
160 # Dont add duplicate interactions.
161 interactions.append(j)
162 merged = list(itertools.chain(*interactions))
163 # Flatten list of lists.
164 unfiltered_network[i] = merged
165 # Assign all possible interactions to protein in a dictionary.
166 dd_network = {}
167 # Data dependent network.
168 for i in unfiltered_network:
169 temp = []
170 for j in unfiltered_network[i]:
171 if j in accessions:
172 if j not in temp:
173 if j != i:
174 temp.append(j)
175 dd_network[i] = temp
176 return dd_network
177
178
179 def cytoscape(dd_network, listfile, SAINTscore):
180 with open('network.sif', 'wt') as y:
181 data = read_listfile(listfile).data
182 SS = (read_listfile(listfile).header).index("SaintScore")
183 filt_data = []
184 for i in data:
185 if i[SS] >= SAINTscore:
186 filt_data.append(get_info(i[1]).sp)
187 for i in filt_data:
188 if dd_network[i] != []:
189 lst = []
190 for j in dd_network[i]:
191 lst.append(j)
192 for j in lst:
193 y.write(i+'\t'+'pp'+'\t' + j+'\n')
194
195
196 if Species == "Human":
197 CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
198 if Species == "Yeast":
199 CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
200 if Species == "Mouse":
201 CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt')
202 if __name__ == '__main__':
203 main(listfile, SAINT_cutoff, Int_conf, Species)
204 os.rename('network.sif', str(cyto_file))