Mercurial > repos > bornea > saint_interactions
comparison ProteinInteractions_v2.py @ 4:cd2c68e1b1ae draft
Uploaded
author | bornea |
---|---|
date | Thu, 19 Nov 2015 11:17:03 -0500 |
parents | |
children | 6ff557cd705f |
comparison
equal
deleted
inserted
replaced
3:e7d3a8865e8a | 4:cd2c68e1b1ae |
---|---|
1 ################################################################################ | |
2 # This program will read in a SAINT 'list.txt' file and the interactions from | |
3 # the consensus path db database and return all the interactions that we saw in | |
4 # our experiment in a format suitable for cytoscape. This allows us to filter | |
5 # before getting PPIs so that it doesn't affect our SAINT score or include | |
6 # interactions that don't score well | |
7 ################################################################################ | |
8 import urllib2 | |
9 import itertools | |
10 import sys | |
11 ################################################################################ | |
12 ## REQUIRED INPUT ## | |
13 | |
14 # 1) listfile: SAINTexpress output | |
15 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1) | |
16 # 3) Int_conf: Confidence of PPI from CPDB to include | |
17 # - low: no filtering | |
18 # - medium: >0.5 | |
19 # - high: >0.7 | |
20 # - very high: >0.9 | |
21 # 4) Species: Human, Yeast, or Mouse | |
22 ############################################################################### | |
23 listfile = sys.argv[1] | |
24 SAINT_cutoff = sys.argv[2] | |
25 Int_conf = sys.argv[3] | |
26 Species = sys.argv[4] | |
27 cyto_file = sys.argv[5] | |
28 db_path = sys.arv[6] | |
29 ############################################################################### | |
30 class ReturnValue1(object): | |
31 def __init__(self, uniprot_acc, gene, swissprot): | |
32 self.up = uniprot_acc | |
33 self.gn = gene | |
34 self.sp = swissprot | |
35 class ReturnValue2(object): | |
36 def __init__(self, getdata, getproteins, getheader): | |
37 self.data = getdata | |
38 self.proteins = getproteins | |
39 self.header = getheader | |
40 def main(listfile, SAINT_cutoff, Int_conf, Species): | |
41 cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff) | |
42 def readtab(infile): | |
43 with open(infile,'r') as x: # read in tab-delim text | |
44 output = [] | |
45 for line in x: | |
46 line = line.strip() | |
47 temp = line.split('\t') | |
48 output.append(temp) | |
49 return output | |
50 def read_listfile(listfile): # Get data, proteins and header from scaffold output | |
51 dupes = readtab(listfile) | |
52 header = dupes[0] | |
53 prot_start = header.index("PreyGene")-1 | |
54 data = dupes[1:] # cut off blank line and END OF FILE | |
55 proteins = [] | |
56 for protein in data: | |
57 proteins.append(protein[prot_start]) | |
58 return ReturnValue2(data, proteins, header) | |
59 | |
60 def get_info(uniprot_accession_in): #get aa lengths and gene name | |
61 error = open('error proteins.txt', 'a+') | |
62 i=0 | |
63 while i==0: | |
64 try: | |
65 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") | |
66 break | |
67 except urllib2.HTTPError, err: | |
68 i = i + 1 | |
69 if i == 50: | |
70 sys.exit("More than 50 errors. Check your file or try again later.") | |
71 if err.code == 404: | |
72 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') | |
73 seqlength = 'NA' | |
74 genename = 'NA' | |
75 return ReturnValue1(seqlength, genename) | |
76 elif err.code == 302: | |
77 sys.exit("Request timed out. Check connection and try again.") | |
78 else: | |
79 sys.exit("Uniprot had some other error") | |
80 lines = data.readlines() | |
81 header = lines[0] | |
82 lst = header.split('|') | |
83 lst2 = lst[2].split(' ') | |
84 swissprot = lst2[0] | |
85 uniprot_acc = lst[1] | |
86 if lines == []: | |
87 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | |
88 error.close | |
89 uniprot_acc = 'NA' | |
90 genename = 'NA' | |
91 return ReturnValue1(uniprot_acc, genename, swissprot) | |
92 if lines != []: | |
93 seqlength = 0 | |
94 header = lines[0] | |
95 if 'GN=' in header: | |
96 lst = header.split('GN=') | |
97 lst2 = lst[1].split(' ') | |
98 genename = lst2[0] | |
99 error.close | |
100 return ReturnValue1(uniprot_acc, genename, swissprot) | |
101 if 'GN=' not in header: | |
102 genename = 'NA' | |
103 error.close | |
104 return ReturnValue1(uniprot_acc, genename, swissprot) | |
105 | |
106 def dd_network(listfile, SAINTscore, CPDB_filter): ## Filter by SS and CPDB | |
107 data = read_listfile(listfile).data # change to filtered list | |
108 SS = (read_listfile(listfile).header).index("SaintScore") | |
109 filt_data = [] | |
110 for i in data: | |
111 if i[SS] >= SAINTscore: | |
112 filt_data.append(i) | |
113 accessions = [] | |
114 for i in filt_data: | |
115 accessions.append(get_info(i[1]).sp) | |
116 GO=[] | |
117 for i in CPDB[2:]: | |
118 if i[3] >= CPDB_filter: # filter interaction confidence | |
119 GO.append(i[2]) # all known interactions | |
120 GO2 = [] | |
121 for i in GO: | |
122 GO2.append(i.split(',')) # make interactions list friendly | |
123 unfiltered_network = {} | |
124 for i in accessions: | |
125 interactions = [] | |
126 for j in GO2: | |
127 if i in j: # find the interactions | |
128 if j not in interactions:# dont add duplicate interactions | |
129 interactions.append(j) | |
130 merged = list(itertools.chain(*interactions)) # flatten list of lists | |
131 unfiltered_network[i]=merged # assign all possible interactions to protein in a dictionary | |
132 dd_network = {} #data dependent network | |
133 for i in unfiltered_network: | |
134 temp = [] | |
135 for j in unfiltered_network[i]: | |
136 if j in accessions: | |
137 if j not in temp: | |
138 if j != i: | |
139 temp.append(j) | |
140 dd_network[i]=temp | |
141 return dd_network | |
142 def cytoscape(dd_network, listfile, SAINTscore): | |
143 with open('network.sif','wt') as y: | |
144 data = read_listfile(listfile).data | |
145 SS = (read_listfile(listfile).header).index("SaintScore") | |
146 filt_data = [] | |
147 for i in data: | |
148 if i[SS] >= SAINTscore: | |
149 filt_data.append(i) | |
150 header = ["Prey", "Interactions"] | |
151 header = '\t'.join(header) | |
152 y.write(header + '\n') | |
153 for i in filt_data: | |
154 if dd_network[i[1]] != []: | |
155 lst = [] | |
156 #x='\t'.join(i) | |
157 for j in dd_network[i[1]]: | |
158 lst.append(j) | |
159 for j in lst: | |
160 y.write(i[1]+'\t'+'pp'+'\t' + j+'\n') | |
161 if Species == "Human": | |
162 CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt') | |
163 if Species == "Yeast": | |
164 CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt') | |
165 if Species == "Mouse": | |
166 CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt') | |
167 if __name__ == '__main__': | |
168 main(listfile, SAINT_cutoff, Int_conf, Species) | |
169 #main("Crizo_list.txt", 0.7, 0.7, 'Human') | |
170 os.rename('network.sif', str(cyto_file)) |