comparison ProteinInteractions_v2.py @ 4:cd2c68e1b1ae draft

Uploaded
author bornea
date Thu, 19 Nov 2015 11:17:03 -0500
parents
children 6ff557cd705f
comparison
equal deleted inserted replaced
3:e7d3a8865e8a 4:cd2c68e1b1ae
1 ################################################################################
2 # This program will read in a SAINT 'list.txt' file and the interactions from
3 # the consensus path db database and return all the interactions that we saw in
4 # our experiment in a format suitable for cytoscape. This allows us to filter
5 # before getting PPIs so that it doesn't affect our SAINT score or include
6 # interactions that don't score well
7 ################################################################################
8 import urllib2
9 import itertools
10 import sys
11 ################################################################################
12 ## REQUIRED INPUT ##
13
14 # 1) listfile: SAINTexpress output
15 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1)
16 # 3) Int_conf: Confidence of PPI from CPDB to include
17 # - low: no filtering
18 # - medium: >0.5
19 # - high: >0.7
20 # - very high: >0.9
21 # 4) Species: Human, Yeast, or Mouse
22 ###############################################################################
23 listfile = sys.argv[1]
24 SAINT_cutoff = sys.argv[2]
25 Int_conf = sys.argv[3]
26 Species = sys.argv[4]
27 cyto_file = sys.argv[5]
28 db_path = sys.arv[6]
29 ###############################################################################
30 class ReturnValue1(object):
31 def __init__(self, uniprot_acc, gene, swissprot):
32 self.up = uniprot_acc
33 self.gn = gene
34 self.sp = swissprot
35 class ReturnValue2(object):
36 def __init__(self, getdata, getproteins, getheader):
37 self.data = getdata
38 self.proteins = getproteins
39 self.header = getheader
40 def main(listfile, SAINT_cutoff, Int_conf, Species):
41 cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
42 def readtab(infile):
43 with open(infile,'r') as x: # read in tab-delim text
44 output = []
45 for line in x:
46 line = line.strip()
47 temp = line.split('\t')
48 output.append(temp)
49 return output
50 def read_listfile(listfile): # Get data, proteins and header from scaffold output
51 dupes = readtab(listfile)
52 header = dupes[0]
53 prot_start = header.index("PreyGene")-1
54 data = dupes[1:] # cut off blank line and END OF FILE
55 proteins = []
56 for protein in data:
57 proteins.append(protein[prot_start])
58 return ReturnValue2(data, proteins, header)
59
60 def get_info(uniprot_accession_in): #get aa lengths and gene name
61 error = open('error proteins.txt', 'a+')
62 i=0
63 while i==0:
64 try:
65 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
66 break
67 except urllib2.HTTPError, err:
68 i = i + 1
69 if i == 50:
70 sys.exit("More than 50 errors. Check your file or try again later.")
71 if err.code == 404:
72 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
73 seqlength = 'NA'
74 genename = 'NA'
75 return ReturnValue1(seqlength, genename)
76 elif err.code == 302:
77 sys.exit("Request timed out. Check connection and try again.")
78 else:
79 sys.exit("Uniprot had some other error")
80 lines = data.readlines()
81 header = lines[0]
82 lst = header.split('|')
83 lst2 = lst[2].split(' ')
84 swissprot = lst2[0]
85 uniprot_acc = lst[1]
86 if lines == []:
87 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
88 error.close
89 uniprot_acc = 'NA'
90 genename = 'NA'
91 return ReturnValue1(uniprot_acc, genename, swissprot)
92 if lines != []:
93 seqlength = 0
94 header = lines[0]
95 if 'GN=' in header:
96 lst = header.split('GN=')
97 lst2 = lst[1].split(' ')
98 genename = lst2[0]
99 error.close
100 return ReturnValue1(uniprot_acc, genename, swissprot)
101 if 'GN=' not in header:
102 genename = 'NA'
103 error.close
104 return ReturnValue1(uniprot_acc, genename, swissprot)
105
106 def dd_network(listfile, SAINTscore, CPDB_filter): ## Filter by SS and CPDB
107 data = read_listfile(listfile).data # change to filtered list
108 SS = (read_listfile(listfile).header).index("SaintScore")
109 filt_data = []
110 for i in data:
111 if i[SS] >= SAINTscore:
112 filt_data.append(i)
113 accessions = []
114 for i in filt_data:
115 accessions.append(get_info(i[1]).sp)
116 GO=[]
117 for i in CPDB[2:]:
118 if i[3] >= CPDB_filter: # filter interaction confidence
119 GO.append(i[2]) # all known interactions
120 GO2 = []
121 for i in GO:
122 GO2.append(i.split(',')) # make interactions list friendly
123 unfiltered_network = {}
124 for i in accessions:
125 interactions = []
126 for j in GO2:
127 if i in j: # find the interactions
128 if j not in interactions:# dont add duplicate interactions
129 interactions.append(j)
130 merged = list(itertools.chain(*interactions)) # flatten list of lists
131 unfiltered_network[i]=merged # assign all possible interactions to protein in a dictionary
132 dd_network = {} #data dependent network
133 for i in unfiltered_network:
134 temp = []
135 for j in unfiltered_network[i]:
136 if j in accessions:
137 if j not in temp:
138 if j != i:
139 temp.append(j)
140 dd_network[i]=temp
141 return dd_network
142 def cytoscape(dd_network, listfile, SAINTscore):
143 with open('network.sif','wt') as y:
144 data = read_listfile(listfile).data
145 SS = (read_listfile(listfile).header).index("SaintScore")
146 filt_data = []
147 for i in data:
148 if i[SS] >= SAINTscore:
149 filt_data.append(i)
150 header = ["Prey", "Interactions"]
151 header = '\t'.join(header)
152 y.write(header + '\n')
153 for i in filt_data:
154 if dd_network[i[1]] != []:
155 lst = []
156 #x='\t'.join(i)
157 for j in dd_network[i[1]]:
158 lst.append(j)
159 for j in lst:
160 y.write(i[1]+'\t'+'pp'+'\t' + j+'\n')
161 if Species == "Human":
162 CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
163 if Species == "Yeast":
164 CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
165 if Species == "Mouse":
166 CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt')
167 if __name__ == '__main__':
168 main(listfile, SAINT_cutoff, Int_conf, Species)
169 #main("Crizo_list.txt", 0.7, 0.7, 'Human')
170 os.rename('network.sif', str(cyto_file))