0
|
1 # -*- coding: utf-8 -*-
|
|
2 """
|
|
3 Created on Thu Apr 14 16:58:05 2016
|
|
4
|
|
5 @author: brentkuenzi
|
|
6 """
|
|
7 ################################################################################
|
17
|
8 ## Dependencies ##
|
|
9 import urllib2
|
|
10 import sys
|
|
11 import numpy
|
|
12 import os
|
|
13 ################################################################################
|
0
|
14 # This program will read in a SAINT formatted 'prey.txt' file or a file
|
|
15 # containing a single column list of uniprot accessions (e.g. "P00533" or
|
|
16 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying
|
|
17 # the prevalence of each protein in the CRAPome.
|
|
18 ################################################################################
|
|
19 # Copyright (C) Brent Kuenzi.
|
|
20 # Permission is granted to copy, distribute and/or modify this document
|
|
21 # under the terms of the GNU Free Documentation License, Version 1.3
|
|
22 # or any later version published by the Free Software Foundation;
|
|
23 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
|
|
24 # A copy of the license is included in the section entitled "GNU
|
|
25 # Free Documentation License".
|
|
26 ################################################################################
|
|
27 ## REQUIRED INPUT ##
|
|
28 # 1) crappyData: Prey.txt or single column list of Uniprot accessions
|
|
29 crappyData = sys.argv[1] # Prey file or File with single column of accessions
|
|
30 # 2) Species: HUMAN or YEAST
|
|
31 species = sys.argv[2] # HUMAN or YEAST
|
17
|
32 fasta_db = sys.argv[3]
|
|
33 db_path = sys.argv[5]
|
0
|
34 ################################################################################
|
|
35 ## Global Variables ##
|
|
36 if species == "HUMAN":
|
17
|
37 database = str(db_path) + "Human_CRAPome_v1-1.txt"
|
0
|
38 if species == "YEAST":
|
17
|
39 database = str(db_path) + "Yeast_CRAPome_v1-1.txt"
|
0
|
40 ################################################################################
|
|
41 ## CRAPomeQuery ##
|
|
42 class ReturnValue1(object):
|
|
43 def __init__(self, uniprot_acc, gene, swissprot):
|
|
44 self.up = uniprot_acc
|
|
45 self.gn = gene
|
|
46 self.sp = swissprot
|
|
47 def get_info(uniprot_accession_in): #get aa lengths and gene name
|
|
48 error = open('error proteins.txt', 'a+')
|
|
49 i=0
|
17
|
50 if fasta_db == "None":
|
|
51 while i==0:
|
|
52 try:
|
|
53 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
|
|
54 break
|
|
55 except urllib2.HTTPError, err:
|
|
56 i = i + 1
|
|
57 if i == 50:
|
|
58 sys.exit("More than 50 errors. Check your file or try again later.")
|
|
59 if err.code == 404:
|
|
60 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
|
|
61 seqlength = 'NA'
|
|
62 genename = 'NA'
|
|
63 return ReturnValue1(seqlength, genename)
|
|
64 elif err.code == 302:
|
|
65 sys.exit("Request timed out. Check connection and try again.")
|
|
66 else:
|
|
67 sys.exit("Uniprot had some other error")
|
|
68 lines = data.readlines()
|
|
69 header = lines[0]
|
|
70 lst = header.split('|')
|
|
71 lst2 = lst[2].split(' ')
|
|
72 swissprot = lst2[0]
|
|
73 uniprot_acc = lst[1]
|
|
74 if lines == []:
|
|
75 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
|
|
76 error.close
|
|
77 uniprot_acc = 'NA'
|
|
78 genename = 'NA'
|
|
79 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
80 if lines != []:
|
|
81 seqlength = 0
|
|
82 header = lines[0]
|
|
83 if 'GN=' in header:
|
|
84 lst = header.split('GN=')
|
|
85 lst2 = lst[1].split(' ')
|
|
86 genename = lst2[0]
|
|
87 error.close
|
|
88 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
89 if 'GN=' not in header:
|
0
|
90 genename = 'NA'
|
17
|
91 error.close
|
|
92 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
93 else:
|
|
94 data = open(fasta_db, 'r')
|
|
95 data_lines = data.readlines()
|
|
96 db_len = len(data_lines)
|
|
97 for fasta_line in data_lines:
|
|
98 if uniprot_accession_in in fasta_line:
|
|
99 sp, uniprot_acc, name_info = fasta_line.split("|")
|
|
100 swissprot = name_info.split(' ')[0]
|
|
101 if 'GN=' in name_info:
|
|
102 lst = name_info.split('GN=')
|
|
103 lst2 = lst[1].split(' ')
|
|
104 genename = lst2[0]
|
|
105 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
106 if 'GN=' not in name_info:
|
|
107 genename = 'NA'
|
|
108 return ReturnValue1(uniprot_acc, genename, swissprot)
|
|
109 if sp = ">sp":
|
|
110 error.close()
|
|
111 else:
|
|
112 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
|
0
|
113 error.close
|
17
|
114 uniprot_acc = 'NA'
|
0
|
115 genename = 'NA'
|
|
116 return ReturnValue1(uniprot_acc, genename, swissprot)
|
17
|
117
|
|
118
|
0
|
119 def readtab(infile): # read in tab-delim text
|
|
120 with open(infile,'r') as x:
|
|
121 output = []
|
|
122 for line in x:
|
|
123 line = line.strip()
|
|
124 temp = line.split('\t')
|
|
125 output.append(temp)
|
|
126 return output
|
|
127 def crapome(infile): # Query CRAPome
|
|
128 data = readtab(infile)
|
|
129 crapome = readtab(database)
|
|
130 filt = []
|
|
131 for i in data: # Filter CRAPome database on our data
|
|
132 flag = 0 # is protein in CRAPome?
|
|
133 ac_flag = 0 # is it _SPECIES or not
|
|
134 unique = 0 # only take first ID in CRAPome
|
|
135 if "_"+species in i[0]:
|
|
136 ac = i[0]
|
|
137 else:
|
|
138 ac = get_info(i[0]).sp # query swissprot if not _SPECIES
|
|
139 ac_flag +=1
|
|
140 for j in crapome:
|
|
141 if ac == j[2]:
|
|
142 if ac_flag == 0: # if _SPECIES
|
|
143 if unique == 0:
|
|
144 filt.append(j)
|
|
145 flag+=1
|
|
146 unique+=1
|
|
147 if ac_flag != 0: # if not _SPECIES
|
|
148 if unique == 0:
|
|
149 unique+=1
|
|
150 j[2] = i[0] # change to user input
|
|
151 filt.append(j)
|
|
152 flag +=1
|
|
153 if flag == 0: # if protein is not present in CRAPome database then add it
|
14
|
154 filt.append(["\t", "\t", i[0], "Invalid identifier / gene not available"])
|
0
|
155 total = 0 # Experiment counter
|
|
156 query = []
|
|
157 for i in filt: # Create CRAPome file as list
|
|
158 temp=[]
|
|
159 if len(i) > 5:
|
|
160 cnt=0
|
|
161 temp.append(i[2]) # append accession
|
|
162 temp.append(i[0]) # append gene name
|
|
163 ave = []
|
|
164 total = len(i[3:]) # calculate total experiments
|
17
|
165 for j in i[3:]:
|
0
|
166 if j != '0':
|
|
167 ave.append(int(j)) # calculate Ave.SC on only experiments with ID
|
|
168 cnt+=1
|
|
169 temp.append(str(cnt) + " / "+str(total)) # format ratio
|
|
170 if ave != []:
|
|
171 temp.append(str(round(numpy.mean(ave),1))) # calculate Ave.SC
|
|
172 temp.append(str(max(ave))) # calculate Max.SC
|
|
173 else:
|
|
174 temp.append(0) # add 0 if has not been ID'd in CRAPome
|
|
175 temp.append(0) # add 0 if has not been ID'd in CRAPome
|
|
176 else:
|
|
177 temp.append(i[2]) # append accession
|
|
178 temp.append(i[3])
|
17
|
179 temp.append("NA")
|
|
180 temp.append("NA")
|
|
181 temp.append("NA")
|
0
|
182 query.append(temp) # final query results
|
|
183
|
|
184 header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"]
|
|
185 with open("Crappy Data.txt","wt") as x: # write file
|
|
186 x.write("\t".join(header) + "\n")
|
|
187 for i in query:
|
|
188 x.write("\t".join(i) + "\n")
|
|
189 if __name__ == '__main__':
|
|
190 crapome(crappyData)
|
|
191
|
17
|
192 os.rename("Crappy Data.txt", sys.argv[4])
|
0
|
193 ## END ## |