comparison CRAPomeQuery.py @ 17:378fc3676c78 draft

Uploaded
author bornea
date Fri, 29 Apr 2016 15:42:22 -0400
parents a5444c834e72
children f1459638e1bc
comparison
equal deleted inserted replaced
16:a5444c834e72 17:378fc3676c78
2 """ 2 """
3 Created on Thu Apr 14 16:58:05 2016 3 Created on Thu Apr 14 16:58:05 2016
4 4
5 @author: brentkuenzi 5 @author: brentkuenzi
6 """ 6 """
7 ################################################################################
8 ## Dependencies ##
9 import urllib2
10 import sys
11 import numpy
12 import os
7 ################################################################################ 13 ################################################################################
8 # This program will read in a SAINT formatted 'prey.txt' file or a file 14 # This program will read in a SAINT formatted 'prey.txt' file or a file
9 # containing a single column list of uniprot accessions (e.g. "P00533" or 15 # containing a single column list of uniprot accessions (e.g. "P00533" or
10 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying 16 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying
11 # the prevalence of each protein in the CRAPome. 17 # the prevalence of each protein in the CRAPome.
16 # or any later version published by the Free Software Foundation; 22 # or any later version published by the Free Software Foundation;
17 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. 23 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
18 # A copy of the license is included in the section entitled "GNU 24 # A copy of the license is included in the section entitled "GNU
19 # Free Documentation License". 25 # Free Documentation License".
20 ################################################################################ 26 ################################################################################
21 ## Dependencies ##
22 import urllib2
23 import sys
24 import numpy
25 import os
26 ################################################################################
27 ## REQUIRED INPUT ## 27 ## REQUIRED INPUT ##
28 # 1) crappyData: Prey.txt or single column list of Uniprot accessions 28 # 1) crappyData: Prey.txt or single column list of Uniprot accessions
29 crappyData = sys.argv[1] # Prey file or File with single column of accessions 29 crappyData = sys.argv[1] # Prey file or File with single column of accessions
30 # 2) Species: HUMAN or YEAST 30 # 2) Species: HUMAN or YEAST
31 species = sys.argv[2] # HUMAN or YEAST 31 species = sys.argv[2] # HUMAN or YEAST
32 db_path = sys.argv[4] 32 fasta_db = sys.argv[3]
33 db_path = sys.argv[5]
33 ################################################################################ 34 ################################################################################
34 ## Global Variables ## 35 ## Global Variables ##
35 if species == "HUMAN": 36 if species == "HUMAN":
36 database = "Human_CRAPome_v1-1.txt" 37 database = str(db_path) + "Human_CRAPome_v1-1.txt"
37 if species == "YEAST": 38 if species == "YEAST":
38 database = "Yeast_CRAPome_v1-1.txt" 39 database = str(db_path) + "Yeast_CRAPome_v1-1.txt"
39 ################################################################################ 40 ################################################################################
40 ## CRAPomeQuery ## 41 ## CRAPomeQuery ##
41 class ReturnValue1(object): 42 class ReturnValue1(object):
42 def __init__(self, uniprot_acc, gene, swissprot): 43 def __init__(self, uniprot_acc, gene, swissprot):
43 self.up = uniprot_acc 44 self.up = uniprot_acc
44 self.gn = gene 45 self.gn = gene
45 self.sp = swissprot 46 self.sp = swissprot
46 def get_info(uniprot_accession_in): #get aa lengths and gene name 47 def get_info(uniprot_accession_in): #get aa lengths and gene name
47 error = open('error proteins.txt', 'a+') 48 error = open('error proteins.txt', 'a+')
48 i=0 49 i=0
49 while i==0: 50 if fasta_db == "None":
50 try: 51 while i==0:
51 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") 52 try:
52 break 53 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
53 except urllib2.HTTPError, err: 54 break
54 i = i + 1 55 except urllib2.HTTPError, err:
55 if i == 50: 56 i = i + 1
56 sys.exit("More than 50 errors. Check your file or try again later.") 57 if i == 50:
57 if err.code == 404: 58 sys.exit("More than 50 errors. Check your file or try again later.")
58 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') 59 if err.code == 404:
59 seqlength = 'NA' 60 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
61 seqlength = 'NA'
62 genename = 'NA'
63 return ReturnValue1(seqlength, genename)
64 elif err.code == 302:
65 sys.exit("Request timed out. Check connection and try again.")
66 else:
67 sys.exit("Uniprot had some other error")
68 lines = data.readlines()
69 header = lines[0]
70 lst = header.split('|')
71 lst2 = lst[2].split(' ')
72 swissprot = lst2[0]
73 uniprot_acc = lst[1]
74 if lines == []:
75 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
76 error.close
77 uniprot_acc = 'NA'
78 genename = 'NA'
79 return ReturnValue1(uniprot_acc, genename, swissprot)
80 if lines != []:
81 seqlength = 0
82 header = lines[0]
83 if 'GN=' in header:
84 lst = header.split('GN=')
85 lst2 = lst[1].split(' ')
86 genename = lst2[0]
87 error.close
88 return ReturnValue1(uniprot_acc, genename, swissprot)
89 if 'GN=' not in header:
60 genename = 'NA' 90 genename = 'NA'
61 return ReturnValue1(seqlength, genename) 91 error.close
62 elif err.code == 302: 92 return ReturnValue1(uniprot_acc, genename, swissprot)
63 sys.exit("Request timed out. Check connection and try again.") 93 else:
64 else: 94 data = open(fasta_db, 'r')
65 sys.exit("Uniprot had some other error") 95 data_lines = data.readlines()
66 lines = data.readlines() 96 db_len = len(data_lines)
67 header = lines[0] 97 for fasta_line in data_lines:
68 lst = header.split('|') 98 if uniprot_accession_in in fasta_line:
69 lst2 = lst[2].split(' ') 99 sp, uniprot_acc, name_info = fasta_line.split("|")
70 swissprot = lst2[0] 100 swissprot = name_info.split(' ')[0]
71 uniprot_acc = lst[1] 101 if 'GN=' in name_info:
72 if lines == []: 102 lst = name_info.split('GN=')
73 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') 103 lst2 = lst[1].split(' ')
74 error.close 104 genename = lst2[0]
75 uniprot_acc = 'NA' 105 return ReturnValue1(uniprot_acc, genename, swissprot)
76 genename = 'NA' 106 if 'GN=' not in name_info:
77 return ReturnValue1(uniprot_acc, genename, swissprot) 107 genename = 'NA'
78 if lines != []: 108 return ReturnValue1(uniprot_acc, genename, swissprot)
79 seqlength = 0 109 if sp = ">sp":
80 header = lines[0] 110 error.close()
81 if 'GN=' in header: 111 else:
82 lst = header.split('GN=') 112 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
83 lst2 = lst[1].split(' ')
84 genename = lst2[0]
85 error.close 113 error.close
114 uniprot_acc = 'NA'
115 genename = 'NA'
86 return ReturnValue1(uniprot_acc, genename, swissprot) 116 return ReturnValue1(uniprot_acc, genename, swissprot)
87 if 'GN=' not in header: 117
88 genename = 'NA' 118
89 error.close
90 return ReturnValue1(uniprot_acc, genename, swissprot)
91 def readtab(infile): # read in tab-delim text 119 def readtab(infile): # read in tab-delim text
92 with open(infile,'r') as x: 120 with open(infile,'r') as x:
93 output = [] 121 output = []
94 for line in x: 122 for line in x:
95 line = line.strip() 123 line = line.strip()
132 cnt=0 160 cnt=0
133 temp.append(i[2]) # append accession 161 temp.append(i[2]) # append accession
134 temp.append(i[0]) # append gene name 162 temp.append(i[0]) # append gene name
135 ave = [] 163 ave = []
136 total = len(i[3:]) # calculate total experiments 164 total = len(i[3:]) # calculate total experiments
137 for j in i[3:]: 165 for j in i[3:]:
138 if j != '0': 166 if j != '0':
139 ave.append(int(j)) # calculate Ave.SC on only experiments with ID 167 ave.append(int(j)) # calculate Ave.SC on only experiments with ID
140 cnt+=1 168 cnt+=1
141 temp.append(str(cnt) + " / "+str(total)) # format ratio 169 temp.append(str(cnt) + " / "+str(total)) # format ratio
142 if ave != []: 170 if ave != []:
146 temp.append(0) # add 0 if has not been ID'd in CRAPome 174 temp.append(0) # add 0 if has not been ID'd in CRAPome
147 temp.append(0) # add 0 if has not been ID'd in CRAPome 175 temp.append(0) # add 0 if has not been ID'd in CRAPome
148 else: 176 else:
149 temp.append(i[2]) # append accession 177 temp.append(i[2]) # append accession
150 temp.append(i[3]) 178 temp.append(i[3])
179 temp.append("NA")
180 temp.append("NA")
181 temp.append("NA")
151 query.append(temp) # final query results 182 query.append(temp) # final query results
152 183
153 header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"] 184 header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"]
154 with open("Crappy Data.txt","wt") as x: # write file 185 with open("Crappy Data.txt","wt") as x: # write file
155 x.write("\t".join(header) + "\n") 186 x.write("\t".join(header) + "\n")
156 for i in query: 187 for i in query:
157 x.write("\t".join(i) + "\n") 188 x.write("\t".join(i) + "\n")
158 if __name__ == '__main__': 189 if __name__ == '__main__':
159 crapome(crappyData) 190 crapome(crappyData)
160 191
161 os.rename("Crappy Data.txt", sys.argv[3]) 192 os.rename("Crappy Data.txt", sys.argv[4])
162 ## END ## 193 ## END ##