Mercurial > repos > bornea > query_crapome
comparison CRAPomeQuery.py @ 17:378fc3676c78 draft
Uploaded
author | bornea |
---|---|
date | Fri, 29 Apr 2016 15:42:22 -0400 |
parents | a5444c834e72 |
children | f1459638e1bc |
comparison
equal
deleted
inserted
replaced
16:a5444c834e72 | 17:378fc3676c78 |
---|---|
2 """ | 2 """ |
3 Created on Thu Apr 14 16:58:05 2016 | 3 Created on Thu Apr 14 16:58:05 2016 |
4 | 4 |
5 @author: brentkuenzi | 5 @author: brentkuenzi |
6 """ | 6 """ |
7 ################################################################################ | |
8 ## Dependencies ## | |
9 import urllib2 | |
10 import sys | |
11 import numpy | |
12 import os | |
7 ################################################################################ | 13 ################################################################################ |
8 # This program will read in a SAINT formatted 'prey.txt' file or a file | 14 # This program will read in a SAINT formatted 'prey.txt' file or a file |
9 # containing a single column list of uniprot accessions (e.g. "P00533" or | 15 # containing a single column list of uniprot accessions (e.g. "P00533" or |
10 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying | 16 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying |
11 # the prevalence of each protein in the CRAPome. | 17 # the prevalence of each protein in the CRAPome. |
16 # or any later version published by the Free Software Foundation; | 22 # or any later version published by the Free Software Foundation; |
17 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | 23 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. |
18 # A copy of the license is included in the section entitled "GNU | 24 # A copy of the license is included in the section entitled "GNU |
19 # Free Documentation License". | 25 # Free Documentation License". |
20 ################################################################################ | 26 ################################################################################ |
21 ## Dependencies ## | |
22 import urllib2 | |
23 import sys | |
24 import numpy | |
25 import os | |
26 ################################################################################ | |
27 ## REQUIRED INPUT ## | 27 ## REQUIRED INPUT ## |
28 # 1) crappyData: Prey.txt or single column list of Uniprot accessions | 28 # 1) crappyData: Prey.txt or single column list of Uniprot accessions |
29 crappyData = sys.argv[1] # Prey file or File with single column of accessions | 29 crappyData = sys.argv[1] # Prey file or File with single column of accessions |
30 # 2) Species: HUMAN or YEAST | 30 # 2) Species: HUMAN or YEAST |
31 species = sys.argv[2] # HUMAN or YEAST | 31 species = sys.argv[2] # HUMAN or YEAST |
32 db_path = sys.argv[4] | 32 fasta_db = sys.argv[3] |
33 db_path = sys.argv[5] | |
33 ################################################################################ | 34 ################################################################################ |
34 ## Global Variables ## | 35 ## Global Variables ## |
35 if species == "HUMAN": | 36 if species == "HUMAN": |
36 database = "Human_CRAPome_v1-1.txt" | 37 database = str(db_path) + "Human_CRAPome_v1-1.txt" |
37 if species == "YEAST": | 38 if species == "YEAST": |
38 database = "Yeast_CRAPome_v1-1.txt" | 39 database = str(db_path) + "Yeast_CRAPome_v1-1.txt" |
39 ################################################################################ | 40 ################################################################################ |
40 ## CRAPomeQuery ## | 41 ## CRAPomeQuery ## |
41 class ReturnValue1(object): | 42 class ReturnValue1(object): |
42 def __init__(self, uniprot_acc, gene, swissprot): | 43 def __init__(self, uniprot_acc, gene, swissprot): |
43 self.up = uniprot_acc | 44 self.up = uniprot_acc |
44 self.gn = gene | 45 self.gn = gene |
45 self.sp = swissprot | 46 self.sp = swissprot |
46 def get_info(uniprot_accession_in): #get aa lengths and gene name | 47 def get_info(uniprot_accession_in): #get aa lengths and gene name |
47 error = open('error proteins.txt', 'a+') | 48 error = open('error proteins.txt', 'a+') |
48 i=0 | 49 i=0 |
49 while i==0: | 50 if fasta_db == "None": |
50 try: | 51 while i==0: |
51 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") | 52 try: |
52 break | 53 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") |
53 except urllib2.HTTPError, err: | 54 break |
54 i = i + 1 | 55 except urllib2.HTTPError, err: |
55 if i == 50: | 56 i = i + 1 |
56 sys.exit("More than 50 errors. Check your file or try again later.") | 57 if i == 50: |
57 if err.code == 404: | 58 sys.exit("More than 50 errors. Check your file or try again later.") |
58 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') | 59 if err.code == 404: |
59 seqlength = 'NA' | 60 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') |
61 seqlength = 'NA' | |
62 genename = 'NA' | |
63 return ReturnValue1(seqlength, genename) | |
64 elif err.code == 302: | |
65 sys.exit("Request timed out. Check connection and try again.") | |
66 else: | |
67 sys.exit("Uniprot had some other error") | |
68 lines = data.readlines() | |
69 header = lines[0] | |
70 lst = header.split('|') | |
71 lst2 = lst[2].split(' ') | |
72 swissprot = lst2[0] | |
73 uniprot_acc = lst[1] | |
74 if lines == []: | |
75 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | |
76 error.close | |
77 uniprot_acc = 'NA' | |
78 genename = 'NA' | |
79 return ReturnValue1(uniprot_acc, genename, swissprot) | |
80 if lines != []: | |
81 seqlength = 0 | |
82 header = lines[0] | |
83 if 'GN=' in header: | |
84 lst = header.split('GN=') | |
85 lst2 = lst[1].split(' ') | |
86 genename = lst2[0] | |
87 error.close | |
88 return ReturnValue1(uniprot_acc, genename, swissprot) | |
89 if 'GN=' not in header: | |
60 genename = 'NA' | 90 genename = 'NA' |
61 return ReturnValue1(seqlength, genename) | 91 error.close |
62 elif err.code == 302: | 92 return ReturnValue1(uniprot_acc, genename, swissprot) |
63 sys.exit("Request timed out. Check connection and try again.") | 93 else: |
64 else: | 94 data = open(fasta_db, 'r') |
65 sys.exit("Uniprot had some other error") | 95 data_lines = data.readlines() |
66 lines = data.readlines() | 96 db_len = len(data_lines) |
67 header = lines[0] | 97 for fasta_line in data_lines: |
68 lst = header.split('|') | 98 if uniprot_accession_in in fasta_line: |
69 lst2 = lst[2].split(' ') | 99 sp, uniprot_acc, name_info = fasta_line.split("|") |
70 swissprot = lst2[0] | 100 swissprot = name_info.split(' ')[0] |
71 uniprot_acc = lst[1] | 101 if 'GN=' in name_info: |
72 if lines == []: | 102 lst = name_info.split('GN=') |
73 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | 103 lst2 = lst[1].split(' ') |
74 error.close | 104 genename = lst2[0] |
75 uniprot_acc = 'NA' | 105 return ReturnValue1(uniprot_acc, genename, swissprot) |
76 genename = 'NA' | 106 if 'GN=' not in name_info: |
77 return ReturnValue1(uniprot_acc, genename, swissprot) | 107 genename = 'NA' |
78 if lines != []: | 108 return ReturnValue1(uniprot_acc, genename, swissprot) |
79 seqlength = 0 | 109 if sp = ">sp": |
80 header = lines[0] | 110 error.close() |
81 if 'GN=' in header: | 111 else: |
82 lst = header.split('GN=') | 112 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') |
83 lst2 = lst[1].split(' ') | |
84 genename = lst2[0] | |
85 error.close | 113 error.close |
114 uniprot_acc = 'NA' | |
115 genename = 'NA' | |
86 return ReturnValue1(uniprot_acc, genename, swissprot) | 116 return ReturnValue1(uniprot_acc, genename, swissprot) |
87 if 'GN=' not in header: | 117 |
88 genename = 'NA' | 118 |
89 error.close | |
90 return ReturnValue1(uniprot_acc, genename, swissprot) | |
91 def readtab(infile): # read in tab-delim text | 119 def readtab(infile): # read in tab-delim text |
92 with open(infile,'r') as x: | 120 with open(infile,'r') as x: |
93 output = [] | 121 output = [] |
94 for line in x: | 122 for line in x: |
95 line = line.strip() | 123 line = line.strip() |
132 cnt=0 | 160 cnt=0 |
133 temp.append(i[2]) # append accession | 161 temp.append(i[2]) # append accession |
134 temp.append(i[0]) # append gene name | 162 temp.append(i[0]) # append gene name |
135 ave = [] | 163 ave = [] |
136 total = len(i[3:]) # calculate total experiments | 164 total = len(i[3:]) # calculate total experiments |
137 for j in i[3:]: | 165 for j in i[3:]: |
138 if j != '0': | 166 if j != '0': |
139 ave.append(int(j)) # calculate Ave.SC on only experiments with ID | 167 ave.append(int(j)) # calculate Ave.SC on only experiments with ID |
140 cnt+=1 | 168 cnt+=1 |
141 temp.append(str(cnt) + " / "+str(total)) # format ratio | 169 temp.append(str(cnt) + " / "+str(total)) # format ratio |
142 if ave != []: | 170 if ave != []: |
146 temp.append(0) # add 0 if has not been ID'd in CRAPome | 174 temp.append(0) # add 0 if has not been ID'd in CRAPome |
147 temp.append(0) # add 0 if has not been ID'd in CRAPome | 175 temp.append(0) # add 0 if has not been ID'd in CRAPome |
148 else: | 176 else: |
149 temp.append(i[2]) # append accession | 177 temp.append(i[2]) # append accession |
150 temp.append(i[3]) | 178 temp.append(i[3]) |
179 temp.append("NA") | |
180 temp.append("NA") | |
181 temp.append("NA") | |
151 query.append(temp) # final query results | 182 query.append(temp) # final query results |
152 | 183 |
153 header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"] | 184 header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"] |
154 with open("Crappy Data.txt","wt") as x: # write file | 185 with open("Crappy Data.txt","wt") as x: # write file |
155 x.write("\t".join(header) + "\n") | 186 x.write("\t".join(header) + "\n") |
156 for i in query: | 187 for i in query: |
157 x.write("\t".join(i) + "\n") | 188 x.write("\t".join(i) + "\n") |
158 if __name__ == '__main__': | 189 if __name__ == '__main__': |
159 crapome(crappyData) | 190 crapome(crappyData) |
160 | 191 |
161 os.rename("Crappy Data.txt", sys.argv[3]) | 192 os.rename("Crappy Data.txt", sys.argv[4]) |
162 ## END ## | 193 ## END ## |