Mercurial > repos > bornea > saint_preprocessing
comparison mzID_process2.py @ 72:5ec0b997fb13 draft
Uploaded
author | bornea |
---|---|
date | Sat, 27 Aug 2016 23:26:10 -0400 |
parents | f6d916d1d304 |
children | 50391fdc229a |
comparison
equal
deleted
inserted
replaced
71:f6d916d1d304 | 72:5ec0b997fb13 |
---|---|
4 @author = Brent Kuenzi | 4 @author = Brent Kuenzi |
5 @email = Brent.Kuenzi@moffitt.org | 5 @email = Brent.Kuenzi@moffitt.org |
6 """ | 6 """ |
7 ####################################################################################### | 7 ####################################################################################### |
8 ## Description: ## | 8 ## Description: ## |
9 # This program will create inter, prey, and bait files from mzIdentML files | 9 #This program will create inter, prey, and bait files from mzIdentML files |
10 ## Required input: ## | 10 ## Required input: ## |
11 # 1) mzIdentML file to be reformatted | 11 # 1) mzIdentML file to be reformatted |
12 # 2) minimum PSM for quantification | 12 # 2) minimum PSM for quantification |
13 | 13 |
14 | 14 |
16 import os | 16 import os |
17 | 17 |
18 ins_path = sys.argv[5] | 18 ins_path = sys.argv[5] |
19 | 19 |
20 class ReturnValue1(object): | 20 class ReturnValue1(object): |
21 def __init__(self, sequence, gene): | 21 def __init__(self, sequence, gene): |
22 self.seqlength = sequence | 22 self.seqlength = sequence |
23 self.genename = gene | 23 self.genename = gene |
24 class ReturnValue2(object): | 24 class ReturnValue2(object): |
25 def __init__(self, inter, accessions): | 25 def __init__(self, inter, accessions): |
26 self.inter = inter | 26 self.inter = inter |
27 self.accessions = accessions | 27 self.accessions = accessions |
28 def read_tab(infile): | 28 def read_tab(infile): |
29 with open(infile,'r') as x: | 29 with open(infile,'r') as x: |
30 output = [] | 30 output = [] |
31 for line in x: | 31 for line in x: |
32 line = line.strip() | 32 line = line.strip() |
33 temp = line.split('\t') | 33 temp = line.split('\t') |
34 output.append(temp) | 34 output.append(temp) |
35 return output | 35 return output |
36 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): | 36 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): |
37 """ | 37 """ |
38 Call in a loop to create terminal progress bar | 38 Call in a loop to create terminal progress bar |
39 @params: | 39 @params: |
40 iteration - Required : current iteration (Int) | 40 iteration - Required : current iteration (Int) |
41 total - Required : total iterations (Int) | 41 total - Required : total iterations (Int) |
42 prefix - Optional : prefix string (Str) | 42 prefix - Optional : prefix string (Str) |
43 suffix - Optional : suffix string (Str) | 43 suffix - Optional : suffix string (Str) |
44 decimals - Optional : positive number of decimals in percent complete (Int) | 44 decimals - Optional : positive number of decimals in percent complete (Int) |
45 barLength - Optional : character length of bar (Int) | 45 barLength - Optional : character length of bar (Int) |
46 """ | 46 """ |
47 formatStr = "{0:." + str(decimals) + "f}" | 47 formatStr = "{0:." + str(decimals) + "f}" |
48 percents = formatStr.format(100 * (iteration / float(total))) | 48 percents = formatStr.format(100 * (iteration / float(total))) |
49 filledLength = int(round(barLength * iteration / float(total))) | 49 filledLength = int(round(barLength * iteration / float(total))) |
50 bar = '=' * filledLength + '-' * (barLength - filledLength) | 50 bar = '=' * filledLength + '-' * (barLength - filledLength) |
51 sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), | 51 sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), |
52 sys.stdout.flush() | 52 sys.stdout.flush() |
53 if iteration == total: | 53 if iteration == total: |
54 sys.stdout.write('\n') | 54 sys.stdout.write('\n') |
55 sys.stdout.flush() | 55 sys.stdout.flush() |
56 def get_info(uniprot_accession_in,fasta_db): | 56 def get_info(uniprot_accession_in,fasta_db): |
57 # Get aminoacid lengths and gene name. | 57 # Get aminoacid lengths and gene name. |
58 error = open('error proteins.txt', 'a+') | 58 error = open('error proteins.txt', 'a+') |
59 data = open(fasta_db, 'r') | 59 data = open(fasta_db, 'r') |
60 data_lines = data.readlines() | 60 data_lines = data.readlines() |
61 db_len = len(data_lines) | 61 db_len = len(data_lines) |
62 seqlength = 0 | 62 seqlength = 0 |
63 count = 0 | 63 count = 0 |
64 last_line = data_lines[-1] | 64 last_line = data_lines[-1] |
65 for data_line in data_lines: | 65 for data_line in data_lines: |
66 if ">sp" in data_line: | 66 if ">sp" in data_line: |
67 namer = data_line.split("|")[2] | 67 namer = data_line.split("|")[2] |
68 if uniprot_accession_in == data_line.split("|")[1]: | 68 if uniprot_accession_in == data_line.split("|")[1]: |
69 match = count + 1 | 69 match = count + 1 |
70 if 'GN=' in data_line: | 70 if 'GN=' in data_line: |
71 lst = data_line.split('GN=') | 71 lst = data_line.split('GN=') |
72 lst2 = lst[1].split(' ') | 72 lst2 = lst[1].split(' ') |
73 genename = lst2[0] | 73 genename = lst2[0] |
74 if 'GN=' not in data_line: | 74 if 'GN=' not in data_line: |
75 genename = 'NA' | 75 genename = 'NA' |
76 while ">sp" not in data_lines[match]: | 76 while ">sp" not in data_lines[match]: |
77 if match <= db_len: | 77 if match <= db_len: |
78 seqlength = seqlength + len(data_lines[match].strip()) | 78 seqlength = seqlength + len(data_lines[match].strip()) |
79 if data_lines[match] == last_line: | 79 if data_lines[match] == last_line: |
80 break | 80 break |
81 match = match + 1 | 81 match = match + 1 |
82 else: | 82 else: |
83 break | 83 break |
84 return ReturnValue1(seqlength, genename) | 84 return ReturnValue1(seqlength, genename) |
85 if uniprot_accession_in == namer.split(" ")[0]: | 85 if uniprot_accession_in == namer.split(" ")[0]: |
86 match = count + 1 | 86 match = count + 1 |
87 # Ensures consistent spacing throughout. | 87 # Ensures consistent spacing throughout. |
88 if 'GN=' in data_line: | 88 if 'GN=' in data_line: |
89 lst = data_line.split('GN=') | 89 lst = data_line.split('GN=') |
90 lst2 = lst[1].split(' ') | 90 lst2 = lst[1].split(' ') |
91 genename = lst2[0] | 91 genename = lst2[0] |
92 if 'GN=' not in data_line: | 92 if 'GN=' not in data_line: |
93 genename = 'NA' | 93 genename = 'NA' |
94 while ">sp" not in data_lines[match]: | 94 while ">sp" not in data_lines[match]: |
95 if match <= db_len: | 95 if match <= db_len: |
96 seqlength = seqlength + len(data_lines[match].strip()) | 96 seqlength = seqlength + len(data_lines[match].strip()) |
97 if data_lines[match] == last_line: | 97 if data_lines[match] == last_line: |
98 break | 98 break |
99 match = match + 1 | 99 match = match + 1 |
100 else: | 100 else: |
101 break | 101 break |
102 return ReturnValue1(seqlength, genename) | 102 return ReturnValue1(seqlength, genename) |
103 count = count + 1 | 103 count = count + 1 |
104 if seqlength == 0: | 104 if seqlength == 0: |
105 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | 105 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') |
106 error.close | 106 error.close |
107 seqlength = 'NA' | 107 seqlength = 'NA' |
108 genename = 'NA' | 108 genename = 'NA' |
109 return ReturnValue1(seqlength, genename) | 109 return ReturnValue1(seqlength, genename) |
110 def make_inter(mzIdentML,replicate,grouping): | 110 def make_inter(mzIdentML,replicate,grouping): |
111 accession_index = mzIdentML[0].index("accession") | 111 accession_index = mzIdentML[0].index("accession") |
112 PSMs = {} | 112 PSMs = {} |
113 accessions = [] | 113 accessions = [] |
114 cnt = 0 | 114 cnt = 0 |
133 file_list = files.split(",") | 133 file_list = files.split(",") |
134 | 134 |
135 make_prey = sys.argv[3] | 135 make_prey = sys.argv[3] |
136 db = sys.argv[4] | 136 db = sys.argv[4] |
137 if db == "None": | 137 if db == "None": |
138 db = str(ins_path) + "/SwissProt_HUMAN_2015_12.fasta" | 138 db = str(ins_path) + "/SwissProt_HUMAN_2015_12.fasta" |
139 make_bait = sys.argv[6] | 139 make_bait = sys.argv[6] |
140 bait_bool = sys.argv[7] | 140 bait_bool = sys.argv[7] |
141 prey_file = sys.argv[8] | 141 prey_file = sys.argv[8] |
142 bait_out = sys.argv[9] | 142 bait_out = sys.argv[9] |
143 inter_out = sys.argv[10] | 143 inter_out = sys.argv[10] |
144 | 144 |
145 def bait_create(baits, infile): | 145 def bait_create(baits, infile): |
146 # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. | 146 # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. |
147 baits = make_bait.split() | 147 baits = make_bait.split() |
148 i = 0 | 148 i = 0 |
149 bait_file_tmp = open("bait.txt", "w") | 149 bait_file_tmp = open("bait.txt", "w") |
150 order = [] | 150 order = [] |
151 bait_cache = [] | 151 bait_cache = [] |
152 while i < len(baits): | 152 while i < len(baits): |
153 if baits[i+2] == "true": | 153 if baits[i+2] == "true": |
154 T_C = "C" | 154 T_C = "C" |
155 else: | 155 else: |
156 T_C = "T" | 156 T_C = "T" |
157 bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | 157 bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" |
158 bait_cache.append(str(bait_line)) | 158 bait_cache.append(str(bait_line)) |
159 i = i + 3 | 159 i = i + 3 |
160 | 160 |
161 for cache_line in bait_cache: | 161 for cache_line in bait_cache: |
162 bait_file_tmp.write(cache_line) | 162 bait_file_tmp.write(cache_line) |
163 | 163 |
164 bait_file_tmp.close() | 164 bait_file_tmp.close() |
165 | 165 |
166 if bait_bool == 'false': | 166 if bait_bool == 'false': |
167 bait_create(make_bait, infile) | 167 bait_create(make_bait, infile) |
168 bait = "bait.txt" | 168 bait = "bait.txt" |
169 else: | 169 else: |
170 bait_temp_file = open(sys.argv[2], 'r') | 170 bait_temp_file = open(sys.argv[2], 'r') |
171 bait_cache = bait_temp_file.readlines() | 171 bait_cache = bait_temp_file.readlines() |
172 bait_file_tmp = open("bait.txt", "wr") | 172 bait_file_tmp = open("bait.txt", "wr") |
173 for cache_line in bait_cache: | 173 for cache_line in bait_cache: |
174 bait_file_tmp.write(cache_line) | 174 bait_file_tmp.write(cache_line) |
175 bait_file_tmp.close() | 175 bait_file_tmp.close() |
176 bait = "bait.txt" | 176 bait = "bait.txt" |
177 bait = read_tab("bait.txt") | 177 bait = read_tab("bait.txt") |
178 | 178 |
179 inter = "" | 179 inter = "" |
180 cnt = 0 | 180 cnt = 0 |
181 accessions = [] | 181 accessions = [] |
182 for i in file_list: | 182 for i in file_list: |
183 cmd = (r"Rscript "+ str(ins_path) +"flatten_mzIdentML.R " + i) | 183 cmd = (r"Rscript "+ str(ins_path) +"flatten_mzIdentML.R " + i) |
184 os.system(cmd) | 184 os.system(cmd) |
185 mzIdentML = read_tab("flat_mzIdentML.txt") | 185 mzIdentML = read_tab("flat_mzIdentML.txt") |
186 inter = inter + make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).inter | 186 inter = inter + make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).inter |
187 print inter | 187 print inter |
188 accessions.append(make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).accessions) | 188 accessions.append(make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).accessions) |
189 print accessions | 189 print accessions |
190 cnt+=1 | 190 cnt+=1 |
191 | 191 |
192 with open("inter.txt","w") as x: | 192 with open("inter.txt","w") as x: |
193 x.write(inter) | 193 x.write(inter) |
194 if make_prey == "Y": | 194 if make_prey == "Y": |
203 printProgress(start,end,prefix = "Making Prey File:",suffix = "Complete",barLength=50) | 203 printProgress(start,end,prefix = "Making Prey File:",suffix = "Complete",barLength=50) |
204 | 204 |
205 for i in unique_accessions: | 205 for i in unique_accessions: |
206 prey = prey + i + "\t" + str(get_info(i,db).seqlength) + "\t" + get_info(i,db).genename + "\n" | 206 prey = prey + i + "\t" + str(get_info(i,db).seqlength) + "\t" + get_info(i,db).genename + "\n" |
207 start+=1 | 207 start+=1 |
208 printProgress(start, end) | 208 printProgress(start, end) |
209 with open("prey.txt","w") as x: | 209 with open("prey.txt","w") as x: |
210 x.write(prey) | 210 x.write(prey) |
211 | 211 |
212 os.rename("bait.txt", bait_out) | 212 os.rename("bait.txt", bait_out) |
213 os.rename("inter.txt", inter_out) | 213 os.rename("inter.txt", inter_out) |
214 if str(prey_file) != "None": | 214 if str(prey_file) != "None": |
215 os.rename("prey.txt", prey_file) | 215 os.rename("prey.txt", prey_file) |