Mercurial > repos > bornea > saint_preprocessing
comparison mzID_process2.py @ 72:5ec0b997fb13 draft
Uploaded
| author | bornea |
|---|---|
| date | Sat, 27 Aug 2016 23:26:10 -0400 |
| parents | f6d916d1d304 |
| children | 50391fdc229a |
comparison
equal
deleted
inserted
replaced
| 71:f6d916d1d304 | 72:5ec0b997fb13 |
|---|---|
| 4 @author = Brent Kuenzi | 4 @author = Brent Kuenzi |
| 5 @email = Brent.Kuenzi@moffitt.org | 5 @email = Brent.Kuenzi@moffitt.org |
| 6 """ | 6 """ |
| 7 ####################################################################################### | 7 ####################################################################################### |
| 8 ## Description: ## | 8 ## Description: ## |
| 9 # This program will create inter, prey, and bait files from mzIdentML files | 9 #This program will create inter, prey, and bait files from mzIdentML files |
| 10 ## Required input: ## | 10 ## Required input: ## |
| 11 # 1) mzIdentML file to be reformatted | 11 # 1) mzIdentML file to be reformatted |
| 12 # 2) minimum PSM for quantification | 12 # 2) minimum PSM for quantification |
| 13 | 13 |
| 14 | 14 |
| 16 import os | 16 import os |
| 17 | 17 |
| 18 ins_path = sys.argv[5] | 18 ins_path = sys.argv[5] |
| 19 | 19 |
| 20 class ReturnValue1(object): | 20 class ReturnValue1(object): |
| 21 def __init__(self, sequence, gene): | 21 def __init__(self, sequence, gene): |
| 22 self.seqlength = sequence | 22 self.seqlength = sequence |
| 23 self.genename = gene | 23 self.genename = gene |
| 24 class ReturnValue2(object): | 24 class ReturnValue2(object): |
| 25 def __init__(self, inter, accessions): | 25 def __init__(self, inter, accessions): |
| 26 self.inter = inter | 26 self.inter = inter |
| 27 self.accessions = accessions | 27 self.accessions = accessions |
| 28 def read_tab(infile): | 28 def read_tab(infile): |
| 29 with open(infile,'r') as x: | 29 with open(infile,'r') as x: |
| 30 output = [] | 30 output = [] |
| 31 for line in x: | 31 for line in x: |
| 32 line = line.strip() | 32 line = line.strip() |
| 33 temp = line.split('\t') | 33 temp = line.split('\t') |
| 34 output.append(temp) | 34 output.append(temp) |
| 35 return output | 35 return output |
| 36 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): | 36 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): |
| 37 """ | 37 """ |
| 38 Call in a loop to create terminal progress bar | 38 Call in a loop to create terminal progress bar |
| 39 @params: | 39 @params: |
| 40 iteration - Required : current iteration (Int) | 40 iteration - Required : current iteration (Int) |
| 41 total - Required : total iterations (Int) | 41 total - Required : total iterations (Int) |
| 42 prefix - Optional : prefix string (Str) | 42 prefix - Optional : prefix string (Str) |
| 43 suffix - Optional : suffix string (Str) | 43 suffix - Optional : suffix string (Str) |
| 44 decimals - Optional : positive number of decimals in percent complete (Int) | 44 decimals - Optional : positive number of decimals in percent complete (Int) |
| 45 barLength - Optional : character length of bar (Int) | 45 barLength - Optional : character length of bar (Int) |
| 46 """ | 46 """ |
| 47 formatStr = "{0:." + str(decimals) + "f}" | 47 formatStr = "{0:." + str(decimals) + "f}" |
| 48 percents = formatStr.format(100 * (iteration / float(total))) | 48 percents = formatStr.format(100 * (iteration / float(total))) |
| 49 filledLength = int(round(barLength * iteration / float(total))) | 49 filledLength = int(round(barLength * iteration / float(total))) |
| 50 bar = '=' * filledLength + '-' * (barLength - filledLength) | 50 bar = '=' * filledLength + '-' * (barLength - filledLength) |
| 51 sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), | 51 sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), |
| 52 sys.stdout.flush() | 52 sys.stdout.flush() |
| 53 if iteration == total: | 53 if iteration == total: |
| 54 sys.stdout.write('\n') | 54 sys.stdout.write('\n') |
| 55 sys.stdout.flush() | 55 sys.stdout.flush() |
| 56 def get_info(uniprot_accession_in,fasta_db): | 56 def get_info(uniprot_accession_in,fasta_db): |
| 57 # Get aminoacid lengths and gene name. | 57 # Get aminoacid lengths and gene name. |
| 58 error = open('error proteins.txt', 'a+') | 58 error = open('error proteins.txt', 'a+') |
| 59 data = open(fasta_db, 'r') | 59 data = open(fasta_db, 'r') |
| 60 data_lines = data.readlines() | 60 data_lines = data.readlines() |
| 61 db_len = len(data_lines) | 61 db_len = len(data_lines) |
| 62 seqlength = 0 | 62 seqlength = 0 |
| 63 count = 0 | 63 count = 0 |
| 64 last_line = data_lines[-1] | 64 last_line = data_lines[-1] |
| 65 for data_line in data_lines: | 65 for data_line in data_lines: |
| 66 if ">sp" in data_line: | 66 if ">sp" in data_line: |
| 67 namer = data_line.split("|")[2] | 67 namer = data_line.split("|")[2] |
| 68 if uniprot_accession_in == data_line.split("|")[1]: | 68 if uniprot_accession_in == data_line.split("|")[1]: |
| 69 match = count + 1 | 69 match = count + 1 |
| 70 if 'GN=' in data_line: | 70 if 'GN=' in data_line: |
| 71 lst = data_line.split('GN=') | 71 lst = data_line.split('GN=') |
| 72 lst2 = lst[1].split(' ') | 72 lst2 = lst[1].split(' ') |
| 73 genename = lst2[0] | 73 genename = lst2[0] |
| 74 if 'GN=' not in data_line: | 74 if 'GN=' not in data_line: |
| 75 genename = 'NA' | 75 genename = 'NA' |
| 76 while ">sp" not in data_lines[match]: | 76 while ">sp" not in data_lines[match]: |
| 77 if match <= db_len: | 77 if match <= db_len: |
| 78 seqlength = seqlength + len(data_lines[match].strip()) | 78 seqlength = seqlength + len(data_lines[match].strip()) |
| 79 if data_lines[match] == last_line: | 79 if data_lines[match] == last_line: |
| 80 break | 80 break |
| 81 match = match + 1 | 81 match = match + 1 |
| 82 else: | 82 else: |
| 83 break | 83 break |
| 84 return ReturnValue1(seqlength, genename) | 84 return ReturnValue1(seqlength, genename) |
| 85 if uniprot_accession_in == namer.split(" ")[0]: | 85 if uniprot_accession_in == namer.split(" ")[0]: |
| 86 match = count + 1 | 86 match = count + 1 |
| 87 # Ensures consistent spacing throughout. | 87 # Ensures consistent spacing throughout. |
| 88 if 'GN=' in data_line: | 88 if 'GN=' in data_line: |
| 89 lst = data_line.split('GN=') | 89 lst = data_line.split('GN=') |
| 90 lst2 = lst[1].split(' ') | 90 lst2 = lst[1].split(' ') |
| 91 genename = lst2[0] | 91 genename = lst2[0] |
| 92 if 'GN=' not in data_line: | 92 if 'GN=' not in data_line: |
| 93 genename = 'NA' | 93 genename = 'NA' |
| 94 while ">sp" not in data_lines[match]: | 94 while ">sp" not in data_lines[match]: |
| 95 if match <= db_len: | 95 if match <= db_len: |
| 96 seqlength = seqlength + len(data_lines[match].strip()) | 96 seqlength = seqlength + len(data_lines[match].strip()) |
| 97 if data_lines[match] == last_line: | 97 if data_lines[match] == last_line: |
| 98 break | 98 break |
| 99 match = match + 1 | 99 match = match + 1 |
| 100 else: | 100 else: |
| 101 break | 101 break |
| 102 return ReturnValue1(seqlength, genename) | 102 return ReturnValue1(seqlength, genename) |
| 103 count = count + 1 | 103 count = count + 1 |
| 104 if seqlength == 0: | 104 if seqlength == 0: |
| 105 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | 105 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') |
| 106 error.close | 106 error.close |
| 107 seqlength = 'NA' | 107 seqlength = 'NA' |
| 108 genename = 'NA' | 108 genename = 'NA' |
| 109 return ReturnValue1(seqlength, genename) | 109 return ReturnValue1(seqlength, genename) |
| 110 def make_inter(mzIdentML,replicate,grouping): | 110 def make_inter(mzIdentML,replicate,grouping): |
| 111 accession_index = mzIdentML[0].index("accession") | 111 accession_index = mzIdentML[0].index("accession") |
| 112 PSMs = {} | 112 PSMs = {} |
| 113 accessions = [] | 113 accessions = [] |
| 114 cnt = 0 | 114 cnt = 0 |
| 133 file_list = files.split(",") | 133 file_list = files.split(",") |
| 134 | 134 |
| 135 make_prey = sys.argv[3] | 135 make_prey = sys.argv[3] |
| 136 db = sys.argv[4] | 136 db = sys.argv[4] |
| 137 if db == "None": | 137 if db == "None": |
| 138 db = str(ins_path) + "/SwissProt_HUMAN_2015_12.fasta" | 138 db = str(ins_path) + "/SwissProt_HUMAN_2015_12.fasta" |
| 139 make_bait = sys.argv[6] | 139 make_bait = sys.argv[6] |
| 140 bait_bool = sys.argv[7] | 140 bait_bool = sys.argv[7] |
| 141 prey_file = sys.argv[8] | 141 prey_file = sys.argv[8] |
| 142 bait_out = sys.argv[9] | 142 bait_out = sys.argv[9] |
| 143 inter_out = sys.argv[10] | 143 inter_out = sys.argv[10] |
| 144 | 144 |
| 145 def bait_create(baits, infile): | 145 def bait_create(baits, infile): |
| 146 # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. | 146 # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. |
| 147 baits = make_bait.split() | 147 baits = make_bait.split() |
| 148 i = 0 | 148 i = 0 |
| 149 bait_file_tmp = open("bait.txt", "w") | 149 bait_file_tmp = open("bait.txt", "w") |
| 150 order = [] | 150 order = [] |
| 151 bait_cache = [] | 151 bait_cache = [] |
| 152 while i < len(baits): | 152 while i < len(baits): |
| 153 if baits[i+2] == "true": | 153 if baits[i+2] == "true": |
| 154 T_C = "C" | 154 T_C = "C" |
| 155 else: | 155 else: |
| 156 T_C = "T" | 156 T_C = "T" |
| 157 bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | 157 bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" |
| 158 bait_cache.append(str(bait_line)) | 158 bait_cache.append(str(bait_line)) |
| 159 i = i + 3 | 159 i = i + 3 |
| 160 | 160 |
| 161 for cache_line in bait_cache: | 161 for cache_line in bait_cache: |
| 162 bait_file_tmp.write(cache_line) | 162 bait_file_tmp.write(cache_line) |
| 163 | 163 |
| 164 bait_file_tmp.close() | 164 bait_file_tmp.close() |
| 165 | 165 |
| 166 if bait_bool == 'false': | 166 if bait_bool == 'false': |
| 167 bait_create(make_bait, infile) | 167 bait_create(make_bait, infile) |
| 168 bait = "bait.txt" | 168 bait = "bait.txt" |
| 169 else: | 169 else: |
| 170 bait_temp_file = open(sys.argv[2], 'r') | 170 bait_temp_file = open(sys.argv[2], 'r') |
| 171 bait_cache = bait_temp_file.readlines() | 171 bait_cache = bait_temp_file.readlines() |
| 172 bait_file_tmp = open("bait.txt", "wr") | 172 bait_file_tmp = open("bait.txt", "wr") |
| 173 for cache_line in bait_cache: | 173 for cache_line in bait_cache: |
| 174 bait_file_tmp.write(cache_line) | 174 bait_file_tmp.write(cache_line) |
| 175 bait_file_tmp.close() | 175 bait_file_tmp.close() |
| 176 bait = "bait.txt" | 176 bait = "bait.txt" |
| 177 bait = read_tab("bait.txt") | 177 bait = read_tab("bait.txt") |
| 178 | 178 |
| 179 inter = "" | 179 inter = "" |
| 180 cnt = 0 | 180 cnt = 0 |
| 181 accessions = [] | 181 accessions = [] |
| 182 for i in file_list: | 182 for i in file_list: |
| 183 cmd = (r"Rscript "+ str(ins_path) +"flatten_mzIdentML.R " + i) | 183 cmd = (r"Rscript "+ str(ins_path) +"flatten_mzIdentML.R " + i) |
| 184 os.system(cmd) | 184 os.system(cmd) |
| 185 mzIdentML = read_tab("flat_mzIdentML.txt") | 185 mzIdentML = read_tab("flat_mzIdentML.txt") |
| 186 inter = inter + make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).inter | 186 inter = inter + make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).inter |
| 187 print inter | 187 print inter |
| 188 accessions.append(make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).accessions) | 188 accessions.append(make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).accessions) |
| 189 print accessions | 189 print accessions |
| 190 cnt+=1 | 190 cnt+=1 |
| 191 | 191 |
| 192 with open("inter.txt","w") as x: | 192 with open("inter.txt","w") as x: |
| 193 x.write(inter) | 193 x.write(inter) |
| 194 if make_prey == "Y": | 194 if make_prey == "Y": |
| 203 printProgress(start,end,prefix = "Making Prey File:",suffix = "Complete",barLength=50) | 203 printProgress(start,end,prefix = "Making Prey File:",suffix = "Complete",barLength=50) |
| 204 | 204 |
| 205 for i in unique_accessions: | 205 for i in unique_accessions: |
| 206 prey = prey + i + "\t" + str(get_info(i,db).seqlength) + "\t" + get_info(i,db).genename + "\n" | 206 prey = prey + i + "\t" + str(get_info(i,db).seqlength) + "\t" + get_info(i,db).genename + "\n" |
| 207 start+=1 | 207 start+=1 |
| 208 printProgress(start, end) | 208 printProgress(start, end) |
| 209 with open("prey.txt","w") as x: | 209 with open("prey.txt","w") as x: |
| 210 x.write(prey) | 210 x.write(prey) |
| 211 | 211 |
| 212 os.rename("bait.txt", bait_out) | 212 os.rename("bait.txt", bait_out) |
| 213 os.rename("inter.txt", inter_out) | 213 os.rename("inter.txt", inter_out) |
| 214 if str(prey_file) != "None": | 214 if str(prey_file) != "None": |
| 215 os.rename("prey.txt", prey_file) | 215 os.rename("prey.txt", prey_file) |
