| 6 | 1 ####################################################################################### | 
|  | 2 # Python-code: SAINT pre-processing from MaxQuant "Samples Report" output | 
|  | 3 # Author: Brent Kuenzi | 
|  | 4 ####################################################################################### | 
|  | 5 # This program reads in a raw MaxQuant "Samples Report" output and a user generated | 
|  | 6 # bait file and autoformats it into prey and interaction files for SAINTexpress | 
|  | 7 # analysis | 
|  | 8 ####################################################################################### | 
|  | 9 # Copyright (C)  Brent Kuenzi. | 
|  | 10 # Permission is granted to copy, distribute and/or modify this document | 
|  | 11 # under the terms of the GNU Free Documentation License, Version 1.3 | 
|  | 12 # or any later version published by the Free Software Foundation; | 
|  | 13 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | 
|  | 14 # A copy of the license is included in the section entitled "GNU | 
|  | 15 # Free Documentation License". | 
|  | 16 ####################################################################################### | 
|  | 17 ## REQUIRED INPUT ## | 
|  | 18 | 
|  | 19 # 1) infile: MaxQuant "Samples Report" output | 
|  | 20 # 2) baitfile: SAINT formatted bait file generated in Galaxy | 
|  | 21 # 3) fasta_db: fasta database for use (defaults to SwissProt_HUMAN_2014_08.fasta) | 
|  | 22 # 4) prey: Y or N for generating a prey file | 
|  | 23 # 5) make_bait: String of bait names, assignment, and test or control boolean | 
|  | 24 ####################################################################################### | 
|  | 25 | 
|  | 26 | 
|  | 27 import sys | 
|  | 28 import os | 
|  | 29 | 
|  | 30 | 
|  | 31 mq_file = sys.argv[1] | 
|  | 32 ins_path = sys.argv[8] | 
|  | 33 names_path = str(ins_path) + r"uniprot_names.txt" | 
| 29 | 34 fasta_db = sys.argv[3] | 
|  | 35 | 
|  | 36 # Uses faster names list for filtering when default db used. | 
|  | 37 if fasta_db == "None": | 
|  | 38     cmd = (r"Rscript "+ str(ins_path) +"pre_process_protein_name_set.R " + str(mq_file) + | 
|  | 39         " " + str(names_path)) | 
|  | 40     os.system(cmd) | 
|  | 41 else: | 
|  | 42     cmd = (r"Rscript "+ str(ins_path) +"pre_process_protein_name_set.R " + str(mq_file) + | 
| 31 | 43         " " + str(fasta_db)) | 
| 29 | 44     os.system(cmd) | 
| 6 | 45 | 
| 31 | 46 | 
| 6 | 47 infile = "./tukeys_output.txt" | 
|  | 48 # The MaxQuant "Samples Report" output. | 
|  | 49 prey = sys.argv[2] | 
|  | 50 # Y or N boolean from Galaxy. | 
|  | 51 if fasta_db == "None": | 
|  | 52     fasta_db = str(ins_path)  + "SwissProt_HUMAN_2014_08.fasta" | 
|  | 53 make_bait = sys.argv[6] | 
|  | 54 bait_bool = sys.argv[9] | 
|  | 55 | 
|  | 56 def bait_create(baits, infile): | 
|  | 57     # Takes the Bait specified by the user and makes them into a Bait file and includes a | 
|  | 58     # check to make sure they are using valid baits. | 
|  | 59     baits = make_bait.split() | 
|  | 60     i = 0 | 
|  | 61     bait_file_tmp = open("bait.txt", "w") | 
|  | 62     order = [] | 
|  | 63     bait_cache = [] | 
|  | 64     while i < len(baits): | 
|  | 65         if baits[i+2] == "true": | 
|  | 66             T_C = "C" | 
|  | 67         else: | 
|  | 68             T_C = "T" | 
|  | 69         bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | 
|  | 70         read_infile = open(infile, "r") | 
|  | 71         for input_line in read_infile : | 
|  | 72             input_line = input_line.replace("\"", "") | 
|  | 73             input_line = input_line.replace(r"Intensity.", "") | 
|  | 74             # R coerces "-" into "." changes them back and remove Intensity from the Bait names. | 
|  | 75             input_line = input_line.replace(r".", r"-") | 
|  | 76             temp = input_line.split() | 
|  | 77             if "mapped_protein" in str(temp): | 
|  | 78                 if baits[i] in temp: | 
|  | 79                     number_bait = temp.index(str(baits[i])) | 
|  | 80                     number_bait = number_bait - 9 | 
|  | 81                     bait_cache.append((number_bait, str(bait_line))) | 
|  | 82                     # Locates the Bait names in the column names and then sets the Baits in the | 
|  | 83                     # correct order in the cache thus the - 9 because the baits start at the 9th | 
|  | 84                     # column. | 
|  | 85                 else: | 
|  | 86                     print "Error: bad bait " + str(baits[i]) | 
|  | 87                     sys.exit() | 
|  | 88             else: | 
|  | 89                 pass | 
|  | 90         i = i + 3 | 
|  | 91     # Writes cache to Bait file. | 
|  | 92     bait_cache.sort() | 
|  | 93     for line in bait_cache: | 
|  | 94         bait_file_tmp.write(line[1]) | 
|  | 95 | 
|  | 96     bait_file_tmp.close() | 
|  | 97 | 
|  | 98 | 
|  | 99 if bait_bool == 'false': | 
|  | 100     bait_create(make_bait, infile) | 
|  | 101     baitfile = "bait.txt" | 
|  | 102 else: | 
|  | 103     bait_temp_file = open(sys.argv[10], 'r') | 
|  | 104     bait_cache = bait_temp_file.readlines() | 
|  | 105     bait_file_tmp = open("bait.txt", "wr") | 
|  | 106     for line in bait_cache: | 
|  | 107         bait_file_tmp.write(line) | 
|  | 108     bait_file_tmp.close() | 
|  | 109     baitfile = "bait.txt" | 
|  | 110 | 
|  | 111 | 
|  | 112 class ReturnValue1(object): | 
|  | 113     def __init__(self, sequence, gene): | 
|  | 114         self.seqlength = sequence | 
|  | 115         self.genename = gene | 
|  | 116 class ReturnValue2(object): | 
|  | 117     def __init__(self, getdata, getproteins, getheader): | 
|  | 118         self.data = getdata | 
|  | 119         self.proteins = getproteins | 
|  | 120         self.header = getheader | 
|  | 121 | 
|  | 122 | 
|  | 123 def main(MaxQuant_input, make_bait): | 
|  | 124     #bait_check(baitfile, MaxQuant_input) | 
|  | 125     make_inter(MaxQuant_input) | 
|  | 126     if prey == 'true': | 
|  | 127         make_prey(MaxQuant_input) | 
|  | 128         no_error_inter(MaxQuant_input) | 
|  | 129         os.rename('prey.txt', sys.argv[5]) | 
|  | 130     elif prey == 'false': | 
| 35 | 131         if os.path.isfile('./error_proteins.txt') == True: | 
| 6 | 132             no_error_inter(MaxQuant_input) | 
|  | 133         pass | 
|  | 134     elif prey != 'true' or 'false': | 
|  | 135         sys.exit("Invalid Prey Argument: Y or N") | 
|  | 136     os.rename('inter.txt', sys.argv[4]) | 
|  | 137     os.rename("bait.txt", sys.argv[7]) | 
|  | 138 | 
|  | 139 | 
|  | 140 def get_info(uniprot_accession_in): | 
|  | 141     # Get aa lengths and gene name. | 
| 35 | 142     error = open('./error_proteins.txt', 'a+') | 
| 6 | 143     data = open(fasta_db, 'r') | 
|  | 144     data_lines = data.readlines() | 
|  | 145     db_len = len(data_lines) | 
|  | 146     seqlength = 0 | 
|  | 147     count = 0 | 
| 54 | 148     last_line = data_lines[-1] | 
| 6 | 149     for data_line in data_lines: | 
|  | 150         if ">sp" in data_line: | 
|  | 151             if uniprot_accession_in == data_line.split("|")[1]: | 
|  | 152                 match = count + 1 | 
|  | 153                 if 'GN=' in data_line: | 
|  | 154                     lst = data_line.split('GN=') | 
|  | 155                     lst2 = lst[1].split(' ') | 
|  | 156                     genename = lst2[0] | 
|  | 157                 if 'GN=' not in data_line: | 
|  | 158                     genename = 'NA' | 
|  | 159                 while ">sp" not in data_lines[match]: | 
|  | 160                     if match <= db_len: | 
|  | 161                         seqlength = seqlength + len(data_lines[match].strip()) | 
| 54 | 162                         if data_lines[match] == last_line: | 
|  | 163                             break | 
| 6 | 164                         match = match + 1 | 
|  | 165                     else: | 
|  | 166                         break | 
|  | 167                 return ReturnValue1(seqlength, genename) | 
|  | 168         count = count + 1 | 
|  | 169 | 
|  | 170 | 
|  | 171     if seqlength == 0: | 
|  | 172         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | 
|  | 173         error.close | 
|  | 174         seqlength = 'NA' | 
|  | 175         genename = 'NA' | 
|  | 176         return ReturnValue1(seqlength, genename) | 
|  | 177 | 
|  | 178 | 
|  | 179 def readtab(infile): | 
|  | 180     with open(infile, 'r') as input_file: | 
|  | 181     # Read in tab-delim text file. | 
|  | 182         output = [] | 
|  | 183         for input_line in input_file: | 
|  | 184             input_line = input_line.strip() | 
|  | 185             temp = input_line.split('\t') | 
|  | 186             output.append(temp) | 
|  | 187     return output | 
|  | 188 | 
|  | 189 | 
|  | 190 def read_MaxQuant(MaxQuant_input): | 
|  | 191     # Get data, proteins and header from MaxQuant output. | 
|  | 192     dupes = readtab(MaxQuant_input) | 
|  | 193     header_start = 0 | 
|  | 194     header = dupes[header_start] | 
|  | 195     for var_MQ in header: | 
|  | 196         var_MQ = var_MQ.replace(r"\"", "") | 
|  | 197         var_MQ = var_MQ.replace(r"Intensity.", r"") | 
|  | 198         var_MQ = var_MQ.replace(r".", r"-") | 
|  | 199     data = dupes[header_start+1:len(dupes)] | 
|  | 200     # Cut off blank line and END OF FILE. | 
|  | 201     proteins = [] | 
|  | 202     for protein in data: | 
|  | 203         proteins.append(protein[0]) | 
|  | 204     return ReturnValue2(data, proteins, header) | 
|  | 205 | 
|  | 206 | 
|  | 207 def make_inter(MaxQuant_input): | 
|  | 208     bait = readtab(baitfile) | 
|  | 209     data = read_MaxQuant(MaxQuant_input).data | 
|  | 210     header = read_MaxQuant(MaxQuant_input).header | 
|  | 211     proteins = read_MaxQuant(MaxQuant_input).proteins | 
|  | 212     bait_index = [] | 
|  | 213     for bait_item in bait: | 
|  | 214         bait_index.append(header.index("mapped_protein") + 1) | 
|  | 215         # Find just the baits defined in bait file. | 
|  | 216     with open('inter.txt', 'w') as y: | 
|  | 217         a = 0; l = 0 | 
|  | 218         for bb in bait: | 
|  | 219             for lst in data: | 
|  | 220                 y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' | 
|  | 221                         + lst[bait_index[l]] + '\n') | 
|  | 222                 a += 1 | 
|  | 223                 if a == len(proteins): | 
|  | 224                     a = 0; l += 1 | 
|  | 225 | 
|  | 226 | 
|  | 227 def make_prey(MaxQuant_input): | 
|  | 228     proteins = read_MaxQuant(MaxQuant_input).proteins | 
|  | 229     output_file = open("prey.txt", 'w') | 
|  | 230     for a in proteins: | 
|  | 231         a = a.replace("\n", "") | 
|  | 232         # Remove \n for input into function. | 
|  | 233         a = a.replace("\r", "") | 
|  | 234         # Ditto for \r. | 
|  | 235         seq = get_info(a).seqlength | 
|  | 236         GN = get_info(a).genename | 
|  | 237         if seq != 'NA': | 
| 34 | 238             if GN != 'NA': | 
|  | 239                 output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n") | 
| 6 | 240     output_file.close() | 
|  | 241 | 
|  | 242 | 
|  | 243 def no_error_inter(MaxQuant_input): | 
|  | 244     # Remake inter file without protein errors from Uniprot. | 
| 35 | 245     err = readtab("./error_proteins.txt") | 
| 6 | 246     bait = readtab(baitfile) | 
|  | 247     data = read_MaxQuant(MaxQuant_input).data | 
|  | 248     header = read_MaxQuant(MaxQuant_input).header | 
|  | 249     header = [MQ_var.replace(r"\"", "") for MQ_var in header] | 
|  | 250     header = [MQ_var.replace(r"Intensity.", r"") for MQ_var in header] | 
|  | 251     header = [MQ_var.replace(r".", r"-") for MQ_var in header] | 
|  | 252     bait_index = [] | 
|  | 253     for bait_item in bait: | 
|  | 254         bait_index.append(header.index(bait_item[0])) | 
|  | 255     proteins = read_MaxQuant(MaxQuant_input).proteins | 
|  | 256     errors = [] | 
| 40 | 257     valid_prots = [] | 
| 6 | 258     for e in err: | 
|  | 259         errors.append(e[0]) | 
| 36 | 260     for a in proteins: | 
|  | 261         a = a.replace("\n", "") | 
|  | 262         # Remove \n for input into function. | 
|  | 263         a = a.replace("\r", "") | 
|  | 264         # Ditto for \r. | 
|  | 265         seq = get_info(a).seqlength | 
|  | 266         GN = get_info(a).genename | 
| 40 | 267         if seq != 'NA': | 
|  | 268             if GN != 'NA': | 
|  | 269                 valid_prots.append(a) | 
| 6 | 270     with open('inter.txt', 'w') as input_file: | 
|  | 271         l = 0; a = 0 | 
|  | 272         for bb in bait: | 
|  | 273             for lst in data: | 
| 46 | 274                 if lst[0] in valid_prots: | 
|  | 275                     input_file.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + lst[0] + '\t' + lst[bait_index[l]] + '\n') | 
| 6 | 276                 a += 1 | 
| 47 | 277                 if a == len(proteins): | 
| 6 | 278                     l += 1; a = 0 | 
|  | 279 | 
|  | 280 | 
|  | 281 def bait_check(bait, MaxQuant_input): | 
|  | 282     # Check that bait names share header titles. | 
|  | 283     bait_in = readtab(bait) | 
|  | 284     header = read_MaxQuant(MaxQuant_input).header | 
|  | 285     for bait in bait_in: | 
|  | 286         if bait[0] not in header: | 
|  | 287             sys.exit("Bait must share header titles with MaxQuant output") | 
|  | 288 | 
|  | 289 if __name__ == '__main__': | 
|  | 290     main(infile, make_bait) |