| 9 | 1 ####################################################################################### | 
|  | 2 # Python-code: SAINT pre-processing from maxquant "Samples Report" output | 
|  | 3 # Author: Brent Kuenzi | 
|  | 4 ####################################################################################### | 
|  | 5 # This program reads in a raw maxquant "Samples Report" output and a user generated | 
|  | 6 # bait file and autoformats it into prey and interaction files for SAINTexpress | 
|  | 7 # analysis | 
|  | 8 ####################################################################################### | 
|  | 9 import sys | 
|  | 10 import urllib2 | 
|  | 11 import os | 
|  | 12 ####################################################################################### | 
|  | 13 ## REQUIRED INPUT ## | 
|  | 14 | 
|  | 15 # 1) infile: maxquant "Samples Report" output | 
|  | 16 # 2) baitfile: SAINT formatted bait file generated in Galaxy | 
|  | 17 # 3) prey: Y or N for generating a prey file (requires internet connection) | 
|  | 18 ####################################################################################### | 
|  | 19 mq_file = sys.argv[1] | 
| 22 | 20 ins_path = sys.argv[8] | 
|  | 21 names_path = str(ins_path) + r"uniprot_names.txt" | 
|  | 22 cmd = r"Rscript "+ str(ins_path) +"pre_process_protein_name_set.R " + str(mq_file) + " " + str(names_path) | 
| 9 | 23 os.system(cmd) | 
|  | 24 | 
|  | 25 infile = "./tukeys_output.txt" #maxquant "Samples Report" output | 
|  | 26 prey = sys.argv[2] # Y or N | 
|  | 27 fasta_db = sys.argv[3] | 
|  | 28 if fasta_db == "None": | 
| 22 | 29     fasta_db = str(ins_path)  + "SwissProt_HUMAN_2014_08.fasta" | 
| 9 | 30 make_bait= sys.argv[6] | 
|  | 31 | 
|  | 32 def bait_create(baits, infile): | 
|  | 33     #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits. | 
|  | 34     baits = make_bait.split() | 
|  | 35     i = 0 | 
|  | 36     bait_file_tmp = open("bait.txt", "wr") | 
|  | 37     order = [] | 
|  | 38     bait_cache = [] | 
|  | 39     while i < len(baits): | 
|  | 40         if baits[i+2] == "true": | 
|  | 41             T_C = "C" | 
|  | 42         else: | 
|  | 43             T_C = "T" | 
|  | 44         line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | 
|  | 45         q = open(infile,"r") | 
|  | 46         for line2 in q: | 
|  | 47            line2 = line2.replace("\"", "") | 
|  | 48            line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names. | 
|  | 49            line2 = line2.replace(r".", r"-") | 
|  | 50            temp = line2.split() | 
|  | 51            if "mapped_protein" in str(temp): | 
|  | 52                 #If the bait is in the original file then write to cache it if not exit. | 
|  | 53                 if baits[i] in temp: | 
|  | 54                     number_bait = temp.index(str(baits[i])) | 
|  | 55                     number_bait = number_bait - 9 | 
|  | 56                     bait_cache.append((number_bait, str(line1))) | 
|  | 57                 else: | 
|  | 58                     print "Error: bad bait " + str(baits[i]) | 
|  | 59                     sys.exit() | 
|  | 60            else: | 
|  | 61                 pass | 
|  | 62         i = i + 3 | 
|  | 63     #Writes cache to file. | 
|  | 64     bait_cache.sort() | 
|  | 65     for line in bait_cache: | 
|  | 66         bait_file_tmp.write(line[1]) | 
|  | 67 | 
|  | 68     bait_file_tmp.close() | 
|  | 69 | 
|  | 70 | 
|  | 71 baitfile = "bait.txt" | 
|  | 72 | 
|  | 73 class ReturnValue1(object): | 
|  | 74     def __init__(self, sequence, gene): | 
|  | 75      self.seqlength = sequence | 
|  | 76      self.genename = gene | 
|  | 77 class ReturnValue2(object): | 
|  | 78     def __init__(self, getdata, getproteins, getheader): | 
|  | 79         self.data = getdata | 
|  | 80         self.proteins = getproteins | 
|  | 81         self.header = getheader | 
|  | 82 | 
|  | 83 def main(maxquant_input, make_bait): | 
|  | 84     bait_create(make_bait, infile) | 
|  | 85     baitfile = "bait.txt" | 
|  | 86     #bait_check(baitfile, maxquant_input) | 
|  | 87     make_inter(maxquant_input) | 
|  | 88     if prey == 'true': | 
|  | 89         make_prey(maxquant_input) | 
|  | 90         no_error_inter(maxquant_input) | 
|  | 91         os.rename('prey.txt', sys.argv[5]) | 
|  | 92     elif prey == 'false': | 
|  | 93         if os.path.isfile('error proteins.txt') == True: | 
|  | 94             no_error_inter(maxquant_input) | 
|  | 95         pass | 
|  | 96     elif prey != 'true' or 'false': | 
|  | 97         sys.exit("Invalid Prey Argument: Y or N") | 
|  | 98     os.rename('inter.txt', sys.argv[4]) | 
|  | 99     os.rename("bait.txt", sys.argv[7]) | 
|  | 100 | 
|  | 101 | 
|  | 102 def get_info(uniprot_accession_in): # get aa lengths and gene name | 
|  | 103     error = open('error proteins.txt', 'a+') | 
|  | 104 #    while True: | 
|  | 105 #        i = 0 | 
|  | 106 #	try: | 
|  | 107 #            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") | 
|  | 108 #            break | 
|  | 109 #        except urllib2.HTTPError, err: | 
|  | 110 #            i = i + 1 | 
|  | 111 #            if i == 50: | 
|  | 112 #                sys.exit("More than 50 errors. Check your file or try again later.") | 
|  | 113 #            if err.code == 404: | 
|  | 114 #                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') | 
|  | 115 #                seqlength = 'NA' | 
|  | 116 #                genename = 'NA' | 
|  | 117 #                return ReturnValue1(seqlength, genename) | 
|  | 118 #            elif err.code == 302: | 
|  | 119 #                sys.exit("Request timed out. Check connection and try again.") | 
|  | 120 #            else: | 
|  | 121 #                sys.exit("Uniprot had some other error") | 
|  | 122 # | 
|  | 123 # | 
|  | 124 #    if lines == []: | 
|  | 125 #        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | 
|  | 126 #        error.close | 
|  | 127 # if lines == []: | 
|  | 128 #        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | 
|  | 129 #        error.close | 
|  | 130 #        seqlength = 'NA' | 
|  | 131 #        genename = 'NA' | 
|  | 132 #        return ReturnValue1(seqlength, genename) | 
|  | 133 #    if lines != []: | 
|  | 134 #        seqlength = 0 | 
|  | 135 #        header = lines[0] | 
|  | 136 #        for line in lines[1:]: | 
|  | 137 #            line = line.replace("\n","") # strip \n or else it gets counted in the length | 
|  | 138 #            seqlength += len(line) | 
|  | 139 #        if 'GN=' in header: | 
|  | 140 #            lst = header.split('GN=') | 
|  | 141 #            lst2 = lst[1].split(' ') | 
|  | 142 #            genename = lst2[0] | 
|  | 143 #            error.close | 
|  | 144 #            return ReturnValue1(seqlength, genename) | 
|  | 145 #        if 'GN=' not in header: | 
|  | 146 #            genename = 'NA' | 
|  | 147 #            error.close | 
|  | 148 #            return ReturnValue1(seqlength, genename) | 
|  | 149 | 
|  | 150     data = open(fasta_db,'r') | 
|  | 151     lines = data.readlines() | 
|  | 152     db_len = len(lines) | 
|  | 153     seqlength = 0 | 
|  | 154     count = 0 | 
|  | 155     for i in lines: | 
|  | 156         if ">sp" in i: | 
|  | 157             if uniprot_accession_in == i.split("|")[1]: | 
|  | 158                 match = count+1 | 
|  | 159                 if 'GN=' in i: | 
|  | 160                     lst = i.split('GN=') | 
|  | 161                     lst2 = lst[1].split(' ') | 
|  | 162                     genename = lst2[0] | 
|  | 163                 if 'GN=' not in i: | 
|  | 164                     genename = 'NA' | 
|  | 165                 while ">sp" not in lines[match]: | 
|  | 166                     if match <= db_len: | 
|  | 167                         seqlength = seqlength + len(lines[match].strip()) | 
|  | 168                         match = match + 1 | 
|  | 169                     else: | 
|  | 170                         break | 
|  | 171                 return ReturnValue1(seqlength, genename) | 
|  | 172         count = count + 1 | 
|  | 173 | 
|  | 174 | 
|  | 175     if seqlength == 0: | 
|  | 176         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | 
|  | 177         error.close | 
|  | 178         seqlength = 'NA' | 
|  | 179         genename = 'NA' | 
|  | 180         return ReturnValue1(seqlength, genename) | 
|  | 181 | 
|  | 182 | 
|  | 183 def readtab(infile): | 
|  | 184     with open(infile,'r') as x: # read in tab-delim text | 
|  | 185         output = [] | 
|  | 186         for line in x: | 
|  | 187             line = line.strip() | 
|  | 188             temp = line.split('\t') | 
|  | 189             output.append(temp) | 
|  | 190     return output | 
|  | 191 def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output | 
|  | 192     dupes = readtab(maxquant_input) | 
|  | 193     header_start = 0 | 
|  | 194     header = dupes[header_start] | 
|  | 195     for i in header: | 
|  | 196         i = i.replace(r"\"", "") | 
|  | 197         i = i.replace(r"Intensity.", r"") | 
|  | 198         i = i.replace(r".", r"-") | 
|  | 199     data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE | 
|  | 200     proteins = [] | 
|  | 201     for protein in data: | 
|  | 202         proteins.append(protein[0]) | 
|  | 203     return ReturnValue2(data, proteins, header) | 
|  | 204 def make_inter(maxquant_input): | 
|  | 205     bait = readtab(baitfile) | 
|  | 206     data = read_maxquant(maxquant_input).data | 
|  | 207     header = read_maxquant(maxquant_input).header | 
|  | 208     proteins = read_maxquant(maxquant_input).proteins | 
|  | 209     bait_index = [] | 
|  | 210     for i in bait: | 
|  | 211         bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file | 
|  | 212     with open('inter.txt', 'w') as y: | 
|  | 213             a = 0; l=0 | 
|  | 214             for bb in bait: | 
|  | 215                 for lst in data: | 
|  | 216                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') | 
|  | 217                     a+=1 | 
|  | 218                     if a == len(proteins): | 
|  | 219                         a = 0; l+=1 | 
|  | 220 def make_prey(maxquant_input): | 
|  | 221     proteins = read_maxquant(maxquant_input).proteins | 
|  | 222     output_file = open("prey.txt",'w') | 
|  | 223     for a in proteins: | 
|  | 224         a = a.replace("\n","") # remove \n for input into function | 
|  | 225         a = a.replace("\r","") # ditto for \r | 
|  | 226         seq = get_info(a).seqlength | 
|  | 227         GN = get_info(a).genename | 
|  | 228         if seq != 'NA': | 
|  | 229             output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n") | 
|  | 230     output_file.close() | 
|  | 231 def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot | 
|  | 232     err = readtab("error proteins.txt") | 
|  | 233     bait = readtab(baitfile) | 
|  | 234     data = read_maxquant(maxquant_input).data | 
|  | 235     header = read_maxquant(maxquant_input).header | 
|  | 236     header = [i.replace(r"\"", "") for i in header] | 
|  | 237     header = [i.replace(r"Intensity.", r"") for i in header] | 
|  | 238     header = [i.replace(r".", r"-") for i in header] | 
|  | 239     print header | 
|  | 240     bait_index = [] | 
|  | 241     for i in bait: | 
|  | 242         bait_index.append(header.index(i[0])) | 
|  | 243     proteins = read_maxquant(maxquant_input).proteins | 
|  | 244     errors = [] | 
|  | 245     for e in err: | 
|  | 246         errors.append(e[0]) | 
|  | 247     with open('inter.txt', 'w') as y: | 
|  | 248         l = 0; a = 0 | 
|  | 249         for bb in bait: | 
|  | 250             for lst in data: | 
|  | 251                 if proteins[a] not in errors: | 
|  | 252                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') | 
|  | 253                 a+=1 | 
|  | 254                 if a == len(proteins): | 
|  | 255                     l += 1; a = 0 | 
|  | 256 def bait_check(bait, maxquant_input): # check that bait names share header titles | 
|  | 257     bait_in = readtab(bait) | 
|  | 258     header = read_maxquant(maxquant_input).header | 
|  | 259     for i in bait_in: | 
|  | 260         if i[0] not in header: | 
|  | 261             sys.exit("Bait must share header titles with maxquant output") | 
|  | 262 | 
|  | 263 if __name__ == '__main__': | 
|  | 264     main(infile, make_bait) |