| 26 | 1 ####################################################################################### | 
|  | 2 # Python-code: SAINT pre-processing from Scaffold "Samples Report" output | 
|  | 3 # Author: Brent Kuenzi | 
|  | 4 ####################################################################################### | 
|  | 5 # This program reads in a raw Scaffold "Samples Report" output and a user generated | 
|  | 6 # bait file and autoformats it into prey and interaction files for SAINTexpress | 
|  | 7 # analysis | 
|  | 8 ####################################################################################### | 
|  | 9 # Copyright (C)  Brent Kuenzi. | 
|  | 10 # Permission is granted to copy, distribute and/or modify this document | 
|  | 11 # under the terms of the GNU Free Documentation License, Version 1.3 | 
|  | 12 # or any later version published by the Free Software Foundation; | 
|  | 13 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | 
|  | 14 # A copy of the license is included in the section entitled "GNU | 
|  | 15 # Free Documentation License". | 
|  | 16 ####################################################################################### | 
|  | 17 ## REQUIRED INPUT ## | 
|  | 18 | 
|  | 19 # 1) infile: Scaffold "Samples Report" output | 
|  | 20 # 2) baitfile: SAINT formatted bait file generated in Galaxy | 
|  | 21 # 3) fasta_db: fasta database for use (defaults to SwissProt_HUMAN_2014_08.fasta) | 
|  | 22 # 4) prey: Y or N for generating a prey file | 
|  | 23 # 5) make_bait: String of bait names, assignment, and test or control boolean | 
|  | 24 ####################################################################################### | 
|  | 25 | 
|  | 26 import sys | 
|  | 27 import os.path | 
|  | 28 | 
|  | 29 | 
|  | 30 infile = sys.argv[1] | 
|  | 31 #Scaffold "Samples Report" output. | 
|  | 32 prey = sys.argv[2] | 
|  | 33 # Y or N boolean from Galaxy. | 
|  | 34 fasta_db = sys.argv[3] | 
|  | 35 tool_path = sys.argv[8] | 
|  | 36 if fasta_db == "None": | 
|  | 37     fasta_db = str(tool_path)  + "/SwissProt_HUMAN_2014_08.fasta" | 
|  | 38 make_bait = sys.argv[6] | 
|  | 39 bait_bool = sys.argv[9] | 
|  | 40 | 
|  | 41 | 
|  | 42 def bait_create(baits, infile): | 
|  | 43     # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. | 
|  | 44     baits = make_bait.split() | 
|  | 45     i = 0 | 
|  | 46     bait_file_tmp = open("bait.txt", "w") | 
|  | 47     order = [] | 
|  | 48     bait_cache = [] | 
|  | 49     while i < len(baits): | 
|  | 50         if baits[i+2] == "true": | 
|  | 51             T_C = "C" | 
|  | 52         else: | 
|  | 53             T_C = "T" | 
|  | 54         bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | 
|  | 55         read_infile = open(infile, "r") | 
|  | 56         for input_line in read_infile: | 
|  | 57             input_line = input_line.strip() | 
|  | 58             temp = input_line.split('\t') | 
|  | 59             if "Quantitative Variance" in str(temp): | 
|  | 60                 if baits[i] in temp: | 
|  | 61                     number_bait = temp.index(str(baits[i])) | 
|  | 62                     number_bait = number_bait - 9 | 
|  | 63                     bait_cache.append((number_bait, str(bait_line))) | 
|  | 64                     # Locates the Bait names in the column names and then sets the Baits in the | 
|  | 65                     # correct order in the cache thus the - 9 because the baits start at the 9th | 
|  | 66                     # column. | 
|  | 67                 else: | 
|  | 68                     print "Error: bad bait " + str(baits[i]) | 
|  | 69                     sys.exit() | 
|  | 70             else: | 
|  | 71                 pass | 
|  | 72         i = i + 3 | 
|  | 73 | 
|  | 74     bait_cache.sort() | 
|  | 75     for cache_line in bait_cache: | 
|  | 76         bait_file_tmp.write(cache_line[1]) | 
|  | 77 | 
|  | 78     bait_file_tmp.close() | 
|  | 79 | 
|  | 80 if bait_bool == 'false': | 
|  | 81     bait_create(make_bait, infile) | 
|  | 82     baitfile = "bait.txt" | 
|  | 83 else: | 
|  | 84     bait_temp_file = open(sys.argv[10], 'r') | 
|  | 85     bait_cache = bait_temp_file.readlines() | 
|  | 86     print bait_cache | 
|  | 87     bait_file_tmp = open("bait.txt", "wr") | 
|  | 88     for cache_line in bait_cache: | 
|  | 89         bait_file_tmp.write(cache_line) | 
|  | 90     bait_file_tmp.close() | 
|  | 91     baitfile = "bait.txt" | 
|  | 92 | 
|  | 93 | 
|  | 94 class ReturnValue1(object): | 
|  | 95     def __init__(self, sequence, gene): | 
|  | 96         self.seqlength = sequence | 
|  | 97         self.genename = gene | 
|  | 98 | 
|  | 99 | 
|  | 100 class ReturnValue2(object): | 
|  | 101     def __init__(self, getdata, getproteins, getheader): | 
|  | 102         self.data = getdata | 
|  | 103         self.proteins = getproteins | 
|  | 104         self.header = getheader | 
|  | 105 | 
|  | 106 | 
|  | 107 def main(Scaffold_input, baits): | 
|  | 108     bait_check(baitfile, Scaffold_input) | 
|  | 109     make_inter(Scaffold_input) | 
|  | 110     if prey == 'true': | 
|  | 111         make_prey(Scaffold_input) | 
|  | 112         no_error_inter(Scaffold_input) | 
|  | 113         os.rename('prey.txt', sys.argv[5]) | 
|  | 114     elif prey == 'false': | 
|  | 115         if os.path.isfile('error proteins.txt') == True: | 
|  | 116             no_error_inter(Scaffold_input) | 
|  | 117         pass | 
|  | 118     elif prey != 'true' or 'false': | 
|  | 119         sys.exit("Invalid Prey Argument: Y or N") | 
|  | 120 | 
|  | 121 | 
|  | 122 def get_info(uniprot_accession_in): | 
|  | 123     # Get aminoacid lengths and gene name. | 
|  | 124     error = open('error proteins.txt', 'a+') | 
|  | 125     data = open(fasta_db, 'r') | 
|  | 126     data_lines = data.readlines() | 
|  | 127     db_len = len(data_lines) | 
|  | 128     seqlength = 0 | 
|  | 129     count = 0 | 
|  | 130     for data_line in data_lines: | 
|  | 131         if ">sp" in data_line: | 
|  | 132             namer = data_line.split("|")[2] | 
|  | 133         if uniprot_accession_in == data_line.split("|")[1]: | 
|  | 134             match = count + 1 | 
|  | 135             if 'GN=' in data_line: | 
|  | 136                 lst = data_linedata_line.split('GN=') | 
|  | 137                 lst2 = lst[1].split(' ') | 
|  | 138                 genename = lst2[0] | 
|  | 139             if 'GN=' not in data_line: | 
|  | 140                 genename = 'NA' | 
|  | 141             while ">sp" not in data_lines[match]: | 
|  | 142                 if match <= db_len: | 
|  | 143                     seqlength = seqlength + len(data_lines[match].strip()) | 
|  | 144                     match = match + 1 | 
|  | 145                 else: | 
|  | 146                     break | 
|  | 147             return ReturnValue1(seqlength, genename) | 
|  | 148         elif uniprot_accession_in == namer.split(" ")[0]: | 
|  | 149             match = count + 1 | 
|  | 150             # Ensures consistent spacing throughout. | 
|  | 151             if 'GN=' in data_line: | 
|  | 152                 lst = data_line.split('GN=') | 
|  | 153                 lst2 = lst[1].split(' ') | 
|  | 154                 genename = lst2[0] | 
|  | 155             if 'GN=' not in data_line: | 
|  | 156                 genename = 'NA' | 
|  | 157             while ">sp" not in lines[match]: | 
|  | 158                 if match <= db_len: | 
|  | 159                     seqlength = seqlength + len(lines[match].strip()) | 
|  | 160                     match = match + 1 | 
|  | 161                 else: | 
|  | 162                     break | 
|  | 163             return ReturnValue1(seqlength, genename) | 
|  | 164         count = count + 1 | 
|  | 165     if seqlength == 0: | 
|  | 166         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | 
|  | 167         error.close | 
|  | 168         seqlength = 'NA' | 
|  | 169         genename = 'NA' | 
|  | 170         return ReturnValue1(seqlength, genename) | 
|  | 171 | 
|  | 172 | 
|  | 173 def readtab(infile): | 
|  | 174     with open(infile, 'r') as input_file: | 
|  | 175     # read in tab-delim text | 
|  | 176         output = [] | 
|  | 177         for input_line in input_file: | 
|  | 178             input_line = input_line.strip() | 
|  | 179             temp = input_line.split('\t') | 
|  | 180             output.append(temp) | 
|  | 181     return output | 
|  | 182 | 
|  | 183 | 
|  | 184 def read_Scaffold(Scaffold_input): | 
|  | 185     # Get data, proteins and header from Scaffold output | 
|  | 186     dupes = readtab(Scaffold_input) | 
|  | 187     cnt = 0 | 
|  | 188     for Scaffold_line in dupes: | 
|  | 189         cnt += 1 | 
|  | 190         if Scaffold_line[0] == '#': | 
|  | 191         # Finds the start of second header. | 
|  | 192             header_start = cnt-1 | 
|  | 193     header = dupes[header_start] | 
|  | 194     prot_start = header.index("Accession Number") | 
|  | 195     data = dupes[header_start+1:len(dupes)-2] | 
|  | 196     # Cut off blank line and END OF FILE. | 
|  | 197     proteins = [] | 
|  | 198     for Scaffold_line in data: | 
|  | 199         Scaffold_line[4] = Scaffold_line[4].split()[0] | 
|  | 200         # Removes the (+##) that sometimes is attached. | 
|  | 201     for protein in data: | 
|  | 202         proteins.append(protein[prot_start]) | 
|  | 203     return ReturnValue2(data, proteins, header) | 
|  | 204 | 
|  | 205 | 
|  | 206 def make_inter(Scaffold_input): | 
|  | 207     bait = readtab(baitfile) | 
|  | 208     data = read_Scaffold(Scaffold_input).data | 
|  | 209     header = read_Scaffold(Scaffold_input).header | 
|  | 210     proteins = read_Scaffold(Scaffold_input).proteins | 
|  | 211     bait_index = [] | 
|  | 212     for bait_line in bait: | 
|  | 213         bait_index.append(header.index(bait_line[0])) | 
|  | 214         # Find just the baits defined in bait file. | 
|  | 215     with open('inter.txt', 'w') as inter_file: | 
|  | 216         a = 0; l = 0 | 
|  | 217         for bb in bait: | 
|  | 218             for lst in data: | 
|  | 219                 inter_file.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' | 
|  | 220                         + lst[bait_index[l]] + '\n') | 
|  | 221                 a += 1 | 
|  | 222                 if a == len(proteins): | 
|  | 223                     a = 0; l += 1 | 
|  | 224 | 
|  | 225 | 
|  | 226 def make_prey(Scaffold_input): | 
|  | 227     proteins = read_Scaffold(Scaffold_input).proteins | 
|  | 228     output_file = open("prey.txt", 'w') | 
|  | 229     for protein in proteins: | 
|  | 230         protein = protein.replace("\n", "") | 
|  | 231         # Remove \n for input into function. | 
|  | 232         protein = protein.replace("\r", "") | 
|  | 233         # Ditto for \r. | 
|  | 234         seq = get_info(protein).seqlength | 
|  | 235         GN = get_info(protein).genename | 
|  | 236         if seq != 'NA': | 
|  | 237             output_file.write(protein + "\t" + str(seq) + "\t" + str(GN) + "\n") | 
|  | 238     output_file.close() | 
|  | 239 | 
|  | 240 | 
|  | 241 def no_error_inter(Scaffold_input): | 
|  | 242     # Remake inter file without protein errors from Uniprot. | 
|  | 243     err = readtab("error proteins.txt") | 
|  | 244     bait = readtab(baitfile) | 
|  | 245     data = read_Scaffold(Scaffold_input).data | 
|  | 246     header = read_Scaffold(Scaffold_input).header | 
|  | 247     bait_index = [] | 
|  | 248     for bait_line in bait: | 
|  | 249         bait_index.append(header.index(bait_line[0])) | 
|  | 250     proteins = read_Scaffold(Scaffold_input).proteins | 
|  | 251     errors = [] | 
|  | 252     for e in err: | 
|  | 253         errors.append(e[0]) | 
|  | 254     with open('inter.txt', 'w') as y: | 
|  | 255         l = 0; a = 0 | 
|  | 256         for bb in bait: | 
|  | 257             for lst in data: | 
|  | 258                 if proteins[a] not in errors: | 
|  | 259                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' | 
|  | 260                             + lst[bait_index[l]] + '\n') | 
|  | 261                 a += 1 | 
|  | 262                 if a == len(proteins): | 
|  | 263                     l += 1; a = 0 | 
|  | 264 | 
|  | 265 | 
|  | 266 def bait_check(bait, Scaffold_input): | 
|  | 267     # Check that bait names share Scaffold header titles. | 
|  | 268     bait_in = readtab(bait) | 
|  | 269     header = read_Scaffold(Scaffold_input).header | 
|  | 270     for i in bait_in: | 
|  | 271         if i[0] not in header: | 
|  | 272             sys.exit("Bait must share header titles with Scaffold output") | 
|  | 273 | 
|  | 274 if __name__ == '__main__': | 
|  | 275     main(infile, baitfile) | 
|  | 276 | 
|  | 277 os.rename("inter.txt", sys.argv[4]) | 
|  | 278 os.rename("bait.txt", sys.argv[7]) |