| 
28
 | 
     1 #######################################################################################
 | 
| 
 | 
     2 # Python-code: SAINT pre-processing from MaxQuant "Samples Report" output
 | 
| 
 | 
     3 # Author: Brent Kuenzi
 | 
| 
 | 
     4 #######################################################################################
 | 
| 
 | 
     5 # This program reads in a raw MaxQuant "Samples Report" output and a user generated
 | 
| 
 | 
     6 # bait file and autoformats it into prey and interaction files for SAINTexpress
 | 
| 
 | 
     7 # analysis
 | 
| 
 | 
     8 #######################################################################################
 | 
| 
 | 
     9 # Copyright (C)  Brent Kuenzi.
 | 
| 
 | 
    10 # Permission is granted to copy, distribute and/or modify this document
 | 
| 
 | 
    11 # under the terms of the GNU Free Documentation License, Version 1.3
 | 
| 
 | 
    12 # or any later version published by the Free Software Foundation;
 | 
| 
 | 
    13 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
 | 
| 
 | 
    14 # A copy of the license is included in the section entitled "GNU
 | 
| 
 | 
    15 # Free Documentation License".
 | 
| 
 | 
    16 #######################################################################################
 | 
| 
 | 
    17 ## REQUIRED INPUT ##
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 # 1) infile: MaxQuant "Samples Report" output
 | 
| 
 | 
    20 # 2) baitfile: SAINT formatted bait file generated in Galaxy
 | 
| 
 | 
    21 # 3) fasta_db: fasta database for use (defaults to SwissProt_HUMAN_2014_08.fasta)
 | 
| 
 | 
    22 # 4) prey: Y or N for generating a prey file
 | 
| 
 | 
    23 # 5) make_bait: String of bait names, assignment, and test or control boolean
 | 
| 
 | 
    24 #######################################################################################
 | 
| 
 | 
    25 
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 import sys
 | 
| 
 | 
    28 import os
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 mq_file = sys.argv[1]
 | 
| 
 | 
    32 ins_path = sys.argv[8]
 | 
| 
 | 
    33 names_path = str(ins_path) + r"uniprot_names.txt"
 | 
| 
 | 
    34 cmd = (r"Rscript "+ str(ins_path) +"pre_process_protein_name_set.R " + str(mq_file) +
 | 
| 
 | 
    35        " " + str(names_path))
 | 
| 
 | 
    36 os.system(cmd)
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 infile = "./tukeys_output.txt" 
 | 
| 
 | 
    39 # The MaxQuant "Samples Report" output.
 | 
| 
 | 
    40 prey = sys.argv[2] 
 | 
| 
 | 
    41 # Y or N boolean from Galaxy.
 | 
| 
 | 
    42 fasta_db = sys.argv[3]
 | 
| 
 | 
    43 if fasta_db == "None":
 | 
| 
 | 
    44     fasta_db = str(ins_path)  + "SwissProt_HUMAN_2014_08.fasta"
 | 
| 
 | 
    45 make_bait = sys.argv[6]
 | 
| 
 | 
    46 bait_bool = sys.argv[9]
 | 
| 
 | 
    47 
 | 
| 
 | 
    48 def bait_create(baits, infile):
 | 
| 
 | 
    49     # Takes the Bait specified by the user and makes them into a Bait file and includes a
 | 
| 
 | 
    50     # check to make sure they are using valid baits.
 | 
| 
 | 
    51     baits = make_bait.split()
 | 
| 
 | 
    52     i = 0
 | 
| 
 | 
    53     bait_file_tmp = open("bait.txt", "w")
 | 
| 
 | 
    54     order = []
 | 
| 
 | 
    55     bait_cache = []
 | 
| 
 | 
    56     while i < len(baits):
 | 
| 
 | 
    57         if baits[i+2] == "true":
 | 
| 
 | 
    58             T_C = "C"
 | 
| 
 | 
    59         else:
 | 
| 
 | 
    60             T_C = "T"
 | 
| 
 | 
    61         bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
 | 
| 
 | 
    62         read_infile = open(infile, "r")
 | 
| 
 | 
    63         for input_line in read_infile :
 | 
| 
 | 
    64             input_line = input_line.replace("\"", "")
 | 
| 
 | 
    65             input_line = input_line.replace(r"Intensity.", "")
 | 
| 
 | 
    66             # R coerces "-" into "." changes them back and remove Intensity from the Bait names.
 | 
| 
 | 
    67             input_line = input_line.replace(r".", r"-")
 | 
| 
 | 
    68             temp = input_line.split()
 | 
| 
 | 
    69             if "mapped_protein" in str(temp):
 | 
| 
 | 
    70                 if baits[i] in temp:
 | 
| 
 | 
    71                     number_bait = temp.index(str(baits[i]))
 | 
| 
 | 
    72                     number_bait = number_bait - 9
 | 
| 
 | 
    73                     bait_cache.append((number_bait, str(bait_line)))
 | 
| 
 | 
    74                     # Locates the Bait names in the column names and then sets the Baits in the 
 | 
| 
 | 
    75                     # correct order in the cache thus the - 9 because the baits start at the 9th
 | 
| 
 | 
    76                     # column.
 | 
| 
 | 
    77                 else:
 | 
| 
 | 
    78                     print "Error: bad bait " + str(baits[i])
 | 
| 
 | 
    79                     sys.exit()
 | 
| 
 | 
    80             else:
 | 
| 
 | 
    81                 pass
 | 
| 
 | 
    82         i = i + 3
 | 
| 
 | 
    83     # Writes cache to Bait file.
 | 
| 
 | 
    84     bait_cache.sort()
 | 
| 
 | 
    85     for line in bait_cache:
 | 
| 
 | 
    86         bait_file_tmp.write(line[1])
 | 
| 
 | 
    87 
 | 
| 
 | 
    88     bait_file_tmp.close()
 | 
| 
 | 
    89 
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 if bait_bool == 'false':
 | 
| 
 | 
    92     bait_create(make_bait, infile)
 | 
| 
 | 
    93     baitfile = "bait.txt"
 | 
| 
 | 
    94 else:
 | 
| 
 | 
    95     bait_temp_file = open(sys.argv[10], 'r')
 | 
| 
 | 
    96     bait_cache = bait_temp_file.readlines()
 | 
| 
 | 
    97     bait_file_tmp = open("bait.txt", "wr")
 | 
| 
 | 
    98     for line in bait_cache:
 | 
| 
 | 
    99         bait_file_tmp.write(line)
 | 
| 
 | 
   100     bait_file_tmp.close()
 | 
| 
 | 
   101     baitfile = "bait.txt"
 | 
| 
 | 
   102 
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 class ReturnValue1(object):
 | 
| 
 | 
   105     def __init__(self, sequence, gene):
 | 
| 
 | 
   106         self.seqlength = sequence
 | 
| 
 | 
   107         self.genename = gene
 | 
| 
 | 
   108 class ReturnValue2(object):
 | 
| 
 | 
   109     def __init__(self, getdata, getproteins, getheader):
 | 
| 
 | 
   110         self.data = getdata
 | 
| 
 | 
   111         self.proteins = getproteins
 | 
| 
 | 
   112         self.header = getheader
 | 
| 
 | 
   113 
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 def main(MaxQuant_input, make_bait):
 | 
| 
 | 
   116     #bait_check(baitfile, MaxQuant_input)
 | 
| 
 | 
   117     make_inter(MaxQuant_input)
 | 
| 
 | 
   118     if prey == 'true':
 | 
| 
 | 
   119         make_prey(MaxQuant_input)
 | 
| 
 | 
   120         no_error_inter(MaxQuant_input)
 | 
| 
 | 
   121         os.rename('prey.txt', sys.argv[5])
 | 
| 
 | 
   122     elif prey == 'false':
 | 
| 
 | 
   123         if os.path.isfile('error proteins.txt') == True:
 | 
| 
 | 
   124             no_error_inter(MaxQuant_input)
 | 
| 
 | 
   125         pass
 | 
| 
 | 
   126     elif prey != 'true' or 'false':
 | 
| 
 | 
   127         sys.exit("Invalid Prey Argument: Y or N")
 | 
| 
 | 
   128     os.rename('inter.txt', sys.argv[4])
 | 
| 
 | 
   129     os.rename("bait.txt", sys.argv[7])
 | 
| 
 | 
   130 
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 def get_info(uniprot_accession_in): 
 | 
| 
 | 
   133     # Get aa lengths and gene name.
 | 
| 
 | 
   134     error = open('error proteins.txt', 'a+')
 | 
| 
 | 
   135     data = open(fasta_db, 'r')
 | 
| 
 | 
   136     data_lines = data.readlines()
 | 
| 
 | 
   137     db_len = len(data_lines)
 | 
| 
 | 
   138     seqlength = 0
 | 
| 
 | 
   139     count = 0
 | 
| 
 | 
   140     for data_line in data_lines:
 | 
| 
 | 
   141         if ">sp" in data_line:
 | 
| 
 | 
   142             if uniprot_accession_in == data_line.split("|")[1]:
 | 
| 
 | 
   143                 match = count + 1
 | 
| 
 | 
   144                 if 'GN=' in data_line:
 | 
| 
 | 
   145                     lst = data_line.split('GN=')
 | 
| 
 | 
   146                     lst2 = lst[1].split(' ')
 | 
| 
 | 
   147                     genename = lst2[0]
 | 
| 
 | 
   148                 if 'GN=' not in data_line:
 | 
| 
 | 
   149                     genename = 'NA'
 | 
| 
 | 
   150                 while ">sp" not in data_lines[match]:
 | 
| 
 | 
   151                     if match <= db_len:
 | 
| 
 | 
   152                         seqlength = seqlength + len(data_lines[match].strip())
 | 
| 
 | 
   153                         match = match + 1
 | 
| 
 | 
   154                     else:
 | 
| 
 | 
   155                         break
 | 
| 
 | 
   156                 return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   157         count = count + 1
 | 
| 
 | 
   158 
 | 
| 
 | 
   159 
 | 
| 
 | 
   160     if seqlength == 0:
 | 
| 
 | 
   161         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
 | 
| 
 | 
   162         error.close
 | 
| 
 | 
   163         seqlength = 'NA'
 | 
| 
 | 
   164         genename = 'NA'
 | 
| 
 | 
   165         return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   166 
 | 
| 
 | 
   167 
 | 
| 
 | 
   168 def readtab(infile):
 | 
| 
 | 
   169     with open(infile, 'r') as input_file:
 | 
| 
 | 
   170     # Read in tab-delim text file.
 | 
| 
 | 
   171         output = []
 | 
| 
 | 
   172         for input_line in input_file:
 | 
| 
 | 
   173             input_line = input_line.strip()
 | 
| 
 | 
   174             temp = input_line.split('\t')
 | 
| 
 | 
   175             output.append(temp)
 | 
| 
 | 
   176     return output
 | 
| 
 | 
   177 
 | 
| 
 | 
   178 
 | 
| 
 | 
   179 def read_MaxQuant(MaxQuant_input):
 | 
| 
 | 
   180     # Get data, proteins and header from MaxQuant output.
 | 
| 
 | 
   181     dupes = readtab(MaxQuant_input)
 | 
| 
 | 
   182     header_start = 0
 | 
| 
 | 
   183     header = dupes[header_start]
 | 
| 
 | 
   184     for var_MQ in header:
 | 
| 
 | 
   185         var_MQ = var_MQ.replace(r"\"", "")
 | 
| 
 | 
   186         var_MQ = var_MQ.replace(r"Intensity.", r"")
 | 
| 
 | 
   187         var_MQ = var_MQ.replace(r".", r"-")
 | 
| 
 | 
   188     data = dupes[header_start+1:len(dupes)]
 | 
| 
 | 
   189     # Cut off blank line and END OF FILE.
 | 
| 
 | 
   190     proteins = []
 | 
| 
 | 
   191     for protein in data:
 | 
| 
 | 
   192         proteins.append(protein[0])
 | 
| 
 | 
   193     return ReturnValue2(data, proteins, header)
 | 
| 
 | 
   194 
 | 
| 
 | 
   195 
 | 
| 
 | 
   196 def make_inter(MaxQuant_input):
 | 
| 
 | 
   197     bait = readtab(baitfile)
 | 
| 
 | 
   198     data = read_MaxQuant(MaxQuant_input).data
 | 
| 
 | 
   199     header = read_MaxQuant(MaxQuant_input).header
 | 
| 
 | 
   200     proteins = read_MaxQuant(MaxQuant_input).proteins
 | 
| 
 | 
   201     bait_index = []
 | 
| 
 | 
   202     for bait_item in bait:
 | 
| 
 | 
   203         bait_index.append(header.index("mapped_protein") + 1)
 | 
| 
 | 
   204         # Find just the baits defined in bait file.
 | 
| 
 | 
   205     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   206         a = 0; l = 0
 | 
| 
 | 
   207         for bb in bait:
 | 
| 
 | 
   208             for lst in data:
 | 
| 
 | 
   209                 y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t'
 | 
| 
 | 
   210                         + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   211                 a += 1
 | 
| 
 | 
   212                 if a == len(proteins):
 | 
| 
 | 
   213                     a = 0; l += 1
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 
 | 
| 
 | 
   216 def make_prey(MaxQuant_input):
 | 
| 
 | 
   217     proteins = read_MaxQuant(MaxQuant_input).proteins
 | 
| 
 | 
   218     output_file = open("prey.txt", 'w')
 | 
| 
 | 
   219     for a in proteins:
 | 
| 
 | 
   220         a = a.replace("\n", "")
 | 
| 
 | 
   221         # Remove \n for input into function.
 | 
| 
 | 
   222         a = a.replace("\r", "")
 | 
| 
 | 
   223         # Ditto for \r.
 | 
| 
 | 
   224         seq = get_info(a).seqlength
 | 
| 
 | 
   225         GN = get_info(a).genename
 | 
| 
 | 
   226         if seq != 'NA':
 | 
| 
 | 
   227             output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
 | 
| 
 | 
   228     output_file.close()
 | 
| 
 | 
   229 
 | 
| 
 | 
   230 
 | 
| 
 | 
   231 def no_error_inter(MaxQuant_input):
 | 
| 
 | 
   232     # Remake inter file without protein errors from Uniprot.
 | 
| 
 | 
   233     err = readtab("error proteins.txt")
 | 
| 
 | 
   234     bait = readtab(baitfile)
 | 
| 
 | 
   235     data = read_MaxQuant(MaxQuant_input).data
 | 
| 
 | 
   236     header = read_MaxQuant(MaxQuant_input).header
 | 
| 
 | 
   237     header = [MQ_var.replace(r"\"", "") for MQ_var in header]
 | 
| 
 | 
   238     header = [MQ_var.replace(r"Intensity.", r"") for MQ_var in header]
 | 
| 
 | 
   239     header = [MQ_var.replace(r".", r"-") for MQ_var in header]
 | 
| 
 | 
   240     bait_index = []
 | 
| 
 | 
   241     for bait_item in bait:
 | 
| 
 | 
   242         bait_index.append(header.index(bait_item[0]))
 | 
| 
 | 
   243     proteins = read_MaxQuant(MaxQuant_input).proteins
 | 
| 
 | 
   244     errors = []
 | 
| 
 | 
   245     for e in err:
 | 
| 
 | 
   246         errors.append(e[0])
 | 
| 
 | 
   247     with open('inter.txt', 'w') as input_file:
 | 
| 
 | 
   248         l = 0; a = 0
 | 
| 
 | 
   249         for bb in bait:
 | 
| 
 | 
   250             for lst in data:
 | 
| 
 | 
   251                 if proteins[a] not in errors:
 | 
| 
 | 
   252                     input_file.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' 
 | 
| 
 | 
   253                             + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   254                 a += 1
 | 
| 
 | 
   255                 if a == len(proteins):
 | 
| 
 | 
   256                     l += 1; a = 0
 | 
| 
 | 
   257 
 | 
| 
 | 
   258 
 | 
| 
 | 
   259 def bait_check(bait, MaxQuant_input):
 | 
| 
 | 
   260     # Check that bait names share header titles.
 | 
| 
 | 
   261     bait_in = readtab(bait)
 | 
| 
 | 
   262     header = read_MaxQuant(MaxQuant_input).header
 | 
| 
 | 
   263     for bait in bait_in:
 | 
| 
 | 
   264         if bait[0] not in header:
 | 
| 
 | 
   265             sys.exit("Bait must share header titles with MaxQuant output")
 | 
| 
 | 
   266 
 | 
| 
 | 
   267 if __name__ == '__main__':
 | 
| 
 | 
   268     main(infile, make_bait)
 |