| 
9
 | 
     1 #######################################################################################
 | 
| 
 | 
     2 # Python-code: SAINT pre-processing from maxquant "Samples Report" output
 | 
| 
 | 
     3 # Author: Brent Kuenzi
 | 
| 
 | 
     4 #######################################################################################
 | 
| 
 | 
     5 # This program reads in a raw maxquant "Samples Report" output and a user generated
 | 
| 
 | 
     6 # bait file and autoformats it into prey and interaction files for SAINTexpress 
 | 
| 
 | 
     7 # analysis
 | 
| 
 | 
     8 #######################################################################################
 | 
| 
 | 
     9 import sys
 | 
| 
 | 
    10 import urllib2
 | 
| 
 | 
    11 import os
 | 
| 
 | 
    12 #######################################################################################
 | 
| 
 | 
    13 ## REQUIRED INPUT ##
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 # 1) infile: maxquant "Samples Report" output
 | 
| 
 | 
    16 # 2) baitfile: SAINT formatted bait file generated in Galaxy
 | 
| 
 | 
    17 # 3) prey: Y or N for generating a prey file (requires internet connection)
 | 
| 
 | 
    18 #######################################################################################
 | 
| 
 | 
    19 mq_file = sys.argv[1]
 | 
| 
 | 
    20 cmd = r"Rscript /home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/pre_process_protein_name_set.R " + str(mq_file) 
 | 
| 
 | 
    21 os.system(cmd)
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 infile = "./tukeys_output.txt" #maxquant "Samples Report" output
 | 
| 
 | 
    24 prey = sys.argv[2] # Y or N
 | 
| 
 | 
    25 fasta_db = sys.argv[3]
 | 
| 
 | 
    26 if fasta_db == "None":
 | 
| 
 | 
    27     fasta_db = "/home/bornea/galaxy_moffitt_dev/tools/Moffitt_Tools/bubblebeam/SwissProt_HUMAN_2014_08.fasta"
 | 
| 
 | 
    28 make_bait= sys.argv[6]
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 def bait_create(baits, infile):
 | 
| 
 | 
    31     #Takes the Bait specified by the user and makes them into a Bait file and includes a check to make sure they are using valid baits.
 | 
| 
 | 
    32     baits = make_bait.split()
 | 
| 
 | 
    33     i = 0
 | 
| 
 | 
    34     bait_file_tmp = open("bait.txt", "wr")
 | 
| 
 | 
    35     order = [] 
 | 
| 
 | 
    36     bait_cache = []
 | 
| 
 | 
    37     while i < len(baits):
 | 
| 
 | 
    38         if baits[i+2] == "true":
 | 
| 
 | 
    39             T_C = "C"
 | 
| 
 | 
    40         else:
 | 
| 
 | 
    41             T_C = "T"
 | 
| 
 | 
    42         line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
 | 
| 
 | 
    43         q = open(infile,"r")
 | 
| 
 | 
    44         for line2 in q:
 | 
| 
 | 
    45            line2 = line2.replace("\"", "")
 | 
| 
 | 
    46            line2 = line2.replace(r"Intensity.", "") #R coerces "-" into "." this changes them back and remove Intensity from the Bait names.
 | 
| 
 | 
    47            line2 = line2.replace(r".", r"-")
 | 
| 
 | 
    48            temp = line2.split()
 | 
| 
 | 
    49            if "mapped_protein" in str(temp):
 | 
| 
 | 
    50                 #If the bait is in the original file then write to cache it if not exit.
 | 
| 
 | 
    51                 if baits[i] in temp:
 | 
| 
 | 
    52                     number_bait = temp.index(str(baits[i]))
 | 
| 
 | 
    53                     number_bait = number_bait - 9
 | 
| 
 | 
    54                     bait_cache.append((number_bait, str(line1)))
 | 
| 
 | 
    55                 else:
 | 
| 
 | 
    56                     print "Error: bad bait " + str(baits[i])
 | 
| 
 | 
    57                     sys.exit()
 | 
| 
 | 
    58            else: 
 | 
| 
 | 
    59                 pass                    
 | 
| 
 | 
    60         i = i + 3
 | 
| 
 | 
    61     #Writes cache to file.
 | 
| 
 | 
    62     bait_cache.sort()
 | 
| 
 | 
    63     for line in bait_cache:
 | 
| 
 | 
    64         bait_file_tmp.write(line[1])            
 | 
| 
 | 
    65         
 | 
| 
 | 
    66     bait_file_tmp.close()  
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 
 | 
| 
 | 
    69 baitfile = "bait.txt"
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 class ReturnValue1(object):
 | 
| 
 | 
    72     def __init__(self, sequence, gene):
 | 
| 
 | 
    73      self.seqlength = sequence
 | 
| 
 | 
    74      self.genename = gene
 | 
| 
 | 
    75 class ReturnValue2(object):
 | 
| 
 | 
    76     def __init__(self, getdata, getproteins, getheader):
 | 
| 
 | 
    77         self.data = getdata
 | 
| 
 | 
    78         self.proteins = getproteins
 | 
| 
 | 
    79         self.header = getheader
 | 
| 
 | 
    80 
 | 
| 
 | 
    81 def main(maxquant_input, make_bait):  
 | 
| 
 | 
    82     bait_create(make_bait, infile)
 | 
| 
 | 
    83     baitfile = "bait.txt"
 | 
| 
 | 
    84     #bait_check(baitfile, maxquant_input)
 | 
| 
 | 
    85     make_inter(maxquant_input)
 | 
| 
 | 
    86     if prey == 'true':
 | 
| 
 | 
    87         make_prey(maxquant_input)
 | 
| 
 | 
    88         no_error_inter(maxquant_input)
 | 
| 
 | 
    89         os.rename('prey.txt', sys.argv[5])
 | 
| 
 | 
    90     elif prey == 'false':
 | 
| 
 | 
    91         if os.path.isfile('error proteins.txt') == True:
 | 
| 
 | 
    92             no_error_inter(maxquant_input)
 | 
| 
 | 
    93         pass
 | 
| 
 | 
    94     elif prey != 'true' or 'false':
 | 
| 
 | 
    95         sys.exit("Invalid Prey Argument: Y or N")
 | 
| 
 | 
    96     os.rename('inter.txt', sys.argv[4])
 | 
| 
 | 
    97     os.rename("bait.txt", sys.argv[7])
 | 
| 
 | 
    98 
 | 
| 
 | 
    99 
 | 
| 
 | 
   100 def get_info(uniprot_accession_in): # get aa lengths and gene name
 | 
| 
 | 
   101     error = open('error proteins.txt', 'a+')
 | 
| 
 | 
   102 #    while True:
 | 
| 
 | 
   103 #        i = 0
 | 
| 
 | 
   104 #	try:  
 | 
| 
 | 
   105 #            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
 | 
| 
 | 
   106 #            break
 | 
| 
 | 
   107 #        except urllib2.HTTPError, err:
 | 
| 
 | 
   108 #            i = i + 1
 | 
| 
 | 
   109 #            if i == 50:
 | 
| 
 | 
   110 #                sys.exit("More than 50 errors. Check your file or try again later.")
 | 
| 
 | 
   111 #            if err.code == 404:
 | 
| 
 | 
   112 #                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
 | 
| 
 | 
   113 #                seqlength = 'NA'
 | 
| 
 | 
   114 #                genename = 'NA'
 | 
| 
 | 
   115 #                return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   116 #            elif err.code == 302:
 | 
| 
 | 
   117 #                sys.exit("Request timed out. Check connection and try again.")
 | 
| 
 | 
   118 #            else:
 | 
| 
 | 
   119 #                sys.exit("Uniprot had some other error")
 | 
| 
 | 
   120 #
 | 
| 
 | 
   121 #
 | 
| 
 | 
   122 #    if lines == []:
 | 
| 
 | 
   123 #        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
 | 
| 
 | 
   124 #        error.close
 | 
| 
 | 
   125 # if lines == []:
 | 
| 
 | 
   126 #        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
 | 
| 
 | 
   127 #        error.close
 | 
| 
 | 
   128 #        seqlength = 'NA'
 | 
| 
 | 
   129 #        genename = 'NA'
 | 
| 
 | 
   130 #        return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   131 #    if lines != []:
 | 
| 
 | 
   132 #        seqlength = 0
 | 
| 
 | 
   133 #        header = lines[0]
 | 
| 
 | 
   134 #        for line in lines[1:]:
 | 
| 
 | 
   135 #            line = line.replace("\n","") # strip \n or else it gets counted in the length
 | 
| 
 | 
   136 #            seqlength += len(line) 
 | 
| 
 | 
   137 #        if 'GN=' in header:
 | 
| 
 | 
   138 #            lst = header.split('GN=')
 | 
| 
 | 
   139 #            lst2 = lst[1].split(' ')
 | 
| 
 | 
   140 #            genename = lst2[0]
 | 
| 
 | 
   141 #            error.close
 | 
| 
 | 
   142 #            return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   143 #        if 'GN=' not in header:
 | 
| 
 | 
   144 #            genename = 'NA'
 | 
| 
 | 
   145 #            error.close
 | 
| 
 | 
   146 #            return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   147 
 | 
| 
 | 
   148     data = open(fasta_db,'r')
 | 
| 
 | 
   149     lines = data.readlines()
 | 
| 
 | 
   150     db_len = len(lines)
 | 
| 
 | 
   151     seqlength = 0
 | 
| 
 | 
   152     count = 0
 | 
| 
 | 
   153     for i in lines:           
 | 
| 
 | 
   154         if ">sp" in i:
 | 
| 
 | 
   155             if uniprot_accession_in == i.split("|")[1]:
 | 
| 
 | 
   156                 match = count+1
 | 
| 
 | 
   157                 if 'GN=' in i:
 | 
| 
 | 
   158                     lst = i.split('GN=')
 | 
| 
 | 
   159                     lst2 = lst[1].split(' ')
 | 
| 
 | 
   160                     genename = lst2[0]
 | 
| 
 | 
   161                 if 'GN=' not in i:
 | 
| 
 | 
   162                     genename = 'NA'
 | 
| 
 | 
   163                 while ">sp" not in lines[match]:
 | 
| 
 | 
   164                     if match <= db_len:
 | 
| 
 | 
   165                         seqlength = seqlength + len(lines[match].strip())
 | 
| 
 | 
   166                         match = match + 1
 | 
| 
 | 
   167                     else:
 | 
| 
 | 
   168                         break
 | 
| 
 | 
   169                 return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   170         count = count + 1
 | 
| 
 | 
   171         
 | 
| 
 | 
   172 
 | 
| 
 | 
   173     if seqlength == 0:
 | 
| 
 | 
   174         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
 | 
| 
 | 
   175         error.close
 | 
| 
 | 
   176         seqlength = 'NA'
 | 
| 
 | 
   177         genename = 'NA'
 | 
| 
 | 
   178         return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   179 
 | 
| 
 | 
   180 
 | 
| 
 | 
   181 def readtab(infile):
 | 
| 
 | 
   182     with open(infile,'r') as x: # read in tab-delim text
 | 
| 
 | 
   183         output = []
 | 
| 
 | 
   184         for line in x:
 | 
| 
 | 
   185             line = line.strip()
 | 
| 
 | 
   186             temp = line.split('\t')
 | 
| 
 | 
   187             output.append(temp)
 | 
| 
 | 
   188     return output
 | 
| 
 | 
   189 def read_maxquant(maxquant_input): # Get data, proteins and header from maxquant output
 | 
| 
 | 
   190     dupes = readtab(maxquant_input)
 | 
| 
 | 
   191     header_start = 0 
 | 
| 
 | 
   192     header = dupes[header_start]
 | 
| 
 | 
   193     for i in header:
 | 
| 
 | 
   194         i = i.replace(r"\"", "")
 | 
| 
 | 
   195         i = i.replace(r"Intensity.", r"")
 | 
| 
 | 
   196         i = i.replace(r".", r"-")
 | 
| 
 | 
   197     data = dupes[header_start+1:len(dupes)] #cut off blank line and END OF FILE
 | 
| 
 | 
   198     proteins = []
 | 
| 
 | 
   199     for protein in data:
 | 
| 
 | 
   200         proteins.append(protein[0])
 | 
| 
 | 
   201     return ReturnValue2(data, proteins, header)
 | 
| 
 | 
   202 def make_inter(maxquant_input):
 | 
| 
 | 
   203     bait = readtab(baitfile)
 | 
| 
 | 
   204     data = read_maxquant(maxquant_input).data
 | 
| 
 | 
   205     header = read_maxquant(maxquant_input).header
 | 
| 
 | 
   206     proteins = read_maxquant(maxquant_input).proteins
 | 
| 
 | 
   207     bait_index = []
 | 
| 
 | 
   208     for i in bait:
 | 
| 
 | 
   209         bait_index.append(header.index("mapped_protein") + 1) # Find just the baits defined in bait file
 | 
| 
 | 
   210     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   211             a = 0; l=0
 | 
| 
 | 
   212             for bb in bait:
 | 
| 
 | 
   213                 for lst in data:
 | 
| 
 | 
   214                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   215                     a+=1
 | 
| 
 | 
   216                     if a == len(proteins):
 | 
| 
 | 
   217                         a = 0; l+=1
 | 
| 
 | 
   218 def make_prey(maxquant_input):
 | 
| 
 | 
   219     proteins = read_maxquant(maxquant_input).proteins
 | 
| 
 | 
   220     output_file = open("prey.txt",'w')
 | 
| 
 | 
   221     for a in proteins:
 | 
| 
 | 
   222         a = a.replace("\n","") # remove \n for input into function
 | 
| 
 | 
   223         a = a.replace("\r","") # ditto for \r
 | 
| 
 | 
   224         seq = get_info(a).seqlength
 | 
| 
 | 
   225         GN = get_info(a).genename
 | 
| 
 | 
   226         if seq != 'NA':
 | 
| 
 | 
   227             output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
 | 
| 
 | 
   228     output_file.close()
 | 
| 
 | 
   229 def no_error_inter(maxquant_input): # remake inter file without protein errors from Uniprot
 | 
| 
 | 
   230     err = readtab("error proteins.txt")
 | 
| 
 | 
   231     bait = readtab(baitfile)
 | 
| 
 | 
   232     data = read_maxquant(maxquant_input).data
 | 
| 
 | 
   233     header = read_maxquant(maxquant_input).header
 | 
| 
 | 
   234     header = [i.replace(r"\"", "") for i in header]
 | 
| 
 | 
   235     header = [i.replace(r"Intensity.", r"") for i in header]
 | 
| 
 | 
   236     header = [i.replace(r".", r"-") for i in header]
 | 
| 
 | 
   237     print header
 | 
| 
 | 
   238     bait_index = []
 | 
| 
 | 
   239     for i in bait:
 | 
| 
 | 
   240         bait_index.append(header.index(i[0]))
 | 
| 
 | 
   241     proteins = read_maxquant(maxquant_input).proteins
 | 
| 
 | 
   242     errors = []
 | 
| 
 | 
   243     for e in err:
 | 
| 
 | 
   244         errors.append(e[0])
 | 
| 
 | 
   245     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   246         l = 0; a = 0
 | 
| 
 | 
   247         for bb in bait:
 | 
| 
 | 
   248             for lst in data:
 | 
| 
 | 
   249                 if proteins[a] not in errors:
 | 
| 
 | 
   250                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   251                 a+=1
 | 
| 
 | 
   252                 if a == len(proteins):
 | 
| 
 | 
   253                     l += 1; a = 0
 | 
| 
 | 
   254 def bait_check(bait, maxquant_input): # check that bait names share header titles
 | 
| 
 | 
   255     bait_in = readtab(bait)
 | 
| 
 | 
   256     header = read_maxquant(maxquant_input).header
 | 
| 
 | 
   257     for i in bait_in:
 | 
| 
 | 
   258         if i[0] not in header:
 | 
| 
 | 
   259             sys.exit("Bait must share header titles with maxquant output")
 | 
| 
 | 
   260 
 | 
| 
 | 
   261 if __name__ == '__main__':
 | 
| 
 | 
   262     main(infile, make_bait)
 |