| 
8
 | 
     1 #######################################################################################
 | 
| 
 | 
     2 # Python-code: SAINT pre-processing from Scaffold "Samples Report" output
 | 
| 
 | 
     3 # Author: Brent Kuenzi
 | 
| 
 | 
     4 #######################################################################################
 | 
| 
 | 
     5 # This program reads in a raw Scaffold "Samples Report" output and a user generated
 | 
| 
 | 
     6 # bait file and autoformats it into prey and interaction files for SAINTexpress 
 | 
| 
 | 
     7 # analysis
 | 
| 
 | 
     8 #######################################################################################
 | 
| 
 | 
     9 import sys
 | 
| 
 | 
    10 import urllib2
 | 
| 
 | 
    11 import os.path
 | 
| 
 | 
    12 #######################################################################################
 | 
| 
 | 
    13 ## REQUIRED INPUT ##
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 # 1) infile: Scaffold "Samples Report" output
 | 
| 
 | 
    16 # 2) baitfile: SAINT formatted bait file generated in Galaxy
 | 
| 
 | 
    17 # 3) prey: Y or N for generating a prey file (requires internet connection)
 | 
| 
 | 
    18 #######################################################################################
 | 
| 
 | 
    19 infile = sys.argv[1] #Scaffold "Samples Report" output
 | 
| 
 | 
    20 prey = sys.argv[2] # Y or N
 | 
| 
 | 
    21 fasta_db = sys.argv[3]
 | 
| 
 | 
    22 tool_path = sys.argv[8]
 | 
| 
 | 
    23 if fasta_db == "None":
 | 
| 
 | 
    24     fasta_db = str(tool_path)  + "/SwissProt_HUMAN_2014_08.fasta"
 | 
| 
 | 
    25 make_bait= sys.argv[5]
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 baits = make_bait.split()
 | 
| 
 | 
    29 i = 0
 | 
| 
 | 
    30 bait_file_tmp = open("bait.txt", "wr")
 | 
| 
 | 
    31 order = [] 
 | 
| 
 | 
    32 bait_cache = []
 | 
| 
 | 
    33 
 | 
| 
 | 
    34 while i < len(baits):
 | 
| 
 | 
    35     if baits[i+2] == "true":
 | 
| 
 | 
    36         T_C = "C"
 | 
| 
 | 
    37     else:
 | 
| 
 | 
    38         T_C = "T"
 | 
| 
 | 
    39     line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
 | 
| 
 | 
    40     q = open(infile,"r")
 | 
| 
 | 
    41     for line2 in q:
 | 
| 
 | 
    42         line2 = line2.strip()
 | 
| 
 | 
    43         temp = line2.split('\t')
 | 
| 
 | 
    44         if "Quantitative Variance" in str(temp):
 | 
| 
 | 
    45             if baits[i] in temp:
 | 
| 
 | 
    46                 number_bait = temp.index(str(baits[i]))
 | 
| 
 | 
    47                 number_bait = number_bait - 9
 | 
| 
 | 
    48                 bait_cache.append((number_bait, str(line1)))
 | 
| 
 | 
    49             else:
 | 
| 
 | 
    50                 print "Error: bad bait " + str(baits[i])
 | 
| 
 | 
    51                 sys.exit()
 | 
| 
 | 
    52         else: 
 | 
| 
 | 
    53             pass                    
 | 
| 
 | 
    54     i = i + 3
 | 
| 
 | 
    55 
 | 
| 
 | 
    56 bait_cache.sort()
 | 
| 
 | 
    57 for line in bait_cache:
 | 
| 
 | 
    58     bait_file_tmp.write(line[1])            
 | 
| 
 | 
    59         
 | 
| 
 | 
    60 bait_file_tmp.close()
 | 
| 
 | 
    61 baitfile = "bait.txt" 
 | 
| 
 | 
    62 
 | 
| 
 | 
    63 class ReturnValue1(object):
 | 
| 
 | 
    64     def __init__(self, sequence, gene):
 | 
| 
 | 
    65      self.seqlength = sequence
 | 
| 
 | 
    66      self.genename = gene
 | 
| 
 | 
    67 class ReturnValue2(object):
 | 
| 
 | 
    68     def __init__(self, getdata, getproteins, getheader):
 | 
| 
 | 
    69         self.data = getdata
 | 
| 
 | 
    70         self.proteins = getproteins
 | 
| 
 | 
    71         self.header = getheader
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 def main(scaffold_input, baitfile): 
 | 
| 
 | 
    74     bait_check(baitfile, scaffold_input)
 | 
| 
 | 
    75     make_inter(scaffold_input)
 | 
| 
 | 
    76     if prey == 'true':
 | 
| 
 | 
    77         make_prey(scaffold_input)
 | 
| 
 | 
    78         no_error_inter(scaffold_input)
 | 
| 
 | 
    79         os.rename('prey.txt', sys.argv[5])
 | 
| 
 | 
    80     elif prey == 'false':
 | 
| 
 | 
    81         if os.path.isfile('error proteins.txt') == True:
 | 
| 
 | 
    82             no_error_inter(scaffold_input)
 | 
| 
 | 
    83         pass
 | 
| 
 | 
    84     elif prey != 'true' or 'false':
 | 
| 
 | 
    85         sys.exit("Invalid Prey Argument: Y or N")
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 def get_info(uniprot_accession_in): # get aa lengths and gene name
 | 
| 
 | 
    88     error = open('error proteins.txt', 'a+')
 | 
| 
 | 
    89 #    while True:
 | 
| 
 | 
    90 #        i = 0
 | 
| 
 | 
    91 #	try:  
 | 
| 
 | 
    92 #            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
 | 
| 
 | 
    93 #            break
 | 
| 
 | 
    94 #        except urllib2.HTTPError, err:
 | 
| 
 | 
    95 #            i = i + 1
 | 
| 
 | 
    96 #            if i == 50:
 | 
| 
 | 
    97 #                sys.exit("More than 50 errors. Check your file or try again later.")
 | 
| 
 | 
    98 #            if err.code == 404:
 | 
| 
 | 
    99 #                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
 | 
| 
 | 
   100 #                seqlength = 'NA'
 | 
| 
 | 
   101 #                genename = 'NA'
 | 
| 
 | 
   102 #                return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   103 #            elif err.code == 302:
 | 
| 
 | 
   104 #                sys.exit("Request timed out. Check connection and try again.")
 | 
| 
 | 
   105 #            else:
 | 
| 
 | 
   106 #                sys.exit("Uniprot had some other error")
 | 
| 
 | 
   107 #    lines = data.readlines()
 | 
| 
 | 
   108 #    if lines == []:
 | 
| 
 | 
   109 #        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
 | 
| 
 | 
   110 #        error.close
 | 
| 
 | 
   111 #        seqlength = 'NA'
 | 
| 
 | 
   112 #        genename = 'NA'
 | 
| 
 | 
   113 #        return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   114 #    if lines != []:
 | 
| 
 | 
   115 #        seqlength = 0
 | 
| 
 | 
   116 #        header = lines[0]
 | 
| 
 | 
   117 #        for line in lines[1:]:
 | 
| 
 | 
   118 #            line = line.replace("\n","") # strip \n or else it gets counted in the length
 | 
| 
 | 
   119 #            seqlength += len(line) 
 | 
| 
 | 
   120 #        if 'GN=' in header:
 | 
| 
 | 
   121 #            lst = header.split('GN=')
 | 
| 
 | 
   122 #            lst2 = lst[1].split(' ')
 | 
| 
 | 
   123 #            genename = lst2[0]
 | 
| 
 | 
   124 #            error.close
 | 
| 
 | 
   125 #            return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   126 #        if 'GN=' not in header:
 | 
| 
 | 
   127 #            genename = 'NA'
 | 
| 
 | 
   128 #            error.close
 | 
| 
 | 
   129 #            return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   130     data = open(fasta_db,'r')
 | 
| 
 | 
   131     lines = data.readlines()
 | 
| 
 | 
   132     db_len = len(lines)
 | 
| 
 | 
   133     seqlength = 0
 | 
| 
 | 
   134     count = 0
 | 
| 
 | 
   135     for i in lines:           
 | 
| 
 | 
   136         if ">sp" in i:
 | 
| 
 | 
   137             if uniprot_accession_in == i.split("|")[1]:
 | 
| 
 | 
   138                 match = count+1
 | 
| 
 | 
   139                 if 'GN=' in i:
 | 
| 
 | 
   140                     lst = i.split('GN=')
 | 
| 
 | 
   141                     lst2 = lst[1].split(' ')
 | 
| 
 | 
   142                     genename = lst2[0]
 | 
| 
 | 
   143                 if 'GN=' not in i:
 | 
| 
 | 
   144                     genename = 'NA'
 | 
| 
 | 
   145                 while ">sp" not in lines[match]:
 | 
| 
 | 
   146                     if match <= db_len:
 | 
| 
 | 
   147                         seqlength = seqlength + len(lines[match].strip())
 | 
| 
 | 
   148                         match = match + 1
 | 
| 
 | 
   149                     else:
 | 
| 
 | 
   150                         break
 | 
| 
 | 
   151                 return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   152         count = count + 1
 | 
| 
 | 
   153         
 | 
| 
 | 
   154 
 | 
| 
 | 
   155     if seqlength == 0:
 | 
| 
 | 
   156         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
 | 
| 
 | 
   157         error.close
 | 
| 
 | 
   158         seqlength = 'NA'
 | 
| 
 | 
   159         genename = 'NA'
 | 
| 
 | 
   160         return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 def readtab(infile):
 | 
| 
 | 
   163     with open(infile,'r') as x: # read in tab-delim text
 | 
| 
 | 
   164         output = []
 | 
| 
 | 
   165         for line in x:
 | 
| 
 | 
   166             line = line.strip()
 | 
| 
 | 
   167             temp = line.split('\t')
 | 
| 
 | 
   168             output.append(temp)
 | 
| 
 | 
   169     return output
 | 
| 
 | 
   170 def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
 | 
| 
 | 
   171     dupes = readtab(scaffold_input)
 | 
| 
 | 
   172     cnt = 0
 | 
| 
 | 
   173     for i in dupes:
 | 
| 
 | 
   174         cnt += 1
 | 
| 
 | 
   175         if i[0] == '#': # finds the start of second header
 | 
| 
 | 
   176             header_start = cnt-1
 | 
| 
 | 
   177     header = dupes[header_start]
 | 
| 
 | 
   178     prot_start = header.index("Accession Number")
 | 
| 
 | 
   179     data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE
 | 
| 
 | 
   180     proteins = []
 | 
| 
 | 
   181     for i in data:
 | 
| 
 | 
   182         i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached
 | 
| 
 | 
   183     for protein in data:
 | 
| 
 | 
   184         proteins.append(protein[prot_start])
 | 
| 
 | 
   185     return ReturnValue2(data, proteins, header)
 | 
| 
 | 
   186 def make_inter(scaffold_input):
 | 
| 
 | 
   187     bait = readtab(baitfile)
 | 
| 
 | 
   188     data = read_scaffold(scaffold_input).data
 | 
| 
 | 
   189     header = read_scaffold(scaffold_input).header
 | 
| 
 | 
   190     proteins = read_scaffold(scaffold_input).proteins
 | 
| 
 | 
   191     bait_index = []
 | 
| 
 | 
   192     for i in bait:
 | 
| 
 | 
   193         bait_index.append(header.index(i[0])) # Find just the baits defined in bait file
 | 
| 
 | 
   194     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   195             a = 0; l=0
 | 
| 
 | 
   196             for bb in bait:
 | 
| 
 | 
   197                 for lst in data:
 | 
| 
 | 
   198                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   199                     a+=1
 | 
| 
 | 
   200                     if a == len(proteins):
 | 
| 
 | 
   201                         a = 0; l+=1
 | 
| 
 | 
   202 def make_prey(scaffold_input):
 | 
| 
 | 
   203     proteins = read_scaffold(scaffold_input).proteins
 | 
| 
 | 
   204     output_file = open("prey.txt",'w')
 | 
| 
 | 
   205     for a in proteins:
 | 
| 
 | 
   206         a = a.replace("\n","") # remove \n for input into function
 | 
| 
 | 
   207         a = a.replace("\r","") # ditto for \r
 | 
| 
 | 
   208         seq = get_info(a).seqlength
 | 
| 
 | 
   209         GN = get_info(a).genename
 | 
| 
 | 
   210         if seq != 'NA':
 | 
| 
 | 
   211             output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
 | 
| 
 | 
   212     output_file.close()
 | 
| 
 | 
   213 def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot
 | 
| 
 | 
   214     err = readtab("error proteins.txt")
 | 
| 
 | 
   215     bait = readtab(baitfile)
 | 
| 
 | 
   216     data = read_scaffold(scaffold_input).data
 | 
| 
 | 
   217     header = read_scaffold(scaffold_input).header
 | 
| 
 | 
   218     bait_index = []
 | 
| 
 | 
   219     for i in bait:
 | 
| 
 | 
   220         bait_index.append(header.index(i[0]))
 | 
| 
 | 
   221     proteins = read_scaffold(scaffold_input).proteins
 | 
| 
 | 
   222     errors = []
 | 
| 
 | 
   223     for e in err:
 | 
| 
 | 
   224         errors.append(e[0])
 | 
| 
 | 
   225     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   226         l = 0; a = 0
 | 
| 
 | 
   227         for bb in bait:
 | 
| 
 | 
   228             for lst in data:
 | 
| 
 | 
   229                 if proteins[a] not in errors:
 | 
| 
 | 
   230                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   231                 a+=1
 | 
| 
 | 
   232                 if a == len(proteins):
 | 
| 
 | 
   233                     l += 1; a = 0
 | 
| 
 | 
   234 def bait_check(bait, scaffold_input): # check that bait names share header titles
 | 
| 
 | 
   235     bait_in = readtab(bait)
 | 
| 
 | 
   236     header = read_scaffold(scaffold_input).header
 | 
| 
 | 
   237     for i in bait_in:
 | 
| 
 | 
   238         if i[0] not in header:
 | 
| 
 | 
   239             sys.exit("Bait must share header titles with Scaffold output")
 | 
| 
 | 
   240 
 | 
| 
 | 
   241 if __name__ == '__main__':
 | 
| 
 | 
   242     main(infile, baitfile)
 | 
| 
 | 
   243 
 | 
| 
 | 
   244 os.rename('inter.txt', sys.argv[4])
 | 
| 
 | 
   245 os.rename("bait.txt", sys.argv[7])
 |