| 
8
 | 
     1 #######################################################################################
 | 
| 
 | 
     2 # Python-code: SAINT pre-processing from Scaffold "Samples Report" output
 | 
| 
 | 
     3 # Author: Brent Kuenzi
 | 
| 
 | 
     4 #######################################################################################
 | 
| 
 | 
     5 # This program reads in a raw Scaffold "Samples Report" output and a user generated
 | 
| 
 | 
     6 # bait file and autoformats it into prey and interaction files for SAINTexpress 
 | 
| 
 | 
     7 # analysis
 | 
| 
 | 
     8 #######################################################################################
 | 
| 
 | 
     9 import sys
 | 
| 
 | 
    10 import urllib2
 | 
| 
 | 
    11 import os.path
 | 
| 
 | 
    12 #######################################################################################
 | 
| 
 | 
    13 ## REQUIRED INPUT ##
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 # 1) infile: Scaffold "Samples Report" output
 | 
| 
 | 
    16 # 2) baitfile: SAINT formatted bait file generated in Galaxy
 | 
| 
 | 
    17 # 3) prey: Y or N for generating a prey file (requires internet connection)
 | 
| 
 | 
    18 #######################################################################################
 | 
| 
 | 
    19 infile = sys.argv[1] #Scaffold "Samples Report" output
 | 
| 
 | 
    20 prey = sys.argv[2] # Y or N
 | 
| 
 | 
    21 fasta_db = sys.argv[3]
 | 
| 
 | 
    22 tool_path = sys.argv[8]
 | 
| 
 | 
    23 if fasta_db == "None":
 | 
| 
 | 
    24     fasta_db = str(tool_path)  + "/SwissProt_HUMAN_2014_08.fasta"
 | 
| 
13
 | 
    25 make_bait= sys.argv[6]
 | 
| 
8
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 baits = make_bait.split()
 | 
| 
13
 | 
    29 print baits
 | 
| 
8
 | 
    30 i = 0
 | 
| 
 | 
    31 bait_file_tmp = open("bait.txt", "wr")
 | 
| 
 | 
    32 order = [] 
 | 
| 
 | 
    33 bait_cache = []
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 while i < len(baits):
 | 
| 
 | 
    36     if baits[i+2] == "true":
 | 
| 
 | 
    37         T_C = "C"
 | 
| 
 | 
    38     else:
 | 
| 
 | 
    39         T_C = "T"
 | 
| 
 | 
    40     line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
 | 
| 
 | 
    41     q = open(infile,"r")
 | 
| 
 | 
    42     for line2 in q:
 | 
| 
 | 
    43         line2 = line2.strip()
 | 
| 
 | 
    44         temp = line2.split('\t')
 | 
| 
 | 
    45         if "Quantitative Variance" in str(temp):
 | 
| 
 | 
    46             if baits[i] in temp:
 | 
| 
 | 
    47                 number_bait = temp.index(str(baits[i]))
 | 
| 
 | 
    48                 number_bait = number_bait - 9
 | 
| 
 | 
    49                 bait_cache.append((number_bait, str(line1)))
 | 
| 
 | 
    50             else:
 | 
| 
 | 
    51                 print "Error: bad bait " + str(baits[i])
 | 
| 
 | 
    52                 sys.exit()
 | 
| 
 | 
    53         else: 
 | 
| 
 | 
    54             pass                    
 | 
| 
 | 
    55     i = i + 3
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 bait_cache.sort()
 | 
| 
 | 
    58 for line in bait_cache:
 | 
| 
 | 
    59     bait_file_tmp.write(line[1])            
 | 
| 
 | 
    60         
 | 
| 
 | 
    61 bait_file_tmp.close()
 | 
| 
 | 
    62 baitfile = "bait.txt" 
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 class ReturnValue1(object):
 | 
| 
 | 
    65     def __init__(self, sequence, gene):
 | 
| 
 | 
    66      self.seqlength = sequence
 | 
| 
 | 
    67      self.genename = gene
 | 
| 
 | 
    68 class ReturnValue2(object):
 | 
| 
 | 
    69     def __init__(self, getdata, getproteins, getheader):
 | 
| 
 | 
    70         self.data = getdata
 | 
| 
 | 
    71         self.proteins = getproteins
 | 
| 
 | 
    72         self.header = getheader
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 def main(scaffold_input, baitfile): 
 | 
| 
 | 
    75     bait_check(baitfile, scaffold_input)
 | 
| 
 | 
    76     make_inter(scaffold_input)
 | 
| 
 | 
    77     if prey == 'true':
 | 
| 
 | 
    78         make_prey(scaffold_input)
 | 
| 
 | 
    79         no_error_inter(scaffold_input)
 | 
| 
 | 
    80         os.rename('prey.txt', sys.argv[5])
 | 
| 
 | 
    81     elif prey == 'false':
 | 
| 
 | 
    82         if os.path.isfile('error proteins.txt') == True:
 | 
| 
 | 
    83             no_error_inter(scaffold_input)
 | 
| 
 | 
    84         pass
 | 
| 
 | 
    85     elif prey != 'true' or 'false':
 | 
| 
 | 
    86         sys.exit("Invalid Prey Argument: Y or N")
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 def get_info(uniprot_accession_in): # get aa lengths and gene name
 | 
| 
 | 
    89     error = open('error proteins.txt', 'a+')
 | 
| 
 | 
    90 #    while True:
 | 
| 
 | 
    91 #        i = 0
 | 
| 
 | 
    92 #	try:  
 | 
| 
 | 
    93 #            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
 | 
| 
 | 
    94 #            break
 | 
| 
 | 
    95 #        except urllib2.HTTPError, err:
 | 
| 
 | 
    96 #            i = i + 1
 | 
| 
 | 
    97 #            if i == 50:
 | 
| 
 | 
    98 #                sys.exit("More than 50 errors. Check your file or try again later.")
 | 
| 
 | 
    99 #            if err.code == 404:
 | 
| 
 | 
   100 #                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
 | 
| 
 | 
   101 #                seqlength = 'NA'
 | 
| 
 | 
   102 #                genename = 'NA'
 | 
| 
 | 
   103 #                return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   104 #            elif err.code == 302:
 | 
| 
 | 
   105 #                sys.exit("Request timed out. Check connection and try again.")
 | 
| 
 | 
   106 #            else:
 | 
| 
 | 
   107 #                sys.exit("Uniprot had some other error")
 | 
| 
 | 
   108 #    lines = data.readlines()
 | 
| 
 | 
   109 #    if lines == []:
 | 
| 
 | 
   110 #        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
 | 
| 
 | 
   111 #        error.close
 | 
| 
 | 
   112 #        seqlength = 'NA'
 | 
| 
 | 
   113 #        genename = 'NA'
 | 
| 
 | 
   114 #        return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   115 #    if lines != []:
 | 
| 
 | 
   116 #        seqlength = 0
 | 
| 
 | 
   117 #        header = lines[0]
 | 
| 
 | 
   118 #        for line in lines[1:]:
 | 
| 
 | 
   119 #            line = line.replace("\n","") # strip \n or else it gets counted in the length
 | 
| 
 | 
   120 #            seqlength += len(line) 
 | 
| 
 | 
   121 #        if 'GN=' in header:
 | 
| 
 | 
   122 #            lst = header.split('GN=')
 | 
| 
 | 
   123 #            lst2 = lst[1].split(' ')
 | 
| 
 | 
   124 #            genename = lst2[0]
 | 
| 
 | 
   125 #            error.close
 | 
| 
 | 
   126 #            return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   127 #        if 'GN=' not in header:
 | 
| 
 | 
   128 #            genename = 'NA'
 | 
| 
 | 
   129 #            error.close
 | 
| 
 | 
   130 #            return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   131     data = open(fasta_db,'r')
 | 
| 
 | 
   132     lines = data.readlines()
 | 
| 
 | 
   133     db_len = len(lines)
 | 
| 
 | 
   134     seqlength = 0
 | 
| 
 | 
   135     count = 0
 | 
| 
 | 
   136     for i in lines:           
 | 
| 
 | 
   137         if ">sp" in i:
 | 
| 
 | 
   138             if uniprot_accession_in == i.split("|")[1]:
 | 
| 
 | 
   139                 match = count+1
 | 
| 
 | 
   140                 if 'GN=' in i:
 | 
| 
 | 
   141                     lst = i.split('GN=')
 | 
| 
 | 
   142                     lst2 = lst[1].split(' ')
 | 
| 
 | 
   143                     genename = lst2[0]
 | 
| 
 | 
   144                 if 'GN=' not in i:
 | 
| 
 | 
   145                     genename = 'NA'
 | 
| 
 | 
   146                 while ">sp" not in lines[match]:
 | 
| 
 | 
   147                     if match <= db_len:
 | 
| 
 | 
   148                         seqlength = seqlength + len(lines[match].strip())
 | 
| 
 | 
   149                         match = match + 1
 | 
| 
 | 
   150                     else:
 | 
| 
 | 
   151                         break
 | 
| 
 | 
   152                 return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   153         count = count + 1
 | 
| 
 | 
   154         
 | 
| 
 | 
   155 
 | 
| 
 | 
   156     if seqlength == 0:
 | 
| 
 | 
   157         error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
 | 
| 
 | 
   158         error.close
 | 
| 
 | 
   159         seqlength = 'NA'
 | 
| 
 | 
   160         genename = 'NA'
 | 
| 
 | 
   161         return ReturnValue1(seqlength, genename)
 | 
| 
 | 
   162 
 | 
| 
 | 
   163 def readtab(infile):
 | 
| 
 | 
   164     with open(infile,'r') as x: # read in tab-delim text
 | 
| 
 | 
   165         output = []
 | 
| 
 | 
   166         for line in x:
 | 
| 
 | 
   167             line = line.strip()
 | 
| 
 | 
   168             temp = line.split('\t')
 | 
| 
 | 
   169             output.append(temp)
 | 
| 
 | 
   170     return output
 | 
| 
 | 
   171 def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
 | 
| 
 | 
   172     dupes = readtab(scaffold_input)
 | 
| 
 | 
   173     cnt = 0
 | 
| 
 | 
   174     for i in dupes:
 | 
| 
 | 
   175         cnt += 1
 | 
| 
 | 
   176         if i[0] == '#': # finds the start of second header
 | 
| 
 | 
   177             header_start = cnt-1
 | 
| 
 | 
   178     header = dupes[header_start]
 | 
| 
 | 
   179     prot_start = header.index("Accession Number")
 | 
| 
 | 
   180     data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE
 | 
| 
 | 
   181     proteins = []
 | 
| 
 | 
   182     for i in data:
 | 
| 
 | 
   183         i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached
 | 
| 
 | 
   184     for protein in data:
 | 
| 
 | 
   185         proteins.append(protein[prot_start])
 | 
| 
 | 
   186     return ReturnValue2(data, proteins, header)
 | 
| 
 | 
   187 def make_inter(scaffold_input):
 | 
| 
 | 
   188     bait = readtab(baitfile)
 | 
| 
 | 
   189     data = read_scaffold(scaffold_input).data
 | 
| 
 | 
   190     header = read_scaffold(scaffold_input).header
 | 
| 
 | 
   191     proteins = read_scaffold(scaffold_input).proteins
 | 
| 
 | 
   192     bait_index = []
 | 
| 
 | 
   193     for i in bait:
 | 
| 
 | 
   194         bait_index.append(header.index(i[0])) # Find just the baits defined in bait file
 | 
| 
 | 
   195     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   196             a = 0; l=0
 | 
| 
 | 
   197             for bb in bait:
 | 
| 
 | 
   198                 for lst in data:
 | 
| 
 | 
   199                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   200                     a+=1
 | 
| 
 | 
   201                     if a == len(proteins):
 | 
| 
 | 
   202                         a = 0; l+=1
 | 
| 
 | 
   203 def make_prey(scaffold_input):
 | 
| 
 | 
   204     proteins = read_scaffold(scaffold_input).proteins
 | 
| 
 | 
   205     output_file = open("prey.txt",'w')
 | 
| 
 | 
   206     for a in proteins:
 | 
| 
 | 
   207         a = a.replace("\n","") # remove \n for input into function
 | 
| 
 | 
   208         a = a.replace("\r","") # ditto for \r
 | 
| 
 | 
   209         seq = get_info(a).seqlength
 | 
| 
 | 
   210         GN = get_info(a).genename
 | 
| 
 | 
   211         if seq != 'NA':
 | 
| 
 | 
   212             output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
 | 
| 
 | 
   213     output_file.close()
 | 
| 
 | 
   214 def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot
 | 
| 
 | 
   215     err = readtab("error proteins.txt")
 | 
| 
 | 
   216     bait = readtab(baitfile)
 | 
| 
 | 
   217     data = read_scaffold(scaffold_input).data
 | 
| 
 | 
   218     header = read_scaffold(scaffold_input).header
 | 
| 
 | 
   219     bait_index = []
 | 
| 
 | 
   220     for i in bait:
 | 
| 
 | 
   221         bait_index.append(header.index(i[0]))
 | 
| 
 | 
   222     proteins = read_scaffold(scaffold_input).proteins
 | 
| 
 | 
   223     errors = []
 | 
| 
 | 
   224     for e in err:
 | 
| 
 | 
   225         errors.append(e[0])
 | 
| 
 | 
   226     with open('inter.txt', 'w') as y:
 | 
| 
 | 
   227         l = 0; a = 0
 | 
| 
 | 
   228         for bb in bait:
 | 
| 
 | 
   229             for lst in data:
 | 
| 
 | 
   230                 if proteins[a] not in errors:
 | 
| 
 | 
   231                     y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
 | 
| 
 | 
   232                 a+=1
 | 
| 
 | 
   233                 if a == len(proteins):
 | 
| 
 | 
   234                     l += 1; a = 0
 | 
| 
 | 
   235 def bait_check(bait, scaffold_input): # check that bait names share header titles
 | 
| 
 | 
   236     bait_in = readtab(bait)
 | 
| 
 | 
   237     header = read_scaffold(scaffold_input).header
 | 
| 
 | 
   238     for i in bait_in:
 | 
| 
 | 
   239         if i[0] not in header:
 | 
| 
 | 
   240             sys.exit("Bait must share header titles with Scaffold output")
 | 
| 
 | 
   241 
 | 
| 
 | 
   242 if __name__ == '__main__':
 | 
| 
 | 
   243     main(infile, baitfile)
 | 
| 
 | 
   244 
 | 
| 
 | 
   245 os.rename('inter.txt', sys.argv[4])
 | 
| 
 | 
   246 os.rename("bait.txt", sys.argv[7])
 |