Mercurial > repos > bornea > query_crapome
diff CRAPomeQuery.py @ 0:4d47d78b193a draft
Uploaded
author | bornea |
---|---|
date | Mon, 18 Apr 2016 12:16:53 -0400 |
parents | |
children | d1a26feef9de |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CRAPomeQuery.py Mon Apr 18 12:16:53 2016 -0400 @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Apr 14 16:58:05 2016 + +@author: brentkuenzi +""" +################################################################################ +# This program will read in a SAINT formatted 'prey.txt' file or a file +# containing a single column list of uniprot accessions (e.g. "P00533" or +# "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying +# the prevalence of each protein in the CRAPome. +################################################################################ +# Copyright (C) Brent Kuenzi. +# Permission is granted to copy, distribute and/or modify this document +# under the terms of the GNU Free Documentation License, Version 1.3 +# or any later version published by the Free Software Foundation; +# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. +# A copy of the license is included in the section entitled "GNU +# Free Documentation License". +################################################################################ +## REQUIRED INPUT ## +# 1) crappyData: Prey.txt or single column list of Uniprot accessions +crappyData = sys.argv[1] # Prey file or File with single column of accessions +# 2) Species: HUMAN or YEAST +species = sys.argv[2] # HUMAN or YEAST +db_path = sys.argv[4] +################################################################################ +## Dependencies ## +import urllib2 +import sys +import numpy +import os +################################################################################ +## Global Variables ## +if species == "HUMAN": + database = "Human_CRAPome_v1-1.txt" +if species == "YEAST": + database = "Yeast_CRAPome_v1-1.txt" +################################################################################ +## CRAPomeQuery ## +class ReturnValue1(object): + def __init__(self, uniprot_acc, gene, swissprot): + self.up = uniprot_acc + self.gn = gene + self.sp = swissprot +def get_info(uniprot_accession_in): #get aa lengths and gene name + error = open('error proteins.txt', 'a+') + i=0 + while i==0: + try: + data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") + break + except urllib2.HTTPError, err: + i = i + 1 + if i == 50: + sys.exit("More than 50 errors. Check your file or try again later.") + if err.code == 404: + error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') + seqlength = 'NA' + genename = 'NA' + return ReturnValue1(seqlength, genename) + elif err.code == 302: + sys.exit("Request timed out. Check connection and try again.") + else: + sys.exit("Uniprot had some other error") + lines = data.readlines() + header = lines[0] + lst = header.split('|') + lst2 = lst[2].split(' ') + swissprot = lst2[0] + uniprot_acc = lst[1] + if lines == []: + error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') + error.close + uniprot_acc = 'NA' + genename = 'NA' + return ReturnValue1(uniprot_acc, genename, swissprot) + if lines != []: + seqlength = 0 + header = lines[0] + if 'GN=' in header: + lst = header.split('GN=') + lst2 = lst[1].split(' ') + genename = lst2[0] + error.close + return ReturnValue1(uniprot_acc, genename, swissprot) + if 'GN=' not in header: + genename = 'NA' + error.close + return ReturnValue1(uniprot_acc, genename, swissprot) +def readtab(infile): # read in tab-delim text + with open(infile,'r') as x: + output = [] + for line in x: + line = line.strip() + temp = line.split('\t') + output.append(temp) + return output +def crapome(infile): # Query CRAPome + data = readtab(infile) + crapome = readtab(database) + filt = [] + for i in data: # Filter CRAPome database on our data + flag = 0 # is protein in CRAPome? + ac_flag = 0 # is it _SPECIES or not + unique = 0 # only take first ID in CRAPome + if "_"+species in i[0]: + ac = i[0] + else: + ac = get_info(i[0]).sp # query swissprot if not _SPECIES + ac_flag +=1 + for j in crapome: + if ac == j[2]: + if ac_flag == 0: # if _SPECIES + if unique == 0: + filt.append(j) + flag+=1 + unique+=1 + if ac_flag != 0: # if not _SPECIES + if unique == 0: + unique+=1 + j[2] = i[0] # change to user input + filt.append(j) + flag +=1 + if flag == 0: # if protein is not present in CRAPome database then add it + filt.append(["\t", "\t", i[0], "Invalid identifier / gene not available"]) + total = 0 # Experiment counter + query = [] + for i in filt: # Create CRAPome file as list + temp=[] + if len(i) > 5: + cnt=0 + temp.append(i[2]) # append accession + temp.append(i[0]) # append gene name + ave = [] + total = len(i[3:]) # calculate total experiments + for j in i[3:]: + if j != '0': + ave.append(int(j)) # calculate Ave.SC on only experiments with ID + cnt+=1 + temp.append(str(cnt) + " / "+str(total)) # format ratio + if ave != []: + temp.append(str(round(numpy.mean(ave),1))) # calculate Ave.SC + temp.append(str(max(ave))) # calculate Max.SC + else: + temp.append(0) # add 0 if has not been ID'd in CRAPome + temp.append(0) # add 0 if has not been ID'd in CRAPome + else: + temp.append(i[2]) # append accession + temp.append(i[3]) + query.append(temp) # final query results + + header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"] + with open("Crappy Data.txt","wt") as x: # write file + x.write("\t".join(header) + "\n") + for i in query: + x.write("\t".join(i) + "\n") +if __name__ == '__main__': + crapome(crappyData) + +os.rename("Crappy Data.txt", sys.argv[3]) +## END ## \ No newline at end of file