view CRAPomeQuery.py @ 19:f1459638e1bc draft default tip

Uploaded
author bornea
date Fri, 29 Apr 2016 15:49:34 -0400
parents 378fc3676c78
children
line wrap: on
line source

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 14 16:58:05 2016

@author: brentkuenzi
"""
################################################################################
## Dependencies ##
import urllib2
import sys
import numpy
import os
################################################################################
# This program will read in a SAINT formatted 'prey.txt' file or a file
# containing a single column list of uniprot accessions (e.g. "P00533" or 
# "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying 
# the prevalence of each protein in the CRAPome.
################################################################################
# Copyright (C)  Brent Kuenzi.
# Permission is granted to copy, distribute and/or modify this document
# under the terms of the GNU Free Documentation License, Version 1.3
# or any later version published by the Free Software Foundation;
# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
# A copy of the license is included in the section entitled "GNU
# Free Documentation License".
################################################################################
## REQUIRED INPUT ##
# 1) crappyData: Prey.txt or single column list of Uniprot accessions
crappyData = sys.argv[1] # Prey file or File with single column of accessions
# 2) Species: HUMAN or YEAST
species = sys.argv[2] # HUMAN or YEAST
fasta_db = sys.argv[3]
db_path = sys.argv[5]
################################################################################
## Global Variables ##
if species == "HUMAN":
    database = str(db_path) + "Human_CRAPome_v1-1.txt"
if species == "YEAST":
    database = str(db_path) + "Yeast_CRAPome_v1-1.txt"
################################################################################
## CRAPomeQuery ##
class ReturnValue1(object):
    def __init__(self, uniprot_acc, gene, swissprot):
        self.up = uniprot_acc
        self.gn = gene
        self.sp = swissprot
def get_info(uniprot_accession_in): #get aa lengths and gene name
    error = open('error proteins.txt', 'a+')
    i=0
    if fasta_db == "None":
        while i==0:
            try:
                data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
                break
            except urllib2.HTTPError, err:
                i = i + 1
                if i == 50:
                    sys.exit("More than 50 errors. Check your file or try again later.")
                if err.code == 404:
                    error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
                    seqlength = 'NA'
                    genename = 'NA'
                    return ReturnValue1(seqlength, genename)
                elif err.code == 302:
                    sys.exit("Request timed out. Check connection and try again.")
                else:
                    sys.exit("Uniprot had some other error")
        lines = data.readlines()
        header = lines[0]
        lst = header.split('|')
        lst2 = lst[2].split(' ')
        swissprot = lst2[0]
        uniprot_acc = lst[1]
        if lines == []:
            error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
            error.close
            uniprot_acc = 'NA'
            genename = 'NA'
            return ReturnValue1(uniprot_acc, genename, swissprot)
        if lines != []:
            seqlength = 0
            header = lines[0]
            if 'GN=' in header:
                lst = header.split('GN=')
                lst2 = lst[1].split(' ')
                genename = lst2[0]
                error.close
                return ReturnValue1(uniprot_acc, genename, swissprot)
            if 'GN=' not in header:
                genename = 'NA'
                error.close
                return ReturnValue1(uniprot_acc, genename, swissprot)
    else:
        data = open(fasta_db, 'r')
        data_lines = data.readlines()
        db_len = len(data_lines)
        for fasta_line in data_lines:
            if uniprot_accession_in in fasta_line:
                sp, uniprot_acc, name_info = fasta_line.split("|")
                swissprot = name_info.split(' ')[0]
                if 'GN=' in name_info:
                    lst = name_info.split('GN=')
                    lst2 = lst[1].split(' ')
                    genename = lst2[0]
                    return ReturnValue1(uniprot_acc, genename, swissprot)
                if 'GN=' not in name_info:
                    genename = 'NA'
                    return ReturnValue1(uniprot_acc, genename, swissprot)
        if sp == ">sp":
            error.close()
        else:
            error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
            error.close
            uniprot_acc = 'NA'
            genename = 'NA'
            return ReturnValue1(uniprot_acc, genename, swissprot)


def readtab(infile): # read in tab-delim text
    with open(infile,'r') as x:
        output = []
        for line in x:
            line = line.strip()
            temp = line.split('\t')
            output.append(temp)
    return output
def crapome(infile): # Query CRAPome
    data = readtab(infile)
    crapome = readtab(database)
    filt = []
    for i in data: # Filter CRAPome database on our data
        flag = 0 # is protein in CRAPome?
        ac_flag = 0 # is it _SPECIES or not
        unique = 0 # only take first ID in CRAPome
        if "_"+species in i[0]:
            ac = i[0] 
        else:
            ac = get_info(i[0]).sp # query swissprot if not _SPECIES
            ac_flag +=1
        for j in crapome:
            if ac == j[2]:
                if ac_flag == 0: # if _SPECIES
                    if unique == 0:
                        filt.append(j)
                        flag+=1
                        unique+=1
                if ac_flag != 0: # if not _SPECIES
                    if unique == 0:
                        unique+=1
                        j[2] = i[0] # change to user input
                        filt.append(j)
                        flag +=1
        if flag == 0: # if protein is not present in CRAPome database then add it
            filt.append(["\t", "\t", i[0], "Invalid identifier / gene not available"])
    total = 0 # Experiment counter
    query = []
    for i in filt: # Create CRAPome file as list
        temp=[]
        if len(i) > 5:
            cnt=0
            temp.append(i[2]) # append accession
            temp.append(i[0]) # append gene name
            ave = []
            total = len(i[3:]) # calculate total experiments
            for j in i[3:]:
                if j != '0':
                    ave.append(int(j)) # calculate Ave.SC on only experiments with ID
                    cnt+=1
            temp.append(str(cnt) + " / "+str(total)) # format ratio
            if ave != []:
                temp.append(str(round(numpy.mean(ave),1))) # calculate Ave.SC
                temp.append(str(max(ave))) # calculate Max.SC
            else:
                temp.append(0) # add 0 if has not been ID'd in CRAPome
                temp.append(0) # add 0 if has not been ID'd in CRAPome
        else:
            temp.append(i[2]) # append accession
            temp.append(i[3])
            temp.append("NA")
            temp.append("NA")
            temp.append("NA")
        query.append(temp) # final query results

    header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"]
    with open("Crappy Data.txt","wt") as x: # write file
        x.write("\t".join(header) + "\n")
        for i in query:
            x.write("\t".join(i) + "\n")
if __name__ == '__main__':
    crapome(crappyData)

os.rename("Crappy Data.txt", sys.argv[4])
## END ##