changeset 17:378fc3676c78 draft

Uploaded
author bornea
date Fri, 29 Apr 2016 15:42:22 -0400
parents a5444c834e72
children edde2724910f
files CRAPomeQuery.py
diffstat 1 files changed, 80 insertions(+), 49 deletions(-) [+]
line wrap: on
line diff
--- a/CRAPomeQuery.py	Fri Apr 29 15:42:11 2016 -0400
+++ b/CRAPomeQuery.py	Fri Apr 29 15:42:22 2016 -0400
@@ -5,6 +5,12 @@
 @author: brentkuenzi
 """
 ################################################################################
+## Dependencies ##
+import urllib2
+import sys
+import numpy
+import os
+################################################################################
 # This program will read in a SAINT formatted 'prey.txt' file or a file
 # containing a single column list of uniprot accessions (e.g. "P00533" or 
 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying 
@@ -18,24 +24,19 @@
 # A copy of the license is included in the section entitled "GNU
 # Free Documentation License".
 ################################################################################
-## Dependencies ##
-import urllib2
-import sys
-import numpy
-import os
-################################################################################
 ## REQUIRED INPUT ##
 # 1) crappyData: Prey.txt or single column list of Uniprot accessions
 crappyData = sys.argv[1] # Prey file or File with single column of accessions
 # 2) Species: HUMAN or YEAST
 species = sys.argv[2] # HUMAN or YEAST
-db_path = sys.argv[4]
+fasta_db = sys.argv[3]
+db_path = sys.argv[5]
 ################################################################################
 ## Global Variables ##
 if species == "HUMAN":
-    database = "Human_CRAPome_v1-1.txt"
+    database = str(db_path) + "Human_CRAPome_v1-1.txt"
 if species == "YEAST":
-    database = "Yeast_CRAPome_v1-1.txt"
+    database = str(db_path) + "Yeast_CRAPome_v1-1.txt"
 ################################################################################
 ## CRAPomeQuery ##
 class ReturnValue1(object):
@@ -46,48 +47,75 @@
 def get_info(uniprot_accession_in): #get aa lengths and gene name
     error = open('error proteins.txt', 'a+')
     i=0
-    while i==0:
-        try:
-            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
-            break
-        except urllib2.HTTPError, err:
-            i = i + 1
-            if i == 50:
-                sys.exit("More than 50 errors. Check your file or try again later.")
-            if err.code == 404:
-                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
-                seqlength = 'NA'
+    if fasta_db == "None":
+        while i==0:
+            try:
+                data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
+                break
+            except urllib2.HTTPError, err:
+                i = i + 1
+                if i == 50:
+                    sys.exit("More than 50 errors. Check your file or try again later.")
+                if err.code == 404:
+                    error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
+                    seqlength = 'NA'
+                    genename = 'NA'
+                    return ReturnValue1(seqlength, genename)
+                elif err.code == 302:
+                    sys.exit("Request timed out. Check connection and try again.")
+                else:
+                    sys.exit("Uniprot had some other error")
+        lines = data.readlines()
+        header = lines[0]
+        lst = header.split('|')
+        lst2 = lst[2].split(' ')
+        swissprot = lst2[0]
+        uniprot_acc = lst[1]
+        if lines == []:
+            error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
+            error.close
+            uniprot_acc = 'NA'
+            genename = 'NA'
+            return ReturnValue1(uniprot_acc, genename, swissprot)
+        if lines != []:
+            seqlength = 0
+            header = lines[0]
+            if 'GN=' in header:
+                lst = header.split('GN=')
+                lst2 = lst[1].split(' ')
+                genename = lst2[0]
+                error.close
+                return ReturnValue1(uniprot_acc, genename, swissprot)
+            if 'GN=' not in header:
                 genename = 'NA'
-                return ReturnValue1(seqlength, genename)
-            elif err.code == 302:
-                sys.exit("Request timed out. Check connection and try again.")
-            else:
-                sys.exit("Uniprot had some other error")
-    lines = data.readlines()
-    header = lines[0]
-    lst = header.split('|')
-    lst2 = lst[2].split(' ')
-    swissprot = lst2[0]
-    uniprot_acc = lst[1]
-    if lines == []:
-        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
-        error.close
-        uniprot_acc = 'NA'
-        genename = 'NA'
-        return ReturnValue1(uniprot_acc, genename, swissprot)
-    if lines != []:
-        seqlength = 0
-        header = lines[0]
-        if 'GN=' in header:
-            lst = header.split('GN=')
-            lst2 = lst[1].split(' ')
-            genename = lst2[0]
+                error.close
+                return ReturnValue1(uniprot_acc, genename, swissprot)
+    else:
+        data = open(fasta_db, 'r')
+        data_lines = data.readlines()
+        db_len = len(data_lines)
+        for fasta_line in data_lines:
+            if uniprot_accession_in in fasta_line:
+                sp, uniprot_acc, name_info = fasta_line.split("|")
+                swissprot = name_info.split(' ')[0]
+                if 'GN=' in name_info:
+                    lst = name_info.split('GN=')
+                    lst2 = lst[1].split(' ')
+                    genename = lst2[0]
+                    return ReturnValue1(uniprot_acc, genename, swissprot)
+                if 'GN=' not in name_info:
+                    genename = 'NA'
+                    return ReturnValue1(uniprot_acc, genename, swissprot)
+        if sp = ">sp":
+            error.close()
+        else:
+            error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
             error.close
-            return ReturnValue1(uniprot_acc, genename, swissprot)
-        if 'GN=' not in header:
+            uniprot_acc = 'NA'
             genename = 'NA'
-            error.close
             return ReturnValue1(uniprot_acc, genename, swissprot)
+
+
 def readtab(infile): # read in tab-delim text
     with open(infile,'r') as x:
         output = []
@@ -134,7 +162,7 @@
             temp.append(i[0]) # append gene name
             ave = []
             total = len(i[3:]) # calculate total experiments
-            for j in i[3:]: 
+            for j in i[3:]:
                 if j != '0':
                     ave.append(int(j)) # calculate Ave.SC on only experiments with ID
                     cnt+=1
@@ -148,6 +176,9 @@
         else:
             temp.append(i[2]) # append accession
             temp.append(i[3])
+            temp.append("NA")
+            temp.append("NA")
+            temp.append("NA")
         query.append(temp) # final query results
 
     header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"]
@@ -158,5 +189,5 @@
 if __name__ == '__main__':
     crapome(crappyData)
 
-os.rename("Crappy Data.txt", sys.argv[3])
+os.rename("Crappy Data.txt", sys.argv[4])
 ## END ##
\ No newline at end of file