annotate domain_search.py @ 0:e4b3fc88efe0 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 13:50:11 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
1
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
2 class DomainSearch:
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
3
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def __init__(self):
    '''
    Load the phage protein annotations and discard phages without a host.

    Reads ``files/phagesProteins.json`` into ``self.phagesProteins`` (per the
    original note: for each phage ID, a dictionary keyed by protein ID holding
    the protein function and sequence as provided by NCBI). Every phage whose
    ``Host_ID`` cell in ``files/NCBI_Phage_Bacteria_Data.csv`` is the literal
    string ``'[]'`` is then removed, and ``self._filter_phage()`` is called on
    what is left.

    Both files are opened relative to the current working directory.
    '''
    import json
    import pandas as pd

    with open('files/phagesProteins.json', encoding='utf-8') as F:
        self.phagesProteins = json.load(F)
    # TODO (original author): load 'files/bactProteins.json' into self.bacProt
    # in roughly the same way as the phage data, including psort results.
    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    for phage in data.index:
        if data.loc[phage, 'Host_ID'] == '[]':
            # The CSV may list phages that are absent from the JSON file;
            # pop() with a default ignores those without hiding other errors.
            self.phagesProteins.pop(phage, None)
    self._filter_phage()
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
22
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def _create_fasta(self, dic, name):
    '''
    Write every protein sequence of ``dic`` to the FASTA file ``files/<name>``.

    Each record is written as a ``>{organism}-{protein}`` header line followed
    by the sequence line.

    :param dic: mapping organism ID -> protein ID -> record; ``record[1]`` is
        written as the sequence
    :param name: file name created (or overwritten) inside ``files/``
    :return: None
    '''
    # The encoding is explicit so the output does not depend on the platform
    # default (the JSON input of this class is read as UTF-8 as well).
    with open('files/' + name, 'w', encoding='utf-8') as F:
        F.writelines(
            '>' + org + '-' + prot + '\n' + record[1] + '\n'
            for org, proteins in dic.items()
            for prot, record in proteins.items()
        )
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
32
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def _filter_phage(self):
    '''
    Split ``self.phagesProteins`` into annotated and unannotated proteins.

    Rebuilds ``self.known_function`` and ``self.unknown_function``; both map
    phage ID -> protein ID -> the same record object stored in
    ``self.phagesProteins``. A protein counts as "known" when its function
    label (``record[0]``):

    * is longer than 3 characters and contains none of the placeholder terms
      ('hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical'),
    * is not a single word containing 'gp', and
    * is not a single word shorter than 5 characters.

    Labels are stripped and split on any whitespace before these checks, so a
    padded label such as ``'gp12 '`` is judged like ``'gp12'``.

    :return: None
    '''
    placeholder_terms = ('hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical')

    def has_known_function(label):
        text = label.strip()
        lowered = text.lower()
        words = text.split()
        if len(text) <= 3 or any(term in lowered for term in placeholder_terms):
            return False
        if len(words) < 2 and 'gp' in lowered:
            # A lone gene-product tag such as "gp12".
            return False
        if len(words) == 1 and len(text) < 5:
            return False
        return True

    self.known_function = {}
    self.unknown_function = {}
    for phage, proteins in self.phagesProteins.items():
        known = self.known_function[phage] = {}
        unknown = self.unknown_function[phage] = {}
        for prot, record in proteins.items():
            bucket = known if has_known_function(record[0]) else unknown
            bucket[prot] = record
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
45
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def scanInterPro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/'):
    '''
    Write all unannotated proteins to a FASTA file and scan it with InterProScan.

    The FASTA file is produced by ``self._create_fasta`` from
    ``self.unknown_function`` under the name ``unknown_phages.fasta``.
    ``interproscan.sh`` is then run with ``-f tsv`` and the output base
    ``<out_path>/interpro/domains_output``.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled correctly. The exit status of
    InterProScan is not checked.

    :param InterPro_path: directory containing ``interproscan.sh``, with a
        trailing separator
    :param out_path: directory, with a trailing separator, that holds the
        FASTA input and receives the ``interpro/domains_output`` result
    :return: None
    :raises OSError: if ``interproscan.sh`` cannot be started
    '''
    import subprocess

    self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
    # NOTE(review): the FASTA name is handed to _create_fasta without out_path,
    # while InterProScan reads it from out_path - confirm both resolve to the
    # same directory in the intended setup.
    subprocess.run(
        [
            InterPro_path + 'interproscan.sh',
            '-b', out_path + '/interpro/domains_output',
            '-i', out_path + 'unknown_phages.fasta',
            '-f', 'tsv',
        ],
        check=False,
    )
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
56
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def iter_interpro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/interpro/'):
    '''
    Scan the unannotated proteins with InterProScan in batches of 100.

    Proteins of ``self.unknown_function`` are numbered in iteration order.
    Each batch is written to ``files/interpro/temp_100.fasta`` (header
    ``>{protein ID}``) and scanned into
    ``<out_path>domains_output<N>`` (``-f tsv``), where ``N`` is the running
    number of the last protein in the batch.

    A batch whose ``files/interpro/domains_output<N>.tsv`` already exists is
    skipped entirely, so an interrupted run can be resumed. Previously the
    proteins of such a batch stayed in the temporary FASTA file and were
    scanned again together with the next batch, and the final partial batch
    was always re-scanned.

    The exit status of InterProScan is not checked.

    :param InterPro_path: directory containing ``interproscan.sh``, with a
        trailing separator
    :param out_path: directory, with a trailing separator, passed to
        InterProScan for the temporary FASTA input and the TSV outputs
    :return: None
    :raises OSError: if ``interproscan.sh`` cannot be started
    '''
    import subprocess
    from pathlib import Path

    batch_size = 100

    def scan_batch(batch, last_number):
        # NOTE(review): existence is checked and the FASTA is written relative
        # to the working directory, while InterProScan is pointed at out_path;
        # confirm both name the same directory in the intended setup.
        if Path("files/interpro/domains_output" + str(last_number) + ".tsv").is_file():
            return
        with open('files/interpro/temp_100.fasta', 'w', encoding='utf-8') as F:
            F.writelines('>' + prot + '\n' + sequence + '\n' for prot, sequence in batch)
        subprocess.run(
            [
                InterPro_path + 'interproscan.sh',
                '-b', out_path + 'domains_output' + str(last_number),
                '-i', out_path + 'temp_100.fasta',
                '-f', 'tsv',
            ],
            check=False,
        )

    count = 0
    batch = []
    for phage in self.unknown_function:
        for prot, record in self.unknown_function[phage].items():
            count += 1
            batch.append((prot, record[1]))
            if count % batch_size == 0:
                scan_batch(batch, count)
                batch = []
    if batch:
        # Trailing partial batch (fewer than 100 proteins).
        scan_batch(batch, count)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
76
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def processInterPro(self):
    '''
    Use InterProScan results to propose functions for the phage proteins.

    If ``files/interpro/domains_output.tsv`` does not exist it is first built
    by concatenating the files found in ``files/interpro/`` (everything whose
    name does not contain ``temp_100``). The table is then read with 13
    unnamed columns, the first being the protein ID; rows whose column 3 is
    ``Coils`` or ``MobiDBLite`` are dropped.

    The columns are assumed to follow the standard InterProScan TSV layout
    (5 = signature description, 8 = score, 12 = InterPro description) - TODO
    confirm against the InterProScan version in use.

    For every protein of ``self.phagesProteins`` present in the table:

    1. the first usable column-12 description with a score below 1.0 is taken;
    2. if that is rejected by the filter, the first usable column-5
       description is tried instead.

    A description is rejected when it is ``'-'``, has 3 characters or fewer,
    contains 'unknown', 'ucp', 'uncharacterized' or 'consensus', or matches
    the ``Gp<digits>`` pattern. Each accepted description is shown once via
    ``input('Add function: ...')``; when the answer contains ``y`` it replaces
    the protein's function (``record[0]``) in ``self.phagesProteins``.

    Rows whose score cannot be parsed as a number are treated as not passing
    the score threshold.

    :return: None (``self.phagesProteins`` is updated in place)
    '''
    import os
    import re
    import pandas as pd

    interpro_dir = 'files/interpro/'
    merged_name = 'domains_output.tsv'
    merged_path = interpro_dir + merged_name

    if not os.path.isfile(merged_path):
        # List the inputs before the merged file is created so that it can
        # never be read back into itself; sorting keeps the merge reproducible.
        parts = sorted(
            entry for entry in os.listdir(interpro_dir)
            if 'temp_100' not in entry
            and entry != merged_name
            and os.path.isfile(interpro_dir + entry)
        )
        with open(merged_path, 'w') as F:
            for entry in parts:
                with open(interpro_dir + entry, 'r') as f:
                    F.write(f.read())

    domains = pd.read_csv(merged_path, sep='\t', index_col=0, header=None, names=list(range(13)))
    domains = domains.fillna('-')
    domains = domains[~domains.loc[:, 3].isin(['Coils', 'MobiDBLite'])]

    # If there are hits for this pattern, the description is discarded.
    gp_pattern = re.compile(r'(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])')
    rejected_terms = ('unknown', 'ucp', 'uncharacterized', 'consensus')

    def is_informative(description):
        return (
            description != '-'
            and len(description) > 3
            and not any(term in description.lower() for term in rejected_terms)
            and not gp_pattern.search(description)
        )

    def score_below_one(score):
        try:
            return float(score) < 1.0
        except (TypeError, ValueError):
            return False

    def interpro_description(hits):
        if len(hits) == 1:
            # A single hit is taken as-is, subject only to the score.
            row = hits.iloc[0]
            return row.loc[12] if score_below_one(row.loc[8]) else '-'
        # NOTE(review): with several hits, any description containing a
        # hyphen is skipped, not only the missing-value marker '-'. Kept as
        # in the original; confirm whether `!= '-'` was intended.
        for description, score in zip(hits[12], hits[8]):
            if '-' not in description.lower() and score_below_one(score):
                return description
        return '-'

    def signature_description(hits):
        if len(hits) == 1:
            return hits.iloc[0].loc[5]
        for description in hits[5]:
            if '-' not in description.lower():
                return description
        return '-'

    add_domains = {}  # description -> lower-cased answer already given
    for spec in self.phagesProteins:
        for prot in self.phagesProteins[spec]:
            if prot not in domains.index:
                continue
            # A list indexer always yields a DataFrame, for one hit or many.
            hits = domains.loc[[prot]]
            candidate = interpro_description(hits)
            if not is_informative(candidate):
                candidate = signature_description(hits)
                if not is_informative(candidate):
                    continue
            if candidate not in add_domains:
                add_domains[candidate] = input('Add function: ' + candidate).lower()
            if 'y' in add_domains[candidate]:
                self.phagesProteins[spec][prot][0] = candidate
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
132
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def find_domains_interpro(self, dic, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/WholeProcess/files/'):
    '''
    Annotate the proteins of a single phage with InterProScan descriptions.

    The sequences of ``dic`` are written to ``files/SinglePhageProteins.fasta``
    with the protein ID as header, InterProScan is run into
    ``<out_path>single_phage_domains`` (``-f tsv``) and the result is read
    back from ``files/single_phage_domains.tsv``.

    The header used to be the function text (``record[0]``), so the result
    rows could never be matched to the protein IDs looked up afterwards.

    For each protein found in the result, the first usable column-12
    description is taken and, if the filter rejects it, the first usable
    column-5 description instead. The columns are assumed to follow the
    standard InterProScan TSV layout (5 = signature description, 12 =
    InterPro description) - TODO confirm. With several hits, values
    containing 'coil' or '-' are skipped. The column-5 fallback used to test
    column 12 for '-', which defeated it whenever the InterPro description
    was missing.

    A description is rejected when it is ``'-'``, has 3 characters or fewer,
    contains 'unknown' or 'UCP' (case-sensitive), or matches the
    ``Gp<digits>`` pattern. Accepted descriptions overwrite ``dic[prot][0]``.

    The exit status of InterProScan is not checked.

    :param dic: mapping protein ID -> record, with ``record[0]`` the function
        and ``record[1]`` the sequence; modified in place
    :param InterPro_path: directory containing ``interproscan.sh``, with a
        trailing separator
    :param out_path: directory, with a trailing separator, passed to
        InterProScan for the FASTA input and the TSV output
    :return: the same ``dic`` object
    :raises OSError: if ``interproscan.sh`` cannot be started
    '''
    import re
    import subprocess
    import pandas as pd

    with open('files/SinglePhageProteins.fasta', 'w') as F:
        F.writelines('>' + prot + '\n' + record[1] + '\n' for prot, record in dic.items())
    # NOTE(review): files are written and read relative to the working
    # directory while InterProScan is pointed at out_path; confirm that both
    # resolve to the same directory in the intended setup.
    subprocess.run(
        [
            InterPro_path + 'interproscan.sh',
            '-b', out_path + 'single_phage_domains',
            '-i', out_path + 'SinglePhageProteins.fasta',
            '-f', 'tsv',
        ],
        check=False,
    )

    domains = pd.read_csv('files/single_phage_domains.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
    domains = domains.fillna('-')

    # If there are hits for this pattern, the description is discarded.
    gp_pattern = re.compile(r'(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])')

    def is_informative(description):
        return (
            description != '-'
            and 'unknown' not in description
            and 'UCP' not in description
            and len(description) > 3
            and not gp_pattern.search(description)
        )

    def first_description(hits, column):
        if len(hits) == 1:
            # A single hit is taken as-is.
            return hits.iloc[0].loc[column]
        for description in hits[column]:
            lowered = description.lower()
            if 'coil' not in lowered and '-' not in lowered:
                return description
        return '-'

    for prot in dic:
        if prot not in domains.index:
            continue
        # A list indexer always yields a DataFrame, for one hit or many.
        hits = domains.loc[[prot]]
        candidate = first_description(hits, 12)
        if not is_informative(candidate):
            candidate = first_description(hits, 5)
        if is_informative(candidate):
            dic[prot][0] = candidate
    return dic
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
171
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def fillDomainsBLAST(self):
    '''
    Look up functions for unannotated proteins with a remote NCBI BLAST search.

    For every phage of ``self.phagesProteins`` not yet listed in the
    checkpoint file ``files/phage_list_blast``, each protein whose function
    contains 'hypothetical', 'uncharacterized' or 'unknown' is submitted to
    ``blastp`` against ``nr``. The title of the first alignment that is
    neither 'hypothetical' nor 'uncharacterized' (cut at the first `` [``,
    when present) replaces the protein's function.

    After each phage the checkpoint is rewritten and ``self.saveDomains()``
    is called, so an interrupted run can be resumed. ``saveDomains`` is not
    part of the code visible here; it is assumed to persist
    ``self.phagesProteins``.

    :return: None (``self.phagesProteins`` is updated in place)
    '''
    print('Finding functions/domains with BLAST')
    from Bio.Blast import NCBIWWW
    from Bio.Blast import NCBIXML
    import pickle
    from pathlib import Path

    my_file = Path("files/phage_list_blast")
    if my_file.is_file():
        # NOTE: pickle must only be used on checkpoint files written by this
        # method; never load a file from an untrusted source.
        with open('files/phage_list_blast', 'rb') as f:
            list_done = pickle.load(f)
    else:
        list_done = []

    unannotated_terms = ('hypothetical', 'uncharacterized', 'unknown')
    for spec in self.phagesProteins:
        if spec in list_done:
            continue
        for prot, record in self.phagesProteins[spec].items():
            if not any(term in record[0].lower() for term in unannotated_terms):
                continue
            # NOTE(review): the Entrez query string is kept exactly as
            # written; confirm it is valid Entrez syntax and the intended
            # organism restriction.
            result_handle = NCBIWWW.qblast('blastp', 'nr', record[1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
            try:
                blastout = NCBIXML.read(result_handle)
            finally:
                result_handle.close()
            for ali in blastout.alignments:
                title = ali.hit_def.lower()
                if 'hypothetical' not in title and 'uncharacterized' not in title:
                    # split() keeps the whole title when there is no " [";
                    # slicing with find() used to drop the last character.
                    function = ali.hit_def.split(' [', 1)[0]
                    print(function)
                    record[0] = function
                    break
        list_done.append(spec)
        with open('files/phage_list_blast', 'wb') as f:
            pickle.dump(list_done, f)
        self.saveDomains()
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
204
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def find_domains_blast(self, dic):
    '''
    Look up functions for the unannotated proteins of one phage with BLAST.

    Each protein of ``dic`` whose function contains 'hypothetical',
    'uncharacterized' or 'unknown' is submitted to a remote ``blastp`` search
    against ``nr``. The title of the first alignment that is neither
    'hypothetical' nor 'uncharacterized' (cut at the first `` [``, when
    present) is stored in ``dic[prot][0]``.

    The result used to be written to ``self.phagesProteins[spec]`` with an
    undefined ``spec``, which raised NameError on the first usable hit. The
    query is now the sequence (``record[1]``), as in ``fillDomainsBLAST``,
    instead of the dictionary key.

    :param dic: mapping protein ID -> record, with ``record[0]`` the function
        and ``record[1]`` the sequence; modified in place
    :return: the same ``dic`` object
    '''
    from Bio.Blast import NCBIWWW
    from Bio.Blast import NCBIXML

    unannotated_terms = ('hypothetical', 'uncharacterized', 'unknown')
    for prot, record in dic.items():
        if not any(term in record[0].lower() for term in unannotated_terms):
            continue
        # NOTE(review): the Entrez query string is kept exactly as written;
        # confirm it is valid Entrez syntax and the intended restriction.
        result_handle = NCBIWWW.qblast('blastp', 'nr', record[1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
        try:
            blastout = NCBIXML.read(result_handle)
        finally:
            result_handle.close()
        for ali in blastout.alignments:
            title = ali.hit_def.lower()
            if 'hypothetical' not in title and 'uncharacterized' not in title:
                # split() keeps the whole title when there is no " [";
                # slicing with find() used to drop the last character.
                function = ali.hit_def.split(' [', 1)[0]
                print(function)
                record[0] = function
                break
    return dic
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
219
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def fillDomainsUniProt(self):
    '''
    Look up functions for unannotated proteins through the UniProt website.

    For every phage of ``self.phagesProteins`` not yet listed in the
    checkpoint file ``files/phage_list_uniprot``, each protein whose function
    contains 'hypothetical', 'uncharacterized' or 'unknown' is processed:

    1. the protein ID is searched on UniProt and the best-scoring accession
       (first line of the answer) is taken;
    2. the entries of that accession's 100 %-identity cluster are fetched in
       text format and their ``DE ... Full=<name>`` lines are scanned;
    3. the first name longer than 5 characters that contains none of the
       rejected terms replaces the protein's function.

    Proteins for which a request fails or returns nothing are left
    unchanged. DE lines without ``Full=`` are ignored; they used to be sliced
    at a fixed offset and could be accepted as a function.

    After each phage the checkpoint is rewritten and ``self.saveDomains()``
    is called. ``saveDomains`` is not part of the code visible here; it is
    assumed to persist ``self.phagesProteins``.

    NOTE(review): the URLs are kept as written; confirm that this UniProt
    endpoint is still served.

    :return: None (``self.phagesProteins`` is updated in place)
    '''
    print('Finding functions/domains with UniProt')
    import requests
    import pickle
    from pathlib import Path

    unannotated_terms = ('hypothetical', 'uncharacterized', 'unknown')
    rejected_terms = ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']

    def full_name(entry):
        # e.g. "DE   RecName: Full=Endolysin {ECO:0000255|...};" -> "Endolysin"
        if entry[:2] != 'DE' or 'Full=' not in entry:
            return None
        name = entry.split('Full=', 1)[1].split(' {ECO', 1)[0]
        return name.strip().rstrip(';').strip()

    def lookup_function(accID):
        fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
        result = requests.get(fullURL)
        accessions = result.text.split() if result.ok else []
        if not accessions:
            return None
        # Only the best-scoring accession is used; pasting the whole
        # multi-line list into the next URL would corrupt the query.
        uniprot_acc = accessions[0]
        fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
        result = requests.get(fullURL)
        if not result.ok:
            return None
        for entry in result.text.split('\n'):
            domain = full_name(entry)
            if domain is None:
                continue
            if not any(z in domain.lower() for z in rejected_terms) and len(domain) > 5:
                return domain
        return None

    my_file = Path("files/phage_list_uniprot")
    if my_file.is_file():
        # NOTE: pickle must only be used on checkpoint files written by this
        # method; never load a file from an untrusted source.
        with open('files/phage_list_uniprot', 'rb') as f:
            list_done = pickle.load(f)
    else:
        list_done = []

    for phage in self.phagesProteins:
        if phage in list_done:
            continue
        for accID, record in self.phagesProteins[phage].items():
            if not any(term in record[0].lower() for term in unannotated_terms):
                continue
            domain = lookup_function(accID)
            if domain is not None:
                print(domain)
                record[0] = domain
        list_done.append(phage)
        with open('files/phage_list_uniprot', 'wb') as f:
            pickle.dump(list_done, f)
        self.saveDomains()
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
259
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def find_domains_uniprot(self, dic):
    '''
    Look up functions for the unannotated proteins of one phage on UniProt.

    Each protein of ``dic`` whose function contains 'hypothetical',
    'uncharacterized' or 'unknown' is processed:

    1. the protein ID is searched on UniProt and the best-scoring accession
       (first line of the answer) is taken;
    2. the entries of that accession's 100 %-identity cluster are fetched in
       text format and their ``DE ... Full=<name>`` lines are scanned;
    3. the first name longer than 5 characters that contains none of the
       rejected terms is stored in ``dic[accID][0]``.

    Proteins for which a request fails or returns nothing are left
    unchanged. DE lines without ``Full=`` are ignored; they used to be sliced
    at a fixed offset and could be accepted as a function.

    NOTE(review): the URLs are kept as written; confirm that this UniProt
    endpoint is still served.

    :param dic: mapping protein ID -> record, with ``record[0]`` the
        function; modified in place
    :return: the same ``dic`` object
    '''
    import requests

    unannotated_terms = ('hypothetical', 'uncharacterized', 'unknown')
    rejected_terms = ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']

    def full_name(entry):
        # e.g. "DE   RecName: Full=Endolysin {ECO:0000255|...};" -> "Endolysin"
        if entry[:2] != 'DE' or 'Full=' not in entry:
            return None
        name = entry.split('Full=', 1)[1].split(' {ECO', 1)[0]
        return name.strip().rstrip(';').strip()

    for accID, record in dic.items():
        if not any(term in record[0].lower() for term in unannotated_terms):
            continue
        fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
        result = requests.get(fullURL)
        accessions = result.text.split() if result.ok else []
        if not accessions:
            continue
        # Only the best-scoring accession is used; pasting the whole
        # multi-line list into the next URL would corrupt the query.
        uniprot_acc = accessions[0]
        fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
        result = requests.get(fullURL)
        if not result.ok:
            continue
        for entry in result.text.split('\n'):
            domain = full_name(entry)
            if domain is None:
                continue
            if not any(z in domain.lower() for z in rejected_terms) and len(domain) > 5:
                record[0] = domain
                break
    return dic
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
279
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def cdHit(self):
    '''
    Propagate known protein functions to unannotated proteins that CD-HIT places in the same cluster.

    Steps:
    1. If 'files/phagesProteins.fasta' is missing, it is written with self._create_fasta from self.phagesProteins.
    2. If 'files/complete_cdhit.clstr' is missing, cd-hit is run on that FASTA file.
    3. The cluster file is parsed. For every cluster, the distinct functions of its members found in
       self.known_function are collected, together with its members found in self.unknown_function.
       If the cluster has at least one known function and at least one unknown member, the function is
       written to self.phagesProteins[phage][prot][0] for every unknown member. When several different
       functions are present the user is asked on stdin which one to use.

    Member lines are parsed as '>' + protein ID + '-' + phage ID + '...'.
    NOTE(review): process_blastdb builds its identifiers as phage + '-' + prot; confirm which order
    _create_fasta actually writes.

    The changes are only made in memory; this method does not call saveDomains.
    :return: None
    '''
    import os
    from pathlib import Path

    if not Path('files/phagesProteins.fasta').is_file():
        self._create_fasta(self.phagesProteins, 'phagesProteins.fasta')
    if not Path('files/complete_cdhit.clstr').is_file():
        os.system('cd-hit -i files/phagesProteins.fasta -d 50 -o files/complete_cdhit')

    def annotate_cluster(members, functions):
        # Copy the function chosen among `functions` onto every (phage, prot) pair in `members`.
        if not members or not functions:
            return
        if len(functions) == 1:
            function = functions[0]
        else:
            x = int(input(str(functions) + '\nChoose from 1 to ' + str(len(functions)) + ': ')) - 1
            function = functions[x]
        for phage, prot in members:
            self.phagesProteins[phage][prot][0] = function

    temp_cluster = []  # (phage, prot) pairs of the current cluster that have no known function
    list_found = []  # distinct known functions seen in the current cluster, in order of appearance
    with open('files/complete_cdhit.clstr', 'r') as f:
        for line in f:
            if not line.strip():
                continue
            if '>Cluster' in line:
                # A new header closes the previous cluster.
                annotate_cluster(temp_cluster, list_found)
                temp_cluster = []
                list_found = []
                continue
            pos_i = line.find('>') + 1
            pos_f = line.find('...')
            pos_m = line.find('-')
            prot = line[pos_i:pos_m]
            phage = line[pos_m + 1:pos_f]
            if prot in self.known_function[phage]:
                function = self.known_function[phage][prot][0]
                if function not in list_found:
                    list_found.append(function)
            elif prot in self.unknown_function[phage]:
                temp_cluster.append((phage, prot))
    # The file does not end with a '>Cluster' header, so the last cluster has to be flushed here;
    # without this call it would never be annotated.
    annotate_cluster(temp_cluster, list_found)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
324
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def create_blast_db(self):
    '''
    Build a BLAST protein database from the proteins with known function and search it with the
    proteins of unknown function.

    self._create_fasta is called first for self.known_function ('database_phages.fasta'), then
    makeblastdb is run on it; the same is done for self.unknown_function ('unknown_phages.fasta'),
    followed by blastp. The tabular blastp output (-outfmt 6) is written to 'files/test_blast'.
    Both tools are started through os.system and their exit status is not inspected.
    :return: None
    '''
    import os

    build_database = 'makeblastdb -in files/database_phages.fasta -dbtype prot -title PhageProts -parse_seqids -out files/database_phages'
    search_database = 'blastp -db files/database_phages -query files/unknown_phages.fasta -out files/test_blast -num_threads 2 -outfmt 6'

    # Subject side: proteins whose function is known.
    self._create_fasta(self.known_function, 'database_phages.fasta')
    os.system(build_database)

    # Query side: proteins whose function is unknown.
    self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
    os.system(search_database)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
331
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def process_blastdb(self, blastdb):
    '''
    Transfer functions from BLAST hits to the proteins of unknown function and save the result.

    The tab-separated table 'files/<blastdb>' is read without a header. Column 0 is compared with
    phage + '-' + prot for every entry of self.unknown_function, column 1 is the hit identifier
    (split at its first '-' into phage and protein ID), column 10 is used as e-value and column 11
    as bit score. A protein is updated only when its lowest e-value is below 1.0 and its highest bit
    score is above 30.0; the hit with the highest bit score is used, and the function stored for that
    hit in self.known_function is written to self.phagesProteins[phage][prot][0].
    self.saveDomains() is called once at the end.

    :param blastdb: name of the BLAST output file inside the 'files' directory
    :return: None
    '''
    import pandas as pd

    try:
        blast_domains = pd.read_csv('files/' + blastdb, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty file contains no hits at all, so there is nothing to transfer.
        self.saveDomains()
        return

    # Group the hits by query once instead of filtering the whole table for every protein.
    hits_by_query = {query: hits for query, hits in blast_domains.groupby(0, sort=False)}

    for phage in self.unknown_function:
        for prot in self.unknown_function[phage]:
            pred = hits_by_query.get(phage + '-' + prot)
            if pred is None:
                # No hit for this protein: move on to the next one. (A 'break' here would skip
                # all remaining proteins of the phage.)
                continue
            evalue = [float(i) for i in pred[10]]
            bitscore = [float(i) for i in pred[11]]
            if not (min(evalue) < 1.0 and max(bitscore) > 30.0):
                continue
            # The hit with the highest bit score wins, even if another hit has the lowest e-value.
            ind = bitscore.index(max(bitscore))
            subject = pred.iloc[ind, 1]
            known_phage, _, known_prot = subject.partition('-')

            # Start from None so a value from a previous protein can never be reused.
            new_func = None
            if self.known_function[known_phage][known_prot]:
                new_func = self.known_function[known_phage][known_prot][0]
            # Kept from the original: if the complete hit identifier is itself a protein key of
            # some phage, that entry takes precedence.
            for j in self.known_function:
                if subject in self.known_function[j]:
                    new_func = self.known_function[j][subject][0]
                    break
            if new_func is not None:
                self.phagesProteins[phage][prot][0] = new_func
    self.saveDomains()
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
360
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def extract_bact_location(self):
    '''
    Download the precomputed PSORTb result files of the host bacteria listed in the NCBI data table.

    The 'Host_ID' column of 'files/NCBI_Phage_Bacteria_Data.csv' holds a Python-literal list of host
    identifiers per row; the last two characters of each identifier are dropped before comparison
    (presumably a version suffix such as '.1' - confirm). The PSORTdb download page is fetched and
    all URLs followed by a double quote are extracted from it. The matches are walked starting at
    index 1 in steps of 3: urls[i] carries the genome identifier (the text between its last '=' and
    its first double quote) and urls[i + 1] is the link that is downloaded.
    NOTE(review): this stride depends on the layout of the PSORTdb page; verify it if the page changes.

    Each result is stored as 'files/psort/<identifier>.faa.out'; files that already exist are not
    downloaded again.
    :return: None
    '''
    import pandas as pd
    import ast
    import requests
    import re
    from pathlib import Path

    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    # A set keeps the membership test below cheap however many hosts are listed.
    all_bact = set()
    for hosts in data['Host_ID']:
        for bact in ast.literal_eval(hosts):
            all_bact.add(bact[:-2])

    fullURL = 'https://db.psort.org/downloads/precomputed?version=3.00'
    result = requests.get(fullURL)
    psort = result.text.strip()
    # Raw string: the pattern is unchanged, but \w, \d and \S are no longer invalid string escapes.
    urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-z]+/\S+"{1}', psort)

    out_dir = Path('files/psort')
    out_dir.mkdir(parents=True, exist_ok=True)

    # Stop one short of the end so that urls[i + 1] always exists.
    for i in range(1, len(urls) - 1, 3):
        temp = urls[i]
        bact = temp[temp.rfind('=') + 1:temp.find('"')]
        if bact not in all_bact:
            continue
        my_file = out_dir / (bact + '.faa.out')
        if my_file.is_file():
            continue
        temp_url = urls[i + 1].strip('"')
        r = requests.get(temp_url)
        if not r.ok:
            # Do not store an HTTP error body as a result file: it would be treated as a valid
            # download on every later run.
            continue
        my_file.write_bytes(r.content)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
391
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def create_fasta_psort(self):
    '''
    Run a local PSORTb on every bacterium in 'files/bacteria' that has no result file yet.

    For each file in 'files/bacteria' (the name minus its last five characters, i.e. '.json', is used
    as identifier) without a matching 'files/psort/<identifier>.faa.out':
    1. the JSON file is loaded and written as 'files/psort/<identifier>.fasta' with self._create_fasta;
    2. './psortb -n -i <absolute fasta path> -r . -o long' is executed from the current directory;
    3. the file that appeared in the current directory during the run is moved to
       'files/psort/<identifier>.faa.out';
    4. the temporary FASTA file is removed, also when a step fails.

    NOTE(review): the original code was unfinished here (it tried to move a file with an empty name
    and used an absolute path inside one user's home directory). Step 3 assumes that psortb writes
    its result file into the directory given with '-r'; confirm against the installed PSORTb.

    :raises RuntimeError: if psortb leaves no new file in the current directory
    :return: None
    '''
    from pathlib import Path
    import json
    import os
    import subprocess

    psort_dir = Path('files/psort')
    for bact in os.listdir('files/bacteria'):
        name = bact[:-5]
        result_file = psort_dir / (name + '.faa.out')
        if result_file.is_file():
            continue
        with open('files/bacteria/' + bact, encoding='utf-8') as F:
            bact_prots = json.loads(F.read())
        fasta_file = psort_dir / (name + '.fasta')
        self._create_fasta(bact_prots, 'psort/' + name + '.fasta')
        try:
            # Snapshot the directory so that the file written by psortb can be recognised afterwards.
            before = set(os.listdir('.'))
            # Argument list instead of a shell string: file names are passed verbatim to psortb.
            subprocess.run(['./psortb', '-n', '-i', str(fasta_file.resolve()), '-r', '.', '-o', 'long'])
            produced = [entry for entry in os.listdir('.')
                        if entry not in before and os.path.isfile(entry)]
            if not produced:
                raise RuntimeError('psortb produced no output file for ' + name)
            # If several files appeared, take the one written last.
            newest = max(produced, key=os.path.getmtime)
            os.replace(newest, result_file)  # move and rename output
        finally:
            if fasta_file.is_file():
                os.remove(fasta_file)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
406
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def saveDomains(self):
    '''
    Write self.phagesProteins to disk as JSON.
    The file 'files/phagesProteins.json' is overwritten with the current content of the dictionary.
    :return: None
    '''
    import json
    with open('files/phagesProteins.json', 'w') as handle:
        serialized = json.dumps(self.phagesProteins)
        handle.write(serialized)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
419
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
420
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    test = DomainSearch()

    # The complete pipeline as (method name, positional arguments). The steps run strictly in this
    # order because later steps read files written by earlier ones.
    pipeline = (
        # PSORTb results for the host bacteria
        ('extract_bact_location', ()),
        ('create_fasta_psort', ()),
        # BLAST of unknown against known proteins; 'test_blast' is the blastp output file that
        # create_blast_db writes into the 'files' directory
        ('create_blast_db', ()),
        ('process_blastdb', ('test_blast',)),
        # CD-HIT clustering followed by the InterPro steps
        ('cdHit', ()),
        ('scanInterPro', ()),
        ('processInterPro', ()),
        # remaining domain filling, then the final save
        ('fillDomainsBLAST', ()),
        ('fillDomainsUniProt', ()),
        ('saveDomains', ()),
    )
    for step, arguments in pipeline:
        getattr(test, step)(*arguments)