annotate phage_host_prediction/domain_search.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
1
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
2 class DomainSearch:
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
3
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def __init__(self):
    '''
    Load the phage protein annotations and split them by annotation status.

    Reads ``files/phagesProteins.json`` into ``self.phagesProteins``, a dictionary
    phage ID -> protein ID -> record. The rest of the class reads index 0 of a
    record as the function description and index 1 as the sequence.
    Every phage whose ``Host_ID`` cell in ``files/NCBI_Phage_Bacteria_Data.csv``
    is the literal string ``'[]'`` is removed, then ``self._filter_phage()`` is called.

    This still needs a bit of modification.
    '''
    import json
    import pandas as pd

    with open('files/phagesProteins.json', encoding='utf-8') as F:
        self.phagesProteins = json.load(F)
    # TODO: load 'files/bactProteins.json' into self.bacProt the same way as the
    # phage proteins, more or less. Include psort.
    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    for phage in data.index:
        if data.loc[phage, 'Host_ID'] == '[]':
            # Phages listed in the CSV may be absent from the JSON; a missing key
            # is the only failure that is expected and safe to ignore here.
            self.phagesProteins.pop(phage, None)
    self._filter_phage()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
22
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _create_fasta(self, dic, name):
    '''
    Write every protein sequence of a nested dictionary to a FASTA file.

    :param dic: dictionary organism ID -> protein ID -> record; index 1 of each record is written as the sequence
    :param name: name of the file created inside the ``files/`` directory
    :return: None. Each entry is written as ``>organism-protein`` followed by its sequence
    '''
    with open('files/' + name, 'w') as handle:
        for organism, proteins in dic.items():
            for protein_id in proteins:
                header = '>' + organism + '-' + protein_id
                sequence = proteins[protein_id][1]
                handle.write(header + '\n' + sequence + '\n')
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
32
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def _filter_phage(self):
    '''
    Split ``self.phagesProteins`` into proteins with and without a usable function description.

    Rebuilds ``self.known_function`` and ``self.unknown_function`` (phage ID -> protein ID -> record).
    A description (index 0 of the record) counts as unknown when any of these holds:
    it has 3 characters or fewer, or contains one of the vague terms below (case-insensitive);
    it contains 'gp' and has no space in it; it is a single word shorter than 5 characters.
    :return: None
    '''
    # 'hyphothetical' is kept on purpose: it matches a misspelling of 'hypothetical'.
    vague_terms = ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']
    self.known_function = {}
    self.unknown_function = {}
    for phage, proteins in self.phagesProteins.items():
        known = self.known_function[phage] = {}
        unknown = self.unknown_function[phage] = {}
        for prot, record in proteins.items():
            func = record[0]
            lowered = func.lower()
            n_words = len(func.split(' '))
            if len(func) <= 3 or any(term in lowered for term in vague_terms):
                target = unknown
            elif 'gp' in lowered and n_words < 2:
                target = unknown
            elif n_words == 1 and len(func) < 5:
                target = unknown
            else:
                target = known
            target[prot] = record
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
45
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def scanInterPro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/'):
    '''
    Write the proteins of unknown function to a FASTA file and scan it with InterProScan.

    The FASTA file is produced by ``self._create_fasta`` (relative ``files/`` directory),
    while the command reads ``out_path + 'unknown_phages.fasta'``; the two locations
    presumably have to be the same directory - confirm for non-default paths.
    :param InterPro_path: directory prefix (with trailing slash) of ``interproscan.sh``
    :param out_path: directory prefix used for both the input FASTA and the ``interpro/domains_output`` base name
    :return: None. InterProScan is asked for tsv output under ``out_path``
    '''
    import os
    self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
    executable = InterPro_path + 'interproscan.sh'
    output_base = out_path + '/interpro/domains_output'
    fasta_file = out_path + 'unknown_phages.fasta'
    command = ' '.join([executable, '-b', output_base, '-i', fasta_file, '-f tsv'])
    os.system(command)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
56
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def iter_interpro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/interpro/'):
    '''
    Scan the proteins of unknown function with InterProScan in batches of 100.

    Each batch is written to ``files/interpro/temp_100.fasta`` and scanned into
    ``domains_output<count>``, where ``count`` is the running number of proteins seen.
    A full batch whose ``files/interpro/domains_output<count>.tsv`` already exists is
    skipped, so an interrupted run can be resumed. Skipped batches are now discarded
    instead of being carried into the next scan, and the FASTA file is always closed.
    The trailing partial batch is always scanned, as before.
    :param InterPro_path: directory prefix (with trailing slash) of ``interproscan.sh``
    :param out_path: directory prefix passed to InterProScan for the batch FASTA and the output base name;
                     presumably the same directory as the relative ``files/interpro/`` - confirm
    :return: None
    '''
    import os
    from pathlib import Path

    def run_batch(records, index):
        # Write the batch, close the file, then hand it to InterProScan.
        with open('files/interpro/temp_100.fasta', 'w') as F:
            F.writelines(records)
        os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(index) + ' -i ' + out_path + 'temp_100.fasta -f tsv')

    count = 0
    pending = []
    for phage in self.unknown_function:
        for prot in self.unknown_function[phage]:
            count += 1
            pending.append('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
            if count % 100 == 0:
                my_file = Path("files/interpro/domains_output" + str(count) + ".tsv")
                if not my_file.is_file():
                    run_batch(pending, count)
                # Start a fresh batch whether it was scanned now or in a previous run.
                pending = []
    if pending:
        run_batch(pending, count)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
76
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def processInterPro(self):
    '''
    Process the InterProScan tsv output and offer the domains found as protein functions.

    If ``files/interpro/domains_output.tsv`` does not exist it is first built by
    concatenating the files in ``files/interpro/`` (skipping the batch FASTA and the
    merged file itself). Rows whose column 3 is 'Coils' or 'MobiDBLite' are dropped.
    For every protein of ``self.phagesProteins`` found in the table, a description is
    taken from column 12 (only rows whose column 8 is a number below 1.0) and, when
    that one is not informative, from column 5. Columns are presumably the InterPro
    description, the score/e-value and the signature description - confirm against
    the InterProScan TSV layout. The user is asked once per distinct description;
    an answer containing 'y' stores it as the protein function (index 0 of the record).
    :return: None, ``self.phagesProteins`` is updated in place
    '''
    import os
    from pathlib import Path
    import pandas as pd
    import re

    my_file = Path("files/interpro/domains_output.tsv")
    if not my_file.is_file():
        with open('files/interpro/domains_output.tsv', 'w') as F:
            for file in os.listdir('files/interpro/'):
                # The merged file already exists once it is opened for writing,
                # so it must be excluded or it would be read into itself.
                if 'temp_100' in file or file == 'domains_output.tsv':
                    continue
                with open('files/interpro/' + file, 'r') as f:
                    F.write(f.read())
    domains = pd.read_csv('files/interpro/domains_output.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
    domains = domains.fillna('-')
    domains = domains[domains.loc[:, 3] != 'Coils']
    domains = domains[domains.loc[:, 3] != 'MobiDBLite']

    # If there are hits for this pattern, the description is discarded.
    gp_pattern = re.compile(r'(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])')
    vague_terms = ['unknown', 'ucp', 'uncharacterized', 'consensus']

    def is_informative(text):
        # Same acceptance rule for both the column 12 and the column 5 candidate.
        return (text != '-'
                and not any(z in text.lower() for z in vague_terms)
                and len(text) > 3
                and not gp_pattern.findall(text))

    def score_below_one(value):
        # Rows without a numeric score cannot pass the threshold.
        try:
            return float(value) < 1.0
        except (TypeError, ValueError):
            return False

    def pick(prot, column, check_score):
        # .loc returns a Series when the protein has several rows and a scalar when it has one.
        values = domains.loc[prot, column]
        scores = domains.loc[prot, 8]
        if not isinstance(values, pd.Series):
            if check_score and not score_below_one(scores):
                return '-'
            return values
        for value, score in zip(values, scores):
            # NOTE(review): this skips every description containing a hyphen, not only
            # the '-' placeholder; kept as in the original multi-row path - confirm intent.
            if '-' in value.lower():
                continue
            if check_score and not score_below_one(score):
                continue
            return value
        return '-'

    add_domains = {}  # description -> lower-cased answer, so each description is asked once
    for spec in self.phagesProteins:
        for prot in self.phagesProteins[spec]:
            if prot not in domains.index:
                continue
            temp = pick(prot, 12, True)
            if not is_informative(temp):
                temp = pick(prot, 5, False)
                if not is_informative(temp):
                    continue
            if temp not in add_domains:
                add_domains[temp] = input('Add function: ' + temp).lower()
            if 'y' in add_domains[temp]:
                self.phagesProteins[spec][prot][0] = temp
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
132
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def find_domains_interpro(self, dic):
    '''
    Scan the proteins of a single phage with InterProScan and use the domains found as functions.

    The proteins are written to ``files/SinglePhageProteins.fasta`` with the protein ID
    as header, so that the IDs in the tsv output match the keys of ``dic``. The result
    is read from ``files/single_phage_domains.tsv``; the InterProScan paths are hard-coded
    absolute paths, presumably pointing at the same ``files/`` directory - confirm.
    A description is taken from column 12 and, when that one is not informative,
    from column 5.
    :param dic: dictionary protein ID -> record, index 0 being the function and index 1 the sequence
    :return: the same dictionary, with index 0 replaced wherever an informative description was found
    '''
    import os
    import pandas as pd
    import re
    InterPro_path = '/home/pedro-linux/Downloads/interproscan-5.46-81.0/'
    out_path = '/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/WholeProcess/files/'
    with open('files/SinglePhageProteins.fasta', 'w') as F:
        for prot in dic:
            # Header must be the protein ID: the results are looked up by ID below.
            F.write('>' + prot + '\n' + dic[prot][1] + '\n')
    os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'single_phage_domains -i ' + out_path + 'SinglePhageProteins.fasta -f tsv')

    domains = pd.read_csv('files/single_phage_domains.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
    domains = domains.fillna('-')

    # If there are hits for this pattern, the description is discarded.
    gp_pattern = re.compile(r'(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])')

    def is_informative(text):
        return (text != '-' and 'unknown' not in text and 'UCP' not in text
                and len(text) > 3 and not gp_pattern.findall(text))

    def pick(prot, column):
        # .loc returns a Series when the protein has several rows and a scalar when it has one.
        values = domains.loc[prot, column]
        if not isinstance(values, pd.Series):
            return values
        for value in values:
            lowered = value.lower()
            # The placeholder test is done on the column being read (the column 5
            # fallback used to test column 12 and could never succeed).
            if 'coil' not in lowered and '-' not in lowered:
                return value
        return '-'

    for prot in dic:
        if prot not in domains.index:
            continue
        temp = pick(prot, 12)
        if not is_informative(temp):
            temp = pick(prot, 5)
        if is_informative(temp):
            dic[prot][0] = temp
    return dic
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
171
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def fillDomainsBLAST(self):
    '''
    Use the NCBIWWW package to BLAST proteins of unknown function and adopt the first informative hit.

    Proteins whose function contains 'hypothetical', 'uncharacterized' or 'unknown' are
    queried with blastp against nr; the definition of the first alignment that is neither
    hypothetical nor uncharacterized (cut at the first ' [') replaces the function.
    Phages already processed are remembered in the pickle file ``files/phage_list_blast``,
    which is rewritten after every phage, followed by a call to ``self.saveDomains()``.
    :return: None, ``self.phagesProteins`` is updated in place
    '''
    print('Finding functions/domains with BLAST')
    from Bio.Blast import NCBIWWW
    from Bio.Blast import NCBIXML
    import pickle
    from pathlib import Path
    my_file = Path("files/phage_list_blast")
    if my_file.is_file():
        # NOTE: pickle is only safe because this file is produced locally by this method.
        with open('files/phage_list_blast', 'rb') as f:
            list_done = pickle.load(f)
    else:
        list_done = []
    vague_terms = ('hypothetical', 'uncharacterized', 'unknown')
    for spec in self.phagesProteins:
        if spec in list_done:
            continue
        for prot in self.phagesProteins[spec]:
            record = self.phagesProteins[spec][prot]
            if not any(term in record[0].lower() for term in vague_terms):
                continue
            result_handle = NCBIWWW.qblast('blastp', 'nr', record[1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
            blastout = NCBIXML.read(result_handle)
            for ali in blastout.alignments:
                hit_def = ali.hit_def
                if 'hypothetical' not in hit_def.lower() and 'uncharacterized' not in hit_def.lower():
                    # split keeps the whole definition when there is no ' [' part;
                    # slicing with find() == -1 used to drop the last character.
                    function = hit_def.split(' [', 1)[0]
                    print(function)
                    record[0] = function
                    break
        # Checkpoint after every phage so an interrupted run can be resumed.
        list_done.append(spec)
        with open('files/phage_list_blast', 'wb') as f:
            pickle.dump(list_done, f)
        self.saveDomains()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
204
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def find_domains_blast(self, dic):
    '''
    BLAST the proteins of unknown function of a single phage and adopt the first informative hit.

    Proteins whose function contains 'hypothetical', 'uncharacterized' or 'unknown' are
    queried with blastp against nr; the definition of the first alignment that is neither
    hypothetical nor uncharacterized (cut at the first ' [') replaces the function.
    :param dic: dictionary protein ID -> record, index 0 being the function
    :return: the same dictionary, updated in place
    '''
    from Bio.Blast import NCBIWWW
    from Bio.Blast import NCBIXML

    vague_terms = ('hypothetical', 'uncharacterized', 'unknown')
    for prot in dic:
        if not any(term in dic[prot][0].lower() for term in vague_terms):
            continue
        # NOTE(review): the query is the dictionary key, presumably an accession;
        # fillDomainsBLAST sends the sequence (index 1) instead - confirm which is intended.
        result_handle = NCBIWWW.qblast('blastp', 'nr', prot, entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
        blastout = NCBIXML.read(result_handle)
        for ali in blastout.alignments:
            hit_def = ali.hit_def
            if 'hypothetical' not in hit_def.lower() and 'uncharacterized' not in hit_def.lower():
                # split keeps the whole definition when there is no ' [' part.
                function = hit_def.split(' [', 1)[0]
                print(function)
                # The result belongs in the dictionary given by the caller
                # ('spec' is not defined in this method).
                dic[prot][0] = function
                break
    return dic
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
219
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def fillDomainsUniProt(self):
    '''
    Use the UniProt website to find named sequences identical to the proteins of unknown function.

    For every protein whose function contains 'hypothetical', 'uncharacterized' or 'unknown',
    the protein ID is searched in UniProt, the first accession returned is used to query its
    100% identity cluster, and the first acceptable ``Full=`` name found in the DE lines of the
    answer replaces the function. Phages already processed are remembered in the pickle file
    ``files/phage_list_uniprot``, which is rewritten after every phage, followed by a call to
    ``self.saveDomains()``.
    :return: None, ``self.phagesProteins`` is updated in place
    '''
    print('Finding functions/domains with UniProt')
    import requests
    import pickle
    from pathlib import Path

    vague_terms = ('hypothetical', 'uncharacterized', 'unknown')
    rejected = ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']

    def name_from_de_line(entry):
        # Only DE lines carrying a 'Full=' name are usable; other DE lines
        # (e.g. 'Flags:' or 'EC=') have no name to extract.
        if entry[:2] != 'DE' or 'Full=' not in entry:
            return None
        name = entry.split('Full=', 1)[1].split(' {ECO', 1)[0]
        return name.strip().rstrip(';')

    my_file = Path("files/phage_list_uniprot")
    if my_file.is_file():
        # NOTE: pickle is only safe because this file is produced locally by this method.
        with open('files/phage_list_uniprot', 'rb') as f:
            list_done = pickle.load(f)
    else:
        list_done = []
    for phage in self.phagesProteins:
        if phage in list_done:
            continue
        for accID in self.phagesProteins[phage]:
            record = self.phagesProteins[phage][accID]
            if not any(term in record[0].lower() for term in vague_terms):
                continue
            fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
            result = requests.get(fullURL)
            accessions = result.text.split()
            if not accessions:
                continue  # nothing found for this protein
            # The list is sorted by score; only the best accession is used.
            uniprot_acc = accessions[0]
            fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
            result = requests.get(fullURL)
            for entry in result.text.split('\n'):
                domain = name_from_de_line(entry)
                if domain is None:
                    continue
                if not any(z in domain.lower() for z in rejected) and len(domain) > 5:
                    print(domain)
                    record[0] = domain
                    break
        # Checkpoint after every phage so an interrupted run can be resumed.
        list_done.append(phage)
        with open('files/phage_list_uniprot', 'wb') as f:
            pickle.dump(list_done, f)
        self.saveDomains()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
259
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def find_domains_uniprot(self, dic):
    '''
    Use the UniProt website to find named sequences identical to the unknown proteins of a single phage.

    For every protein whose function contains 'hypothetical', 'uncharacterized' or 'unknown',
    the protein ID is searched in UniProt, the first accession returned is used to query its
    100% identity cluster, and the first acceptable ``Full=`` name found in the DE lines of the
    answer replaces the function.
    :param dic: dictionary protein ID -> record, index 0 being the function
    :return: the same dictionary, updated in place
    '''
    import requests

    vague_terms = ('hypothetical', 'uncharacterized', 'unknown')
    rejected = ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']

    def name_from_de_line(entry):
        # Only DE lines carrying a 'Full=' name are usable; other DE lines
        # (e.g. 'Flags:' or 'EC=') have no name to extract.
        if entry[:2] != 'DE' or 'Full=' not in entry:
            return None
        name = entry.split('Full=', 1)[1].split(' {ECO', 1)[0]
        return name.strip().rstrip(';')

    for accID in dic:
        if not any(term in dic[accID][0].lower() for term in vague_terms):
            continue
        fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
        result = requests.get(fullURL)
        accessions = result.text.split()
        if not accessions:
            continue  # nothing found for this protein
        # The list is sorted by score; only the best accession is used.
        uniprot_acc = accessions[0]
        fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
        result = requests.get(fullURL)
        for entry in result.text.split('\n'):
            domain = name_from_de_line(entry)
            if domain is None:
                continue
            if not any(z in domain.lower() for z in rejected) and len(domain) > 5:
                dic[accID][0] = domain
                break
    return dic
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
279
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def cdHit(self):
    '''
    Propagate known functions to unannotated proteins through CD-HIT clusters.

    Creates 'files/phagesProteins.fasta' (via self._create_fasta) and runs
    cd-hit on it when the corresponding outputs do not exist yet, then reads
    'files/complete_cdhit.clstr'. For every cluster that holds at least one
    protein found in self.known_function and at least one found in
    self.unknown_function, the known function is written to
    self.phagesProteins[phage][prot][0] of each unknown member. When a cluster
    holds several distinct known functions the user is asked to pick one.

    Cluster members are parsed as 'prot-phage', split on the first '-'.
    NOTE(review): process_blastdb parses ids as 'phage-prot'; confirm the
    orientation that _create_fasta actually writes.

    :return: None; self.phagesProteins is modified in place
    :raises subprocess.CalledProcessError: if cd-hit exits with an error
    '''
    import subprocess
    from pathlib import Path

    if not Path('files/phagesProteins.fasta').is_file():
        self._create_fasta(self.phagesProteins, 'phagesProteins.fasta')
    if not Path('files/complete_cdhit.clstr').is_file():
        # check=True: stop here instead of failing later on a missing .clstr
        subprocess.run(
            ['cd-hit', '-i', 'files/phagesProteins.fasta', '-d', '50',
             '-o', 'files/complete_cdhit'],
            check=True,
        )

    def choose_function(candidates):
        # Single candidate: nothing to ask. Otherwise prompt until the
        # answer is a valid 1-based position in the list.
        if len(candidates) == 1:
            return candidates[0]
        prompt = str(candidates) + '\nChoose from 1 to ' + str(len(candidates)) + ': '
        while True:
            try:
                choice = int(input(prompt))
            except ValueError:
                continue
            if 1 <= choice <= len(candidates):
                return candidates[choice - 1]

    def flush_cluster(members, candidates):
        # Assign the chosen function to every unannotated member of a cluster.
        if not (members and candidates):
            return
        function = choose_function(candidates)
        for member in members:
            prot, _, phage = member.partition('-')
            self.phagesProteins[phage][prot][0] = function

    unknown_members = []   # 'prot-phage' ids lacking a function
    known_functions = []   # distinct known functions, in order of appearance
    with open('files/complete_cdhit.clstr', 'r') as f:
        for line in f:
            if '>Cluster' in line:
                flush_cluster(unknown_members, known_functions)
                unknown_members = []
                known_functions = []
                continue
            start = line.find('>') + 1
            end = line.find('...')
            if start == 0 or end == -1:
                continue  # blank or malformed line, not a cluster member
            member = line[start:end]
            prot, _, phage = member.partition('-')
            known = self.known_function.get(phage, {})
            if prot in known:
                function = known[prot][0]
                if function not in known_functions:
                    known_functions.append(function)
            elif prot in self.unknown_function.get(phage, {}):
                unknown_members.append(member)
    # The file does not end with a '>Cluster' header, so the final cluster
    # must be flushed explicitly.
    flush_cluster(unknown_members, known_functions)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
324
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def create_blast_db(self):
    '''
    Build a BLAST protein database from the annotated proteins and query it
    with the unannotated ones.

    Steps, in order:
      1. write self.known_function to 'database_phages.fasta' via
         self._create_fasta and index it with makeblastdb as
         'files/database_phages';
      2. write self.unknown_function to 'unknown_phages.fasta' and run blastp
         against that database, saving tabular output (-outfmt 6) to
         'files/test_blast'.

    :return: None
    :raises subprocess.CalledProcessError: if makeblastdb or blastp fails
    '''
    import subprocess

    self._create_fasta(self.known_function, 'database_phages.fasta')
    # check=True so a failed indexing step is not followed by a blastp run
    # against a missing or stale database.
    subprocess.run(
        ['makeblastdb', '-in', 'files/database_phages.fasta',
         '-dbtype', 'prot', '-title', 'PhageProts', '-parse_seqids',
         '-out', 'files/database_phages'],
        check=True,
    )
    self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
    subprocess.run(
        ['blastp', '-db', 'files/database_phages',
         '-query', 'files/unknown_phages.fasta',
         '-out', 'files/test_blast', '-num_threads', '2', '-outfmt', '6'],
        check=True,
    )
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
331
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def process_blastdb(self, blastdb):
    '''
    Transfer functions from BLAST hits to proteins of unknown function.

    Reads the tab-separated BLAST table 'files/<blastdb>' (no header; column
    0 is the query id, 1 the subject id, 10 the e-value, 11 the bit score).
    For each protein in self.unknown_function whose hits include an e-value
    below 1.0 and a bit score above 30.0, the hit with the highest bit score
    is selected and the function of that subject is written to
    self.phagesProteins[phage][prot][0]. Finally self.saveDomains() is called.

    Query and subject ids are parsed as 'phage-prot', split on the first '-'.
    NOTE(review): cdHit parses ids as 'prot-phage'; confirm the orientation
    that _create_fasta actually writes.

    :param blastdb: name of the BLAST output file inside 'files/'
    :return: None; self.phagesProteins is modified in place
    '''
    import pandas as pd

    try:
        blast_domains = pd.read_csv('files/' + blastdb, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty output file means BLAST reported no hits at all.
        blast_domains = None

    if blast_domains is not None:
        # Group once by query id instead of filtering the table per protein.
        hits_by_query = dict(iter(blast_domains.groupby(0)))
        for phage, proteins in self.unknown_function.items():
            for prot in proteins:
                pred = hits_by_query.get(phage + '-' + prot)
                if pred is None:
                    # No hits for this protein; keep going with the others.
                    continue
                evalues = pred[10].astype(float)
                bitscores = pred[11].astype(float)
                if not (evalues.min() < 1.0 and bitscores.max() > 30.0):
                    continue
                # First hit carrying the highest bit score.
                subject_id = str(pred.loc[bitscores.idxmax(), 1])
                known_phage, _, known_prot = subject_id.partition('-')
                entry = self.known_function.get(known_phage, {}).get(known_prot)
                new_func = entry[0] if entry else None
                # Lookup kept from the original code: the complete subject id
                # used directly as a protein key takes precedence when found.
                for known_proteins in self.known_function.values():
                    if subject_id in known_proteins:
                        new_func = known_proteins[subject_id][0]
                        break
                if new_func is not None:
                    self.phagesProteins[phage][prot][0] = new_func
    self.saveDomains()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
360
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def extract_bact_location(self):
    '''
    Download precomputed PSORTb results for the host bacteria in the dataset.

    Host ids are read from the 'Host_ID' column of
    'files/NCBI_Phage_Bacteria_Data.csv' (each cell is a Python list literal)
    and their last two characters are dropped
    (presumably a '.N' version suffix - TODO confirm). The PSORTdb
    precomputed-downloads page is then scraped for links; for every organism
    that is one of the hosts and has no 'files/psort/<id>.faa.out' yet, the
    linked result file is downloaded to that path.

    NOTE(review): the scraping assumes links come in groups of three, with
    the organism link at positions 1, 4, 7, ... and its download link right
    after it; this depends on the page layout and should be re-checked if the
    site changes.

    :return: None
    :raises requests.HTTPError: if the index page or a download fails
    '''
    import ast
    import re
    from pathlib import Path

    import pandas as pd
    import requests

    data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
    all_bact = set()
    for host_ids in data['Host_ID']:
        for bact in ast.literal_eval(host_ids):
            all_bact.add(bact[:-2])

    result = requests.get('https://db.psort.org/downloads/precomputed?version=3.00', timeout=60)
    result.raise_for_status()
    # Raw string: the pattern contains \w, \d and \S, which are invalid
    # escapes in a normal string literal. Matches include the closing quote.
    urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-z]+/\S+"', result.text.strip())

    out_dir = Path('files/psort')
    out_dir.mkdir(parents=True, exist_ok=True)
    # Stop one short of the end so urls[i + 1] always exists.
    for i in range(1, len(urls) - 1, 3):
        link = urls[i]
        bact = link[link.rfind('=') + 1:link.find('"')]
        if bact not in all_bact:
            continue
        target = out_dir / (bact + '.faa.out')
        if target.is_file():
            continue
        r = requests.get(urls[i + 1].strip('"'), timeout=60)
        # Do not store an HTTP error page as if it were PSORTb output.
        r.raise_for_status()
        target.write_bytes(r.content)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
391
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def create_fasta_psort(self):
    '''
    Run PSORTb locally for every bacterium that has no result file yet.

    For each file in 'files/bacteria' (its name minus the last five
    characters is used as the organism id - presumably stripping '.json',
    TODO confirm) without a matching 'files/psort/<id>.faa.out':
      1. load its proteins from JSON and write them to
         'files/psort/<id>.fasta' via self._create_fasta;
      2. run './psortb -n -i <fasta> -r . -o long' from the current
         directory;
      3. move the file PSORTb created in the current directory to
         'files/psort/<id>.faa.out';
      4. delete the temporary FASTA file.

    NOTE(review): the original code moved '' (an empty path) and hard-coded
    an absolute home directory. The output file is now located by comparing
    the directory listing before and after the run, and the FASTA path is
    resolved from the current working directory. Confirm both against a real
    PSORTb installation.

    :return: None
    :raises subprocess.CalledProcessError: if psortb exits with an error
    :raises FileNotFoundError: if psortb produced no new file
    '''
    import json
    import os
    import subprocess
    from pathlib import Path

    psort_dir = Path('files/psort')
    for bact in os.listdir('files/bacteria'):
        name = bact[:-5]
        result_file = psort_dir / (name + '.faa.out')
        if result_file.is_file():
            continue
        with open('files/bacteria/' + bact, encoding='utf-8') as F:
            bact_prots = json.loads(F.read())
        self._create_fasta(bact_prots, 'psort/' + name + '.fasta')
        fasta_file = psort_dir / (name + '.fasta')
        before = set(os.listdir('.'))
        try:
            subprocess.run(
                ['./psortb', '-n', '-i', str(fasta_file.resolve()),
                 '-r', '.', '-o', 'long'],
                check=True,
            )
            produced = [Path(p) for p in set(os.listdir('.')) - before]
            produced = [p for p in produced if p.is_file()]
            if not produced:
                raise FileNotFoundError('psortb produced no output file for ' + name)
            # If several files appeared, take the most recently written one.
            newest = max(produced, key=lambda p: p.stat().st_mtime)
            os.replace(newest, result_file)  # move and rename output
        finally:
            # Remove the temporary FASTA even when psortb fails.
            if fasta_file.exists():
                fasta_file.unlink()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
406
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def saveDomains(self):
    '''
    Write the current protein annotations back to disk.

    Serialises self.phagesProteins as JSON to 'files/phagesProteins.json',
    overwriting the file that __init__ loads.

    :return: None
    '''
    import json
    # Explicit UTF-8 to match how __init__ reads this file.
    with open('files/phagesProteins.json', 'w', encoding='utf-8') as f:
        json.dump(self.phagesProteins, f)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
419
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
420
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: run the whole annotation pipeline in order.
    pipeline = DomainSearch()

    # Subcellular localisation of host proteins (PSORTb).
    pipeline.extract_bact_location()
    pipeline.create_fasta_psort()

    # Function transfer by BLAST against annotated phage proteins.
    pipeline.create_blast_db()
    pipeline.process_blastdb('test_blast')

    # Function transfer by clustering, then InterPro domain search.
    pipeline.cdHit()
    pipeline.scanInterPro()
    pipeline.processInterPro()

    # Fill the remaining gaps and persist the result.
    pipeline.fillDomainsBLAST()
    pipeline.fillDomainsUniProt()
    pipeline.saveDomains()
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
436 test.saveDomains()