view domain_search.py @ 2:8674f554d76b draft

Uploaded
author pedro_araujo
date Fri, 29 Jan 2021 16:01:12 +0000
parents e4b3fc88efe0
children
line wrap: on
line source


class DomainSearch:

	def __init__(self):
		'''
		This still needs a bit of modifications
		:param phagesProteins: protein function and sequences, as provided in NCBI. Each phage ID has every protein represented with a dicionary with keys as protein IDs
		:param phageDomains: for each phage and each of it's proteins, a list of predicted domains is given. If unavailable, it returns an empty list
		'''
		import json
		import pandas as pd
		with open('files/phagesProteins.json', encoding='utf-8') as F:
			self.phagesProteins = json.loads(F.read())
		# with open('files/bactProteins.json', encoding='utf-8') as F: # For later use, implement the same way as phage, more or less. Include psort
		# 	self.bacProt = json.loads(F.read())
		data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
		for phage in data.index:
			if data.loc[phage, 'Host_ID'] == '[]':
				try: del self.phagesProteins[phage]
				except: pass
		self._filter_phage()

	def _create_fasta(self, dic, name):
		'''
		Creates a fasta file containing every protein sequence for a given dictionary.
		:return:
		'''
		with open('files/' + name, 'w') as F:
			for org in dic:
				for prot in dic[org]:
					F.write('>' + org + '-' + prot + '\n' + dic[org][prot][1] + '\n')

	def _filter_phage(self):
		self.known_function = {}
		self.unknown_function = {}
		for phage in self.phagesProteins.keys():
			self.known_function[phage] = {}
			self.unknown_function[phage] = {}
			for prot in self.phagesProteins[phage].keys():
				func = self.phagesProteins[phage][prot][0]
				if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5):
					self.known_function[phage][prot] = self.phagesProteins[phage][prot]
				else:
					self.unknown_function[phage][prot] = self.phagesProteins[phage][prot]

	def scanInterPro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/'):
		'''
		Creates a fasta file containing every protein and scans it using Interproscan. Creates a tsv file
		:param InterPro_path: path to the interproscan executable
		:param out_path: path to save the tsv output
		:return: domains_output.tsv, a file that contains the domain associated with each protein
		'''
		import os
		self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
		os.system(InterPro_path + 'interproscan.sh -b ' + out_path + '/interpro/domains_output -i ' + out_path + 'unknown_phages.fasta -f tsv')

	def iter_interpro(self, InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/', out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/interpro/'):
		import os
		from pathlib import Path
		count = 0
		F = open('files/interpro/temp_100.fasta', 'w')
		for phage in self.unknown_function:
			for prot in self.unknown_function[phage]:
				count += 1
				my_file = Path("files/interpro/domains_output" + str(count) + ".tsv")
				if count % 100 == 0 and not my_file.is_file():
					F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
					F.close()
					os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv')
					F = open('files/interpro/temp_100.fasta', 'w')
				else:
					F.write('>' + prot + '\n' + self.unknown_function[phage][prot][1] + '\n')
		if count % 100 != 0:
			F.close()
			os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'domains_output' + str(count) + ' -i ' + out_path + 'temp_100.fasta -f tsv')

	def processInterPro(self):
		'''
		Processes the tsv file created from scanInterPro. Domains are saved in the protdomains variable.
		:return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
		'''
		import os
		from pathlib import Path
		import pandas as pd
		import re
		my_file = Path("files/interpro/domains_output.tsv")
		if not my_file.is_file():
			with open('files/interpro/domains_output.tsv', 'w') as F:
				for file in os.listdir('files/interpro/'):
					if 'temp_100' not in file:
						with open('files/interpro/' + file, 'r') as f:
							F.write(f.read())
		domains = pd.read_csv('files/interpro/domains_output.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
		domains = domains.fillna('-')
		domains = domains[domains.loc[:, 3] != 'Coils']
		domains = domains[domains.loc[:, 3] != 'MobiDBLite']
		# domains = domains.groupby(domains.index).last()
		add_domains = {}
		for spec in self.phagesProteins:
			for prot in self.phagesProteins[spec]:
				if prot in domains.index:
					temp = '-'
					try:
						for i in range(domains.loc[prot, :].shape[0]):
							if '-' not in domains.loc[prot, 12].iloc[i].lower():
								if float(domains.loc[id, 8].iloc[i]) < 1.0:
									temp = domains.loc[id, 12].iloc[i]
								break
					except:
						if float(domains.loc[id, 8]) < 1.0:
							temp = domains.loc[id, 12]
					x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover
					if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
						if temp not in add_domains.keys():
							add_domains[temp] = input('Add function: ' + temp).lower()
						if 'y' in add_domains[temp]:
							self.phagesProteins[spec][prot][0] = temp
					else:
						try:
							for i in range(domains.loc[prot, :].shape[0]):
								if '-' not in domains.loc[prot, 5].iloc[i].lower():
									temp = domains.loc[prot, 5].iloc[i]
									break
						except:
							temp = domains.loc[prot, 5]
						x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
						if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
							if temp not in add_domains.keys():
								add_domains[temp] = input('Add function: ' + temp).lower()
							if 'y' in add_domains[temp]:
								self.phagesProteins[spec][prot][0] = temp

	def find_domains_interpro(self, dic):
		import os
		import pandas as pd
		import re
		InterPro_path='/home/pedro-linux/Downloads/interproscan-5.46-81.0/'
		out_path='/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/WholeProcess/files/'
		with open('files/SinglePhageProteins.fasta', 'w') as F:
			for prot in dic.keys():
				F.write('>' + dic[prot][0] + '\n' + dic[prot][1] + '\n')
		os.system(InterPro_path + 'interproscan.sh -b ' + out_path + 'single_phage_domains -i ' + out_path + 'SinglePhageProteins.fasta -f tsv')

		domains = pd.read_csv('files/single_phage_domains.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
		domains = domains.fillna('-')
		for prot in dic:
			if prot in domains.index:
				temp = '-'
				try:
					for i in range(domains.loc[prot, :].shape[0]):
						if 'coil' not in domains.loc[prot, 12].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower():
							temp = domains.loc[prot, 12].iloc[i]
							break
				except:
					temp = domains.loc[prot, 12]
				x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp) # se tiver hits, remover
				if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp)>3 and not x:
					dic[prot][0] = temp
				else:
					try:
						for i in range(domains.loc[prot, :].shape[0]):
							if 'coil' not in domains.loc[prot, 5].iloc[i].lower() and '-' not in domains.loc[prot, 12].iloc[i].lower():
								temp = domains.loc[prot, 5].iloc[i]
								break
					except:
						temp = domains.loc[prot, 5]
					x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
					if temp != '-' and 'unknown' not in temp and 'UCP' not in temp and len(temp) > 3 and not x:
						dic[prot][0] = temp
		return dic

	def fillDomainsBLAST(self):
		'''
		Using the NCBIWWW package, it searches for domains with BLAST. Domains are saved in the protdomains variable.
		:return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
		'''
		print('Finding functions/domains with BLAST')
		from Bio.Blast import NCBIWWW
		from Bio.Blast import NCBIXML
		import pickle
		from pathlib import Path
		my_file = Path("files/phage_list_blast")
		if my_file.is_file():
			with open('files/phage_list_blast', 'rb') as f:
				list_done = pickle.load(f)
		else:
			list_done = []
		for spec in self.phagesProteins:
			if spec not in list_done:
				for prot in self.phagesProteins[spec]:
					if 'hypothetical' in self.phagesProteins[spec][prot][0].lower() or 'uncharacterized' in self.phagesProteins[spec][prot][0].lower() or 'unknown' in self.phagesProteins[spec][prot][0].lower():
					# if not self.phageDomains[bac][prot]:
						result_handle = NCBIWWW.qblast('blastp', 'nr', self.phagesProteins[spec][prot][1], entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
						blastout = NCBIXML.read(result_handle)
						for ali in blastout.alignments:
							if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower():
								print(ali.hit_def[:ali.hit_def.find(' [')])
								self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')]
								break
				list_done.append(spec)
				with open('files/phage_list_blast', 'wb') as f:
					pickle.dump(list_done, f)
				self.saveDomains()

	def find_domains_blast(self, dic):
		from Bio.Blast import NCBIWWW
		from Bio.Blast import NCBIXML

		for prot in dic.keys():
			if 'hypothetical' in dic[prot][0].lower() or 'uncharacterized' in dic[prot][0].lower() or 'unknown' in dic[prot][0].lower():
				result_handle = NCBIWWW.qblast('blastp', 'nr', prot, entrez_query='Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)')
				blastout = NCBIXML.read(result_handle)
				for ali in blastout.alignments:
					if 'hypothetical' not in ali.hit_def.lower() and 'uncharacterized' not in ali.hit_def.lower():
						print(ali.hit_def[:ali.hit_def.find(' [')])
						self.phagesProteins[spec][prot][0] = ali.hit_def[:ali.hit_def.find(' [')]
						break
		return dic

	def fillDomainsUniProt(self):
		'''
		Using the UniProt website, similar sequences are obtained and the ones with function assigned are saved into the domains. Domains are saved in the protdomains variable.
		:return: phageDomains, a dictionary that, for each protein in a given species, has domains associated
		'''
		print('Finding functions/domains with UniProt')
		import requests
		import pickle
		from pathlib import Path
		my_file = Path("files/phage_list_uniprot")
		if my_file.is_file():
			with open('files/phage_list_uniprot', 'rb') as f:
				list_done = pickle.load(f)
		else:
			list_done = []
		for phage in self.phagesProteins:
			if phage not in list_done:
				for accID in self.phagesProteins[phage]:
					if 'hypothetical' in self.phagesProteins[phage][accID][0].lower() or 'uncharacterized' in self.phagesProteins[phage][accID][0].lower() or 'unknown' in self.phagesProteins[phage][accID][0].lower():
					# if not self.phageDomains[phage][accID]:
						fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
						result = requests.get(fullURL)
						uniprot_acc = result.text.strip()
						fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
						result = requests.get(fullURL)
						listResults = result.text.split('\n')
						for entry in listResults:
							if entry[:2] == 'DE':
								start_pos = entry.find('Full=') + 5
								end_pos = entry.find(' {ECO')
								domain = entry[start_pos:end_pos]
								if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5:
									print(domain)
									self.phagesProteins[phage][accID][0] = domain
									break
				list_done.append(phage)
				with open('files/phage_list_uniprot', 'wb') as f:
					pickle.dump(list_done, f)
				self.saveDomains()

	def find_domains_uniprot(self, dic):
		import requests
		for accID in dic.keys():
			if 'hypothetical' in dic[accID][0].lower() or 'uncharacterized' in dic[accID][0].lower() or 'unknown' in dic[accID][0].lower():
				fullURL = ('https://www.uniprot.org/uniprot/?query=' + accID + '&sort=score&format=list')
				result = requests.get(fullURL)
				uniprot_acc = result.text.strip()
				fullURL = ('https://www.uniprot.org/uniprot/?query=cluster:(uniprot:' + uniprot_acc + '* identity:1.0) not id:' + uniprot_acc + '&format=txt')
				result = requests.get(fullURL)
				listResults = result.text.split('\n')
				for entry in listResults:
					if entry[:2] == 'DE':
						start_pos = entry.find('Full=') + 5
						end_pos = entry.find(' {ECO')
						domain = entry[start_pos:end_pos]
						if not any(z in domain.lower() for z in ['uncharacterized', 'flags', 'domain', 'bacteriophage protein', 'family protein', 'phage-like', 'phage protein', 'unassigned', 'orf', 'gene']) and len(domain) > 5:
							dic[accID][0] = domain
							break
		return dic

	def cdHit(self):
		import os
		from pathlib import Path
		my_file = Path('files/phagesProteins.fasta')
		if not my_file.is_file():
			self._create_fasta(self.phagesProteins, 'phagesProteins.fasta')
		my_file = Path('files/complete_cdhit.clstr')
		if not my_file.is_file():
			os.system('cd-hit -i files/phagesProteins.fasta -d 50 -o files/complete_cdhit')
		# clusters = {}
		temp_cluster = []
		list_found = []
		found = False
		with open('files/complete_cdhit.clstr', 'r') as f:
			for line in f.readlines():
				if '>Cluster' in line:
					if temp_cluster and found:
						if len(list_found) == 1:
							function = list_found[0]
						else:
							x = int(input(str(list_found) + '\nChoose from 1 to ' + str(len(list_found)) + ': ')) - 1
							function = list_found[x]
						for clust in temp_cluster:
							self.phagesProteins[clust[clust.find('-') + 1:]][clust[:clust.find('-')]][0] = function

					temp_cluster = []
					list_found = []
					found = False
				else:
					pos_i = line.find('>') + 1
					pos_f = line.find('...')
					pos_m = line.find('-')
					prot = line[pos_i:pos_m]
					phage = line[pos_m + 1:pos_f]
					if prot in self.known_function[phage].keys() and not found:
						function = self.known_function[phage][prot][0]
						list_found.append(function)
						found = True
					elif prot in self.known_function[phage].keys() and found:
						if function != self.known_function[phage][prot][0] and self.known_function[phage][prot][0] not in list_found:
							function = self.known_function[phage][prot][0]
							list_found.append(function)
					elif prot in self.unknown_function[phage].keys():
						temp_cluster.append(line[pos_i:pos_f])

	def create_blast_db(self):
		import os
		self._create_fasta(self.known_function, 'database_phages.fasta')
		os.system('makeblastdb -in files/database_phages.fasta -dbtype prot -title PhageProts -parse_seqids -out files/database_phages')
		self._create_fasta(self.unknown_function, 'unknown_phages.fasta')
		os.system('blastp -db files/database_phages -query files/unknown_phages.fasta -out files/test_blast -num_threads 2 -outfmt 6')

	def process_blastdb(self, blastdb):
		import pandas as pd
		blast_domains = pd.read_csv('files/' + blastdb, sep='\t', header=None)
		for phage in self.unknown_function.keys():
			for prot in self.unknown_function[phage]:
				evalue = []
				bitscore = []
				pred = blast_domains[blast_domains[0] == phage + '-' + prot]
				if pred.shape[0] == 0: break
				for i in pred[10]:
					evalue.append(float(i))
				for i in pred[11]:
					bitscore.append(float(i))
				if min(evalue) < 1.0 and max(bitscore) > 30.0:
					ind = evalue.index(min(evalue))
					if ind != bitscore.index(max(bitscore)):
						ind = bitscore.index(max(bitscore))
					temp = pred.iloc[ind,1]
					known_phage = temp[:temp.find('-')]
					known_prot = temp[temp.find('-')+1:]
					if self.known_function[known_phage][known_prot]:
						new_func = self.known_function[known_phage][known_prot][0]
					for j in self.known_function.keys():
						if pred.iloc[ind,1] in self.known_function[j].keys():
							new_func = self.known_function[j][pred.iloc[ind,1]][0]
							break
					self.phagesProteins[phage][prot][0] = new_func
		self.saveDomains()

	def extract_bact_location(self):
		import pandas as pd
		import ast
		import requests
		import re
		from pathlib import Path
		data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
		all_bact = []
		for i in data.index:
			for bact in ast.literal_eval(data.loc[i, 'Host_ID']):
				if bact[:-2] not in all_bact:
					all_bact.append(bact[:-2])
		fullURL = ('https://db.psort.org/downloads/precomputed?version=3.00')
		result = requests.get(fullURL)
		psort = result.text.strip()
		urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-z]+/\S+\"{1}', psort)
		i = 1
		while i < len(urls):
			temp = urls[i]
			bact = temp[temp.rfind('=') + 1:temp.find('"')]
			if bact not in all_bact:
				i += 3
			else:
				my_file = Path('files/psort/' + bact + ".faa.out")
				if not my_file.is_file():
					temp_url = urls[i+1].strip('"')
					r = requests.get(temp_url)
					with open('files/psort/' + bact + ".faa.out", 'wb') as f:
						f.write(r.content)
				i += 3

	def create_fasta_psort(self):
		from pathlib import Path
		import json
		import os
		for bact in os.listdir('files/bacteria'):
			my_file = Path('files/psort/' + bact[:-5] + '.faa.out')
			if not my_file.is_file():
				with open('files/bacteria/' + bact, encoding='utf-8') as F:
					bact_prots = json.loads(F.read())
				self._create_fasta(bact_prots, 'psort/' + bact[:-5] + '.fasta')
			os.system('./psortb -n -i /home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.fasta -r . -o long')
			os.listdir('./psortb')
			os.replace('', '/home/pedro-linux/OneDrive/UMinho/Cenas_de_tese_idk/test_tese_process/files/psort/' + bact[:-5] + '.faa.out')  # move and rename output
			os.remove('files/psort/' + bact[:-5] + '.fasta')

	def saveDomains(self):
		'''
		Saves the protdomain variable in a file.
		:return: SearchedDomains.json
		'''
		import json
		with open('files/phagesProteins.json', 'w') as f:
			json.dump(self.phagesProteins, f)
		# with open('files/phagesProteins.fasta', 'w') as F:
		# 	for phage in self.phagesProteins.keys():
		# 		for prot in self.phagesProteins[phage]:
		# 			F.write('>' + prot + '\n' + self.phagesProteins[phage][prot][1] + '\n')


if __name__ == '__main__':
	test = DomainSearch()

	test.extract_bact_location()
	test.create_fasta_psort()

	test.create_blast_db()
	test.process_blastdb('test_blast')

	test.cdHit()
	test.scanInterPro()
	test.processInterPro()

	test.fillDomainsBLAST()
	test.fillDomainsUniProt()
	test.saveDomains()