# HG changeset patch # User pedro_araujo # Date 1612093156 0 # Node ID ca8d2b919299d76d8ce6e4d0bb83f2a3971b7d55 # Parent d0d1dac3903ad39f31d022e8b1b00c1101ad3fd4 Deleted selected files diff -r d0d1dac3903a -r ca8d2b919299 process_raw_data.py --- a/process_raw_data.py Sun Jan 31 11:28:30 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,386 +0,0 @@ -""" -https://www.ncbi.nlm.nih.gov/Sequin/acc.html -https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly -""" - - -class PhageBacteriaData: - - def __init__(self, dataset=None): - """ - Imports a dataset from NCBI Virus, where the columns are Phage ID, Phage Name, Bacteria Name, Bacteria ID. - If a phage entry does not have a bacteria associated, it is deleted - :param dataset: - """ - import pandas as pd - import json - self.listBacID = [] - if dataset is None: - file = False - while not file: - try: - name = input('File name: ') - self.data = pd.read_csv('files/' + name, header=0, index_col=0) - file = True - except: - print('Couldn\'t find file') - else: - self.data = pd.read_csv('files/' + dataset, header=0, index_col=0) - self.data = self.data.dropna(how='any') - self.data = self.data[self.data['Host'] != 'unclassified bacterium'] - index_remove = [] - for i in range(len(self.data)): - if 'uncultured' in self.data.iloc[i]['Species']: - index_remove.append(i) - elif 'virus' not in self.data.iloc[i]['Species'] and 'phage' not in self.data.iloc[i]['Species']: - index_remove.append(i) - self.data = self.data.drop(self.data.index[index_remove]) - index_remove = [] - for i in range(len(self.data)): - temp = self.data['Host'][i].split(' ') - if len(temp) <= 1: - index_remove.append(i) - self.data = self.data.drop(self.data.index[index_remove]) - if 'Host_ID' not in self.data.columns: - temp = [] - for i in self.data.index: - temp.append([]) - self.data['Host_ID'] = temp - try: - with open('files/searched_accessions', 'r') as f: - self.searched = json.loads(f.read()) - except: - self.searched = {} - - def addPhageName(self): - """ - Using the entrez service, from NCBI, for each phage the name is added, from its features. - :return: - """ - from Bio import Entrez - from Bio import SeqIO - Entrez.email = "pedro_araujo97@hotmail.com" - listNames = [] - with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=self.data.index) as handle: - for seq_record in SeqIO.parse(handle, "gb"): - listNames.append(seq_record.annotations['organism']) - self.data['Species'] = listNames - - def addBacteriaName(self): - """ - Using the entrez service, from NCBI, for each phage the infecting bacteria name is added, from its features. - :return: - """ - from Bio import Entrez - from Bio import SeqIO - Entrez.email = "pedro_araujo97@hotmail.com" - for phage in self.data.index: - with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle: - seq_record = SeqIO.read(handle, "gb") - try: - if len(seq_record.features[0].qualifiers['host'][0].split(' ')) > 2: - self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['host'][0] - except: - if len(seq_record.features[0].qualifiers['lab_host'][0].split(' ')) > 2: - self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['lab_host'][0] - self.save_data() - - def addBacteriaGenome(self): - """ - For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, all associated IDs are extracted. - If the ID corresponds to a bacterial strain, its accession ID is added to the Bacteria ID column. - :return: - """ - from Bio import Entrez - import ast - import pickle - from pathlib import Path - Entrez.email = "pedro_araujo97@hotmail.com" - my_file = Path("files/searched_hosts") - if my_file.is_file(): - with open('files/searched_hosts', 'rb') as f: - list_done = pickle.load(f) - else: - list_done = [] - count = 0 - try: - for phageID in self.data.index: - if phageID in list_done: continue - listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID']) - try: - with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: - pubmed = Entrez.read(handle) - for i in pubmed[0]['LinkSetDb']: - if 'weighted' not in i["LinkName"]: - for link in i["Link"]: - try: - with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle: - genomes = Entrez.read(handle) - for i in genomes[0]['LinkSetDb']: - if 'weighted' not in i['LinkName']: - for id in i['Link']: - with Entrez.esummary(db='nucleotide', id=id['Id']) as handle: - bacorg = Entrez.read(handle) - if bacorg[0]['Caption'] != phageID and 'phage' not in bacorg[0][ - 'Title'].lower() \ - and bacorg[0][ - 'AccessionVersion'] not in listBactID and 'cds' not in \ - bacorg[0]['Title'].lower() and 'shotgun' not in bacorg[0][ - 'Title'].lower(): - if any(z in bacorg[0]['AccessionVersion'][:3] for z in - ['NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP']): - listBactID.append(bacorg[0]['AccessionVersion']) - self.searched[bacorg[0]['AccessionVersion']] = 'yes' - count += 1 - elif not any(z in bacorg[0]['AccessionVersion'][:3] for z in - ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', - 'NM_', 'KC', 'MH', 'AY', 'FN', 'AY']) \ - and 'complete' in bacorg[0]['Title'].lower(): - if bacorg[0]['AccessionVersion'] in self.searched.keys(): - add = self.searched[bacorg[0]['AccessionVersion']] - else: - add = input('Check ' + bacorg[0][ - 'AccessionVersion'] + '\nDo you wish to add it? (yes/no)') - if 'y' in add.lower(): - listBactID.append(bacorg[0]['AccessionVersion']) - self.searched[bacorg[0]['AccessionVersion']] = 'yes' - count += 1 - else: - self.searched[bacorg[0]['AccessionVersion']] = 'no' - except: - pass - except: - pass - self.data.loc[phageID, 'Host_ID'] = listBactID - list_done.append(phageID) - with open('files/searched_hosts', 'wb') as f: - pickle.dump(list_done, f) - self.save_data() - print(phageID) - except: - print('Bacterial host name missing. Searching from phage id') - pass - print('For future reference,', count, "new bacterial ID's were added.") - self.save_data() - - def checkAbstracts(self): - """ - For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, the abstracted is searched for mentions of bacterial strains. - If bacterial strains are found, its accession IDs are added to the Bacteria ID column. - :return: - """ - from Bio import Entrez - import re - import ast - Entrez.email = 'pedro_araujo97@hotmail.com' - count = 0 - for phageID in self.data.index: - if len(self.data.loc[phageID, 'Host'].split()) < 3: - with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: - pubmed = Entrez.read(handle) - listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID']) - for i in pubmed[0]['LinkSetDb']: - if 'weighted' not in i["LinkName"]: - for link in i["Link"]: - try: - with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", - id=link['Id']) as handle: - article = Entrez.read(handle) - abstract = \ - article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0] - x = re.findall('\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}', abstract) - for i in range(len(x)): - x[i] = x[i].strip(',;') - x = list(set(x)) - for i in x: - if 'ORF' in i: - x.remove(i) - for strain in x: - with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[ - phageID, 'Host'] + ' ' + strain + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]', - idtype="acc") as handle: - species = Entrez.read(handle) - strains = species['IdList'] - for i in strains: - if any(z in i for z in ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', - 'AP']) and i not in listBactID: - listBactID.append(i) - self.searched[i] = 'yes' - count += 1 - elif not any(z in i[:3] for z in - ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', - 'MH', 'AY', 'FN', 'AY']) and i not in listBactID: - if i in self.searched.keys(): - add = self.searched[i] - else: - add = input('Check ' + i + '\nDo you wish to add it? (yes/no)') - if 'y' in add.lower(): - listBactID.append(i) - self.searched[i] = 'yes' - count += 1 - else: - self.searched = 'no' - except: - pass - self.data.loc[phageID, 'Host_ID'] = listBactID - print('For future reference,', count, "new bacterial ID's were added.") - self.save_data() - - def saveAbstracts(self): - """ - For each phage, the associated scientific articles in pubmed are looked. - From theses pubmed articles, the abstracted is saved in a dictionary structure, where each phage ID is associated with a list of abstracts. - These abstracts can be used for later processing. - :return: - """ - from Bio import Entrez - import json - Entrez.email = 'pedro_araujo97@hotmail.com' - dicPhageAbstracts = {} - - for phageID in self.data.index: - print(phageID, end='\n') - dicPhageAbstracts[phageID] = [] - with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: - pubmed = Entrez.read(handle) - lista = [] - for i in pubmed[0]['LinkSetDb']: - if 'weighted' not in i["LinkName"]: - for link in i["Link"]: - try: - with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=link['Id']) as handle: - article = Entrez.read(handle) - abstract = \ - article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0] - dicPhageAbstracts[phageID].append([link['Id'], abstract]) - except: - pass - with open('files/phageAbstracts.json', 'w') as f: - json.dump(dicPhageAbstracts, f) - - def searchBacName(self): - from Bio import Entrez - import ast - Entrez.email = "pedro_araujo97@hotmail.com" - count = 0 - for phageID in self.data.index: - if len(self.data.loc[phageID, 'Host'].split()) > 2: - listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID']) - with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[ - phageID, 'Host'] + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]', - idtype="acc") as handle: - species = Entrez.read(handle) - strains = species['IdList'] - for j in strains: - if any(z in j[:3] for z in - ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', 'AP']) and j not in listBactID: - listBactID.append(j) - self.searched[j] = 'yes' - count += 1 - elif not any(z in j[:3] for z in - ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN', - 'AY']) and j not in listBactID: - if j in self.searched.keys(): - add = self.searched[j] - else: - add = input('Check ' + j + '\nDo you wish to add it? (yes/no)') - if 'y' in add.lower(): - listBactID.append(j) - self.searched[j] = 'yes' - count += 1 - else: - self.searched[j] = 'no' - self.data.loc[phageID, 'Host_ID'] = listBactID - print('For future reference,', count, "new bacterial ID's were added.") - self.save_data() - - def createListBacID(self, lower=0, upper=100): - """ - More sequential than previous methods. Maybe include every single one... - :param lower: lower index from the phage list (numeric) - :param upper: upper index from the phage list (numeric) - :return: - """ - from Bio import Entrez - Entrez.email = 'pedro_araujo97@hotmail.com' - for i in range(lower, upper): - phageID = self.data.index[i] - BactID = [] - name = test.data.loc[phageID]['Bacteria Name'] - try: - if name != 'unclassified bacterium' and not name != name: # Verificação de hosts válidos - with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: - pubmed = Entrez.read(handle) - for link in pubmed[0]["LinkSetDb"][0]["Link"]: - try: - with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle: - genomes = Entrez.read(handle) - for id in genomes[0]['LinkSetDb'][0]['Link']: - with Entrez.esummary(db='nucleotide', id=id['Id']) as handle: - bacorg = Entrez.read(handle) - if 'NC_' in bacorg[0]['AccessionVersion'] or 'NZ_' in bacorg[0]['AccessionVersion']: - if bacorg[0]['Caption'] != phageID: - BactID.append(bacorg[0]['AccessionVersion']) - except: - pass - else: - pass - except: - pass - self.listBacID.append(BactID) - - def check_bacteria(self): - from Bio import Entrez - from Bio import SeqIO - import ast - Entrez.email = "pedro_araujo97@hotmail.com" - all_bact = [] - for i in self.data.index: - for bact in ast.literal_eval(self.data.loc[i, 'Host_ID']): - if bact[:-2] not in all_bact: - all_bact.append(bact[:-2]) - list_remove = [] - for bact in all_bact: - if bact not in list_remove: - try: - with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle: - seq_record = SeqIO.read(handle, "gb") - if not any(i in seq_record.description.lower() for i in ['pneumoniae', 'coli', 'baumannii']) or not any(i in seq_record.description.lower() for i in ['escherichia', 'acinetobacter', 'klebsiella']) \ - or 'phage' in seq_record.description.lower() or 'virus' in seq_record.description.lower(): - list_remove.append(bact) - except: - list_remove.append(bact) - print(list_remove) - for phage in self.data.index: - listBactID = ast.literal_eval(self.data.loc[phage, 'Host_ID']) - for bact in listBactID: - if bact[:-2] in list_remove: - listBactID.remove(bact) - self.data.loc[phage, 'Host_ID'] = listBactID - self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv') - - def save_data(self): - """ - Saves the data in csv format. - :return: - """ - import json - self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv') - with open('files/searched_accessions', 'w') as f: - json.dump(self.searched, f) - - -if __name__ == '__main__': - test = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv') # sequences - test.addBacteriaName() - test.addBacteriaGenome() - test.searchBacName() # 2266 bacteria added - test.checkAbstracts() - # test.data = test.data.drop(columns=['Bacteria ID']) - test.searchBacName() - # test.createListBacID(0, 100) - # test.data = test.data.iloc[:, 0:3] - test.check_bacteria() - test.save_data() -# test.extractProtein() -# test.importProtein('Phage')