comparison run_galaxy.py @ 1:f8dee15a72a4 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 14:52:31 +0000
parents e4b3fc88efe0
children
comparison
equal deleted inserted replaced
0:e4b3fc88efe0 1:f8dee15a72a4
4 4
5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False): 5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False):
6 import pickle 6 import pickle
7 import os 7 import os
8 import re 8 import re
9 with open('files/FeatureDataset', 'rb') as f: 9 with open('files/feature_dataset', 'rb') as f:
10 dataset = pickle.load(f) 10 dataset = pickle.load(f)
11 self.all_phages = [] 11 self.all_phages = []
12 self.all_bacteria = [] 12 self.all_bacteria = []
13 for ID in dataset.index: 13 for ID in dataset.index:
14 temp_phage = ID[:ID.find('--')] 14 temp_phage = ID[:ID.find('--')]
94 else: 94 else:
95 from Bio import Entrez 95 from Bio import Entrez
96 from Bio import SeqIO 96 from Bio import SeqIO
97 phage = {} 97 phage = {}
98 Entrez.email = 'insert@email.com' 98 Entrez.email = 'insert@email.com'
99 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle: 99 try:
100 genome = SeqIO.read(handle, "gb") 100 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle:
101 for feat in genome.features: 101 genome = SeqIO.read(handle, "gb")
102 if feat.type == 'CDS': 102 for feat in genome.features:
103 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] 103 if feat.type == 'CDS':
104 except: pass 104 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]]
105 except: pass
106 except:
107 print(ID, 'not found in GenBank')
105 return temp_phage 108 return temp_phage
106 109
107 def _retrieve_from_bact_id(self, bacteria): 110 def _retrieve_from_bact_id(self, bacteria):
108 temp_bacteria = {} 111 temp_bacteria = {}
109 for ID in bacteria: 112 for ID in bacteria:
110 temp_bacteria[ID] = {} 113 temp_bacteria[ID] = {}
111 if '.' in ID: 114 if '.' in ID:
112 ID = ID[:ID.find('.')] 115 ID = ID[:ID.find('.')]
113 if ID in self.all_bacteria: 116 #if ID in self.all_bacteria:
114 import json 117 # import json
115 with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f: 118 # with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f:
116 temp_bacteria[ID] = json.loads(f.read()) 119 # temp_bacteria[ID] = json.loads(f.read())
117 else: 120 #else:
118 from Bio import Entrez 121 from Bio import Entrez
119 from Bio import SeqIO 122 from Bio import SeqIO
120 bacteria = {} 123 bacteria = {}
121 Entrez.email = 'insert@email.com' 124 Entrez.email = 'insert@email.com'
125 try:
122 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle: 126 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle:
123 genome = SeqIO.read(handle, "gb") 127 genome = SeqIO.read(handle, "gb")
124 for feat in genome.features: 128 for feat in genome.features:
125 if feat.type == 'CDS': 129 if feat.type == 'CDS':
126 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] 130 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]]
152 else: 156 else:
153 protSeq += genome[k].strip() 157 protSeq += genome[k].strip()
154 else: 158 else:
155 j += 1 159 j += 1
156 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]] 160 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]]
161 except:
162 print(ID, 'not found in GenBank')
157 return temp_bacteria 163 return temp_bacteria
158 164
159 def _find_phage_functions(self, phage_dict, run_interpro): 165 def _find_phage_functions(self, phage_dict, run_interpro):
160 import os 166 import os
161 import json 167 import json
361 run_interpro = True 367 run_interpro = True
362 else: 368 else:
363 run_interpro = False 369 run_interpro = False
364 model = sys.argv[6] 370 model = sys.argv[6]
365 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro) 371 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro)
366 # rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214') 372 #rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214')
367 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM') 373 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM')