Mercurial > repos > pedro_araujo > phage_host_prediction
comparison run_galaxy.py @ 1:f8dee15a72a4 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 14:52:31 +0000 |
parents | e4b3fc88efe0 |
children |
comparison legend:
- equal
- deleted
- inserted
- replaced
0:e4b3fc88efe0 | 1:f8dee15a72a4 |
---|---|
4 | 4 |
5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False): | 5 def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False): |
6 import pickle | 6 import pickle |
7 import os | 7 import os |
8 import re | 8 import re |
9 with open('files/FeatureDataset', 'rb') as f: | 9 with open('files/feature_dataset', 'rb') as f: |
10 dataset = pickle.load(f) | 10 dataset = pickle.load(f) |
11 self.all_phages = [] | 11 self.all_phages = [] |
12 self.all_bacteria = [] | 12 self.all_bacteria = [] |
13 for ID in dataset.index: | 13 for ID in dataset.index: |
14 temp_phage = ID[:ID.find('--')] | 14 temp_phage = ID[:ID.find('--')] |
94 else: | 94 else: |
95 from Bio import Entrez | 95 from Bio import Entrez |
96 from Bio import SeqIO | 96 from Bio import SeqIO |
97 phage = {} | 97 phage = {} |
98 Entrez.email = 'insert@email.com' | 98 Entrez.email = 'insert@email.com' |
99 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle: | 99 try: |
100 genome = SeqIO.read(handle, "gb") | 100 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle: |
101 for feat in genome.features: | 101 genome = SeqIO.read(handle, "gb") |
102 if feat.type == 'CDS': | 102 for feat in genome.features: |
103 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | 103 if feat.type == 'CDS': |
104 except: pass | 104 try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] |
105 except: pass | |
106 except: | |
107 print(ID, 'not found in GenBank') | |
105 return temp_phage | 108 return temp_phage |
106 | 109 |
107 def _retrieve_from_bact_id(self, bacteria): | 110 def _retrieve_from_bact_id(self, bacteria): |
108 temp_bacteria = {} | 111 temp_bacteria = {} |
109 for ID in bacteria: | 112 for ID in bacteria: |
110 temp_bacteria[ID] = {} | 113 temp_bacteria[ID] = {} |
111 if '.' in ID: | 114 if '.' in ID: |
112 ID = ID[:ID.find('.')] | 115 ID = ID[:ID.find('.')] |
113 if ID in self.all_bacteria: | 116 #if ID in self.all_bacteria: |
114 import json | 117 # import json |
115 with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f: | 118 # with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f: |
116 temp_bacteria[ID] = json.loads(f.read()) | 119 # temp_bacteria[ID] = json.loads(f.read()) |
117 else: | 120 #else: |
118 from Bio import Entrez | 121 from Bio import Entrez |
119 from Bio import SeqIO | 122 from Bio import SeqIO |
120 bacteria = {} | 123 bacteria = {} |
121 Entrez.email = 'insert@email.com' | 124 Entrez.email = 'insert@email.com' |
125 try: | |
122 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle: | 126 with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle: |
123 genome = SeqIO.read(handle, "gb") | 127 genome = SeqIO.read(handle, "gb") |
124 for feat in genome.features: | 128 for feat in genome.features: |
125 if feat.type == 'CDS': | 129 if feat.type == 'CDS': |
126 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] | 130 try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]] |
152 else: | 156 else: |
153 protSeq += genome[k].strip() | 157 protSeq += genome[k].strip() |
154 else: | 158 else: |
155 j += 1 | 159 j += 1 |
156 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]] | 160 temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]] |
161 except: | |
162 print(ID, 'not found in GenBank') | |
157 return temp_bacteria | 163 return temp_bacteria |
158 | 164 |
159 def _find_phage_functions(self, phage_dict, run_interpro): | 165 def _find_phage_functions(self, phage_dict, run_interpro): |
160 import os | 166 import os |
161 import json | 167 import json |
361 run_interpro = True | 367 run_interpro = True |
362 else: | 368 else: |
363 run_interpro = False | 369 run_interpro = False |
364 model = sys.argv[6] | 370 model = sys.argv[6] |
365 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro) | 371 GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro) |
366 # rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214') | 372 #rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214') |
367 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM') | 373 # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM') |