annotate ecoli_serotyping/predictionFunctions.py @ 9:148199949636 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 16:25:18 -0500
parents fe3ceb5c4214
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
1 #!/usr/bin/env python
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
2
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
3 import json
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
4 import logging
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
5 import os
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
6 from collections import defaultdict
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
7
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
8 import pandas as pd
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
9
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
10 LOG = logging.getLogger(__name__)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
11
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
12 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
13 Serotype prediction for E. coli
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
14 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
15
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def predict_serotype(blast_output_file, ectyper_dict_file, predictions_file, detailed=False):
    """Predict the serotype of every genome found in a blast output file

    The blast hits are joined with the allele dictionary, grouped by genome
    and handed to get_prediction one genome at a time. Two csv files are
    written through store_df: the annotated hits (next to predictions_file,
    with '_raw' inserted before the extension) and the predictions themselves.

    Args:
        blast_output_file(str):
            blastn output with outfmt:
                "6 qseqid qlen sseqid length pident sstart send sframe qcovhsp -word_size 11"
        ectyper_dict_file(str):
            mapping file used to associate allele ids with allele information
        predictions_file(str):
            csv file to store the result
        detailed(bool, optional):
            whether to add one column per gene to the report

    Returns:
        predictions_file
    """
    root, ext = os.path.splitext(predictions_file)
    raw_hits_file = root + '_raw' + ext

    LOG.info("Predicting serotype from blast output")
    hits = blast_output_to_df(blast_output_file)
    alleles = ectyper_dict_to_df(ectyper_dict_file)
    # Annotate every hit with the allele it matched (hits without a known
    # allele are kept, with empty annotation columns)
    hits = hits.merge(alleles, left_on='qseqid', right_on='name', how='left')
    # The genome name is the second '|'-separated field of the subject id
    hits['genome_name'] = hits['sseqid'].str.split('|').str[1]

    paired_genes = {'wzx':'wzy', 'wzy':'wzx', 'wzm':'wzt', 'wzt':'wzm'}
    report_columns = ['O_prediction', 'O_info', 'H_prediction', 'H_info']
    if detailed:
        # One extra column per gene for the detailed report
        report_columns += ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']

    per_genome = {}
    for genome, genome_hits in hits.groupby('genome_name'):
        per_genome[genome] = get_prediction(
            genome_hits, report_columns, paired_genes, detailed)

    # One row per genome, one column per report field
    report = pd.DataFrame(per_genome).T
    if report.empty:
        report = pd.DataFrame(columns=report_columns)
    report = report[report_columns]

    store_df(hits, raw_hits_file)
    store_df(report, predictions_file)
    LOG.info("Serotype prediction completed")
    return predictions_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
64
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def get_prediction(per_genome_df, predictions_columns, gene_pairs, detailed):
    """Make serotype prediction for a single genome based on blast output

    Only the best-scoring hit of every (gene, serotype) combination is kept,
    and the remaining hits are visited from the highest to the lowest score.
    For each antigen ('O', then 'H') the first acceptable hit becomes the
    prediction: a hit on a gene listed in gene_pairs is accepted only once
    its partner gene has already been seen with the same serotype, a hit on
    any other gene is accepted immediately. When nothing was accepted but
    exactly one gene with exactly one serotype was seen, that lone serotype
    is reported instead.

    Args:
        per_genome_df(DataFrame):
            blastn outputs for the given genome; must provide the columns
            'gene', 'serotype', 'score', 'name', 'desc', 'pident',
            'qcovhsp', 'qseqid' and 'sseqid'
        predictions_columns(list):
            columns to be filled by prediction function; every one of them
            is present in the result and defaults to None
        gene_pairs(dict):
            dict of pair genes used for paired logic
        detailed(bool):
            whether to flag (with True) every gene that has a hit
    Return:
        Prediction dictionary for the given genome
    """
    # Extract potential predictors
    useful_columns = [
        'gene', 'serotype', 'score', 'name', 'desc', 'pident', 'qcovhsp', 'qseqid', 'sseqid'
    ]
    # Sorting before dropping duplicates keeps the best score of each
    # (gene, serotype) combination
    per_genome_df = per_genome_df.sort_values(['gene', 'serotype', 'score'], ascending=False)
    per_genome_df = per_genome_df[~per_genome_df.duplicated(['gene', 'serotype'])]
    predictors_df = per_genome_df[useful_columns].sort_values('score', ascending=False)

    predictions = {column: None for column in predictions_columns}
    for predicting_antigen in ['O', 'H']:
        prediction_key = predicting_antigen + '_prediction'
        info_key = predicting_antigen + '_info'
        genes_pool = defaultdict(list)
        for _, row in predictors_df.iterrows():
            gene = row['gene']
            # Hits without a matching allele have no gene name to flag
            if detailed and not pd.isnull(gene):
                predictions[gene] = True
            if predictions.get(prediction_key):
                # Already predicted; keep looping only to flag genes
                continue
            serotype = row['serotype']
            # Missing (NaN/None) or empty serotypes cannot be indexed
            if pd.isnull(serotype) or not serotype:
                continue
            # Compare by value: identity of strings is an implementation detail
            if serotype[0] != predicting_antigen:
                continue
            genes_pool[gene].append(serotype)
            if gene in gene_pairs:
                predictions[info_key] = 'Only unpaired alignments found'
                # Pair gene logic: the partner gene must already have been
                # seen with the very same serotype
                potential_pairs = genes_pool.get(gene_pairs[gene])
                if potential_pairs is None or serotype not in potential_pairs:
                    continue
            predictions[info_key] = 'Alignment found'
            predictions[prediction_key] = serotype
        # No alignment found
        ## Make prediction based on non-paired gene
        ## if only one non-paired gene is available
        if predictions.get(prediction_key) is None and len(genes_pool) == 1:
            serotypes = list(genes_pool.values())[0]
            if len(serotypes) == 1:
                predictions[info_key] = 'Lone unpaired alignment found'
                predictions[prediction_key] = serotypes[0]
    return predictions
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
133
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def blast_output_to_df(blast_output_file):
    '''Convert raw blast output file to DataFrame

    Every non-blank line is split on whitespace and its first nine fields
    are stored, as strings, under the blast column names. A 'score' column
    equal to pident * qcovhsp / 10000 is added when at least one hit exists.

    Args:
        blast_output_file(str): location of blast output

    Returns:
        DataFrame:
            DataFrame that contains all information from blast output; when
            the file holds no hit the frame is empty and has the nine blast
            columns only (no 'score' column)

    Raises:
        IndexError: if a non-blank line has fewer than nine fields
    '''
    field_names = (
        'qseqid', 'qlen', 'sseqid', 'length', 'pident',
        'sstart', 'send', 'sframe', 'qcovhsp'
    )
    # Load blast output
    output_data = []
    with open(blast_output_file, 'r') as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no hit
                continue
            output_data.append(
                {field_name: fields[position]
                 for position, field_name in enumerate(field_names)})
    if not output_data:
        LOG.info("No hit found for this blast query")
        # Return empty dataframe with correct columns
        return pd.DataFrame(
            columns=[
                'length', 'pident', 'qcovhsp',
                'qlen', 'qseqid', 'send',
                'sframe', 'sseqid', 'sstart'
            ])
    df = pd.DataFrame(output_data)
    # Both factors are percentages, so the score lies between 0 and 1
    df['score'] = df['pident'].astype(float)*df['qcovhsp'].astype(float)/10000
    return df
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
172
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def ectyper_dict_to_df(ectyper_dict_file):
    '''Convert the ectyper allele dictionary (json) to a DataFrame

    The json document is read as a two-level mapping: each top-level value
    maps allele names to a record from which the 'allele', 'gene' and
    'desc' entries are read (missing entries become None).

    Args:
        ectyper_dict_file(str): location of the json allele dictionary

    Returns:
        DataFrame:
            one row per allele with the columns 'serotype' (the record's
            'allele' entry), 'name', 'gene' and 'desc'; the columns are
            present even when the dictionary holds no allele
    '''
    # Read ectyper dict
    with open(ectyper_dict_file) as fh:
        ectyper_dict = json.load(fh)
    temp_list = [
        {
            'serotype': allele.get('allele'),
            'name': name,
            'gene': allele.get('gene'),
            'desc': allele.get('desc')
        }
        for alleles in ectyper_dict.values()
        for name, allele in alleles.items()
    ]
    if not temp_list:
        # Keep the columns so that callers can still merge on 'name'
        return pd.DataFrame(columns=['serotype', 'name', 'gene', 'desc'])
    return pd.DataFrame(temp_list)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
189
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def store_df(src_df, dst_file):
    """Append dataframe to a file if it exists, otherwise, make a new file

    A destination that exists but is still empty (for instance an output
    file created in advance) is treated like a missing one, so that the
    header row is never left out of the csv.

    Args:
        src_df(DataFrame): dataframe object to be stored
        dst_file(str): dst_file to be modified/created
    """
    has_content = os.path.isfile(dst_file) and os.path.getsize(dst_file) > 0
    # The path is handed to pandas (instead of a text-mode handle) so that
    # pandas controls the line endings of the csv itself
    if has_content:
        src_df.to_csv(dst_file, mode='a', header=False)
    else:
        src_df.to_csv(dst_file, mode='w', header=True, index_label='genome')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
202
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def report_result(csv_file):
    '''Write the predictions stored in a csv file to the log

    Args:
        csv_file(str): location of the prediction file
    '''
    results = pd.read_csv(csv_file)
    if not results.empty:
        # Log the whole table (without the index column) below a line break
        table = results.to_string(index=False)
        LOG.info('\n{0}'.format(table))
    else:
        LOG.info('No prediction was made becuase no alignment was found')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
214
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
def add_non_predicted(all_genomes_list, predictions_file):
    '''Add genomes that do not show up in blast result to prediction file

    The prediction file is rewritten in place. Genomes missing from it get
    a row whose 'O_info' and 'H_info' read 'No alignment found'; every other
    empty cell of the file is replaced by '-'.

    Args:
        all_genomes_list(list):
            list of genomes from user input
        predictions_file(str):
            location of the prediction file
    Returns:
        str: location of the prediction file
    '''
    # Genome names are identifiers: read them as text so that numeric-looking
    # names are not turned into numbers, which would break the merge below
    df = pd.read_csv(predictions_file, dtype={'genome': str})
    all_genomes_df = pd.DataFrame(all_genomes_list, columns=['genome']).astype(str)
    # The outer merge keeps predicted genomes and adds the missing ones
    df = df.merge(all_genomes_df, on='genome', how='outer')
    df = df.fillna({'O_info':'No alignment found', 'H_info':'No alignment found'})
    df = df.fillna('-')
    df.to_csv(predictions_file, index=False)
    return predictions_file