comparison vsnp_build_tables.py @ 0:0ad85e7db2fc draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 6a0c9a857c1f4638ef18e106b1f8c0681303acc5"
author iuc
date Sun, 27 Sep 2020 10:07:44 +0000
parents
children aed013f6b13b
comparison
equal deleted inserted replaced
-1:000000000000 0:0ad85e7db2fc
1 #!/usr/bin/env python
2
3 import argparse
4 import multiprocessing
5 import os
6 import pandas
7 import queue
8 import pandas.io.formats.excel
9 import re
10 from Bio import SeqIO
11
12 INPUT_JSON_AVG_MQ_DIR = 'input_json_avg_mq_dir'
13 INPUT_JSON_DIR = 'input_json_dir'
14 INPUT_NEWICK_DIR = 'input_newick_dir'
15 # Maximum columns allowed in a LibreOffice
16 # spreadsheet is 1024. Excel allows for
17 # 16,384 columns, but we'll set the lower
18 # number as the maximum. Some browsers
19 # (e.g., Firefox on Linux) are configured
20 # to use LibreOffice for Excel spreadsheets.
21 MAXCOLS = 1024
22 OUTPUT_EXCEL_DIR = 'output_excel_dir'
23
24
def annotate_table(table_df, group, annotation_dict):
    """Add an 'annotations' row describing the feature at each SNP position.

    Args:
        table_df: SNP table whose column labels have the form
            "chrom:position".
        group: Group name.  Not used by this implementation; kept so
            that existing callers keep working.
        annotation_dict: Maps a chromosome name to a data frame that has
            'chrom', 'locus', 'product' and 'gene' columns and whose index
            is an IntervalIndex of feature coordinates.

    Returns:
        The table with one extra row labelled 'annotations'.  Because the
        annotations are attached with an inner merge, only columns whose
        chromosome is a key of annotation_dict are kept.
    """
    wanted = ['chrom', 'locus', 'product', 'gene']
    # Split every "chrom:position" column label once, up front; the
    # result does not depend on the chromosome being processed.
    ref_series = pandas.Series(list(table_df))
    ref_df = pandas.DataFrame(ref_series.str.split(':', expand=True).values, columns=['reference', 'position'])
    # The annotations are collected in memory.  No scratch file is used,
    # so stale rows left behind by an earlier failed run cannot leak in.
    labels = []
    annotations = []
    for gbk_chrome, pro in annotation_dict.items():
        positions = ref_df.loc[ref_df['reference'] == gbk_chrome, 'position']
        for pos in positions:
            try:
                hit = pro.iloc[pro.index.get_loc(int(pos))][wanted]
            except KeyError:
                # No feature interval contains this position.
                labels.append("{}:{}".format(gbk_chrome, pos))
                annotations.append("No annotated product")
                continue
            values = hit.values
            if values.ndim > 1:
                # Several features contain the position; use the first.
                values = values[0]
            # A unique match yields a 1-D array that is unpacked directly.
            # Indexing it with [0] first would unpack the characters of
            # the chromosome name instead, which silently "works" when
            # that name happens to be four characters long.
            # NOTE: the local names follow the historical unpacking, so
            # 'name' holds the locus, 'locus' the product and 'tag' the
            # gene; the text emitted is "product, gene, locus".
            chrom, name, locus, tag = values
            labels.append("{}:{}".format(chrom, pos))
            annotations.append("{}, {}, {}".format(locus, tag, name))
    annotations_df = pandas.DataFrame({'annotations': annotations}, index=pandas.Index(labels, name='index'))
    # Attach the annotations as a column of the transposed table, then
    # transpose back so that they become the last row.
    table_df_transposed = table_df.T
    table_df_transposed.index = table_df_transposed.index.rename('index')
    table_df_transposed = table_df_transposed.merge(annotations_df, left_index=True, right_index=True)
    return table_df_transposed.T
61
62
def excel_formatter(json_file_name, excel_file_name, group, annotation_dict):
    """Write the SNP table stored in a json file as a colour-coded workbook.

    Args:
        json_file_name: Path of a json file (orient='split') holding the
            table.
        excel_file_name: Path of the .xlsx file to create.
        group: Group name, forwarded to annotate_table.
        annotation_dict: Annotation data frames keyed by chromosome, or
            None when no annotations are available.
    """
    # NOTE(review): presumably meant to switch off pandas' default header
    # styling; this only has an effect on pandas versions that read a
    # module-level header_style -- confirm against the pinned version.
    pandas.io.formats.excel.header_style = None
    table_df = pandas.read_json(json_file_name, orient='split')
    if annotation_dict is not None:
        table_df = annotate_table(table_df, group, annotation_dict)
    else:
        # Add an empty placeholder row so the sheet layout is the same
        # with and without annotations.  pandas.concat is used because
        # DataFrame.append was removed in pandas 2.0.
        placeholder = pandas.DataFrame(index=['no annotations'], columns=table_df.columns)
        table_df = pandas.concat([table_df, placeholder])
    # The context manager saves and closes the workbook, even if one of
    # the formatting calls raises (ExcelWriter.save was removed in
    # pandas 2.0).
    with pandas.ExcelWriter(excel_file_name, engine='xlsxwriter') as writer:
        table_df.to_excel(writer, sheet_name='Sheet1')
        writer_book = writer.book
        ws = writer.sheets['Sheet1']
        format_a = writer_book.add_format({'bg_color': '#58FA82'})
        format_g = writer_book.add_format({'bg_color': '#F7FE2E'})
        format_c = writer_book.add_format({'bg_color': '#0000FF'})
        format_t = writer_book.add_format({'bg_color': '#FF0000'})
        format_normal = writer_book.add_format({'bg_color': '#FDFEFE'})
        formatlowqual = writer_book.add_format({'font_color': '#C70039', 'bg_color': '#E2CFDD'})
        format_ambigous = writer_book.add_format({'font_color': '#C70039', 'bg_color': '#E2CFDD'})
        format_n = writer_book.add_format({'bg_color': '#E2CFDD'})
        format_annotation = writer_book.add_format({'font_color': '#0A028C', 'rotation': '-90', 'align': 'top'})
        rows, cols = table_df.shape
        ws.set_column(0, 0, 30)
        ws.set_column(1, cols, 2.1)
        ws.freeze_panes(2, 1)
        # NOTE(review): the second argument of set_row is a row height, so
        # this sets the height of the row below the table to cols + 1.
        # That looks unintended, but it is kept to leave the generated
        # workbook unchanged -- confirm before removing.
        ws.set_row(rows + 1, cols + 1, format_annotation)
        # Make sure that row/column locations don't overlap.
        # Values below 55 are highlighted (presumably the MQ row).
        ws.conditional_format(rows - 2, 1, rows - 1, cols, {'type': 'cell', 'criteria': '<', 'value': 55, 'format': formatlowqual})
        # Cells equal to the value in sheet row 2 of their column get the
        # neutral format.
        ws.conditional_format(2, 1, rows - 2, cols, {'type': 'cell', 'criteria': '==', 'value': 'B$2', 'format': format_normal})
        # The remaining rules are registered in this exact order.
        text_rules = [
            ('A', format_a),
            ('G', format_g),
            ('C', format_c),
            ('T', format_t),
            ('S', format_ambigous),
            ('Y', format_ambigous),
            ('R', format_ambigous),
            ('W', format_ambigous),
            ('K', format_ambigous),
            ('M', format_ambigous),
            ('N', format_n),
            ('-', format_n),
        ]
        for text, cell_format in text_rules:
            ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': text, 'format': cell_format})
        # Rewrite the header row with rotated column names.
        format_rotation = writer_book.add_format({})
        format_rotation.set_rotation(90)
        for column_num, column_name in enumerate(table_df.columns):
            ws.write(0, column_num + 1, column_name, format_rotation)
        # Set last row (the annotations): tall enough for rotated text.
        ws.set_row(rows, 400, format_annotation)
112
113
def get_annotation_dict(gbk_file):
    """Build one feature table per chromosome from a GenBank file.

    Args:
        gbk_file: Path of the GenBank file to parse.

    Returns:
        A dict mapping each record id to a data frame with the columns
        chrom, start, stop, locus, product and gene, indexed by the
        closed interval [start, stop] of each feature.
    """
    def first_qualifier(feature, key):
        # First value of a qualifier, or None when it is absent.
        try:
            return feature.qualifiers[key][0]
        except KeyError:
            return None

    records = SeqIO.to_dict(SeqIO.parse(gbk_file, "genbank"))
    features_path = "features.csv"
    column_names = ["chrom", "start", "stop", "locus", "product", "gene"]
    annotations = {}
    for chromosome, record in list(records.items()):
        # Dump the CDS and rRNA features of this chromosome to a
        # scratch file; the file is rewritten for every chromosome.
        with open(features_path, 'w+') as out:
            for feature in record.features:
                if not ("CDS" in feature.type or "rRNA" in feature.type):
                    continue
                product = first_qualifier(feature, 'product')
                locus = first_qualifier(feature, 'locus_tag')
                gene = first_qualifier(feature, 'gene')
                out.write("%s\t%d\t%d\t%s\t%s\t%s\n" % (chromosome, int(feature.location.start), int(feature.location.end), locus, product, gene))
        # Load the scratch file back as a data frame.
        features = pandas.read_csv(features_path, sep='\t', names=column_names)
        # Keep a single feature per start coordinate.
        features = features.sort_values(['start', 'gene'], ascending=[True, False])
        features = features.drop_duplicates('start').reset_index(drop=True)
        # Index by coordinate interval so a position can be looked up.
        features.index = pandas.IntervalIndex.from_arrays(features['start'], features['stop'], closed='both')
        annotations[chromosome] = features
    return annotations
145
146
def get_base_file_name(file_path):
    """Return the file name of file_path without its extension.

    When the name has no usable dot, the part after the last underscore
    is treated as the extension instead.
    """
    name = os.path.basename(file_path)
    if name.find(".") > 0:
        # Regular case: strip the dot extension.
        stem, _extension = os.path.splitext(name)
        return stem
    if name.find("_") > 0:
        # The dot of the extension was likely replaced by an
        # underscore, so drop everything after the last one.
        return name.rsplit("_", 1)[0]
    return name
159
160
def output_cascade_table(cascade_order, mqdf, group, annotation_dict):
    """Stack the MQ row under the cascade-ordered table and output it."""
    frames = [cascade_order, mqdf]
    # join='inner' keeps only the columns present in both frames.
    output_table(pandas.concat(frames, join='inner'), "cascade", group, annotation_dict)
164
165
def output_excel(df, type_str, group, annotation_dict, count=None):
    """Write df to a temporary json file and render it as an Excel file.

    The file names are derived from type_str, prefixed with group when
    it is not None and suffixed with count when it is not None.
    """
    has_group = group is not None
    has_count = count is not None
    if has_group and has_count:
        json_file_name = "%s_%s_order_mq_%d.json" % (group, type_str, count)
        excel_base_name = "%s_%s_table_%d.xlsx" % (group, type_str, count)
    elif has_group:
        json_file_name = "%s_%s_order_mq.json" % (group, type_str)
        excel_base_name = "%s_%s_table.xlsx" % (group, type_str)
    elif has_count:
        json_file_name = "%s_order_mq_%d.json" % (type_str, count)
        excel_base_name = "%s_table_%d.xlsx" % (type_str, count)
    else:
        json_file_name = "%s_order_mq.json" % type_str
        excel_base_name = "%s_table.xlsx" % type_str
    excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, excel_base_name)
    # The json file is the hand-off format read by excel_formatter.
    df.to_json(json_file_name, orient='split')
    excel_formatter(json_file_name, excel_file_name, group, annotation_dict)
186
187
def output_sort_table(cascade_order, mqdf, group, annotation_dict):
    """Output the table with its columns ordered by numeric position.

    The column labels are expected to look like "chrom:position"; only
    the position part takes part in the ordering.
    """
    by_position = cascade_order.T
    # Expose the "chrom:position" labels as a column so they can be split.
    by_position['abs_value'] = by_position.index
    by_position[['chrom', 'pos']] = by_position['abs_value'].str.split(':', expand=True)
    by_position = by_position.drop(['abs_value', 'chrom'], axis=1)
    # Sort numerically rather than as text.
    by_position['pos'] = by_position['pos'].astype(int)
    by_position = by_position.sort_values(by=['pos'])
    # Drop the helper column and restore the original orientation.
    position_sorted = by_position.drop(['pos'], axis=1).T
    frames = [position_sorted, mqdf]
    output_table(pandas.concat(frames, join='inner'), "sort", group, annotation_dict)
199
200
def output_table(df, type_str, group, annotation_dict):
    """Output df as one Excel file, or several when it has too many columns.

    Args:
        df: Table to output.
        type_str: Table type used in the output file names.
        group: Group name used in the output file names.  A name that
            starts with "dataset" is not used for naming.
        annotation_dict: Annotation data frames keyed by chromosome, or
            None.
    """
    if isinstance(group, str) and group.startswith("dataset"):
        # Inputs are single files, not collections,
        # so input file names are not useful for naming
        # output files.
        group_str = None
    else:
        group_str = group
    column_count = df.shape[1]
    if column_count < MAXCOLS:
        output_excel(df, type_str, group_str, annotation_dict)
        return
    # Here the number of columns reaches the maximum allowed, so
    # multiple numbered outputs of at most MAXCOLS columns each are
    # produced.  Iterating over the chunk starts means that no empty
    # trailing table is emitted when the column count is an exact
    # multiple of MAXCOLS.
    for count, chunk_start in enumerate(range(0, column_count, MAXCOLS), start=1):
        df_of_type = df.iloc[:, chunk_start:chunk_start + MAXCOLS]
        output_excel(df_of_type, type_str, group_str, annotation_dict, count=count)
229
230
def preprocess_tables(task_queue, annotation_dict, timeout):
    """Worker loop: build the cascade and sort tables for queued inputs.

    Args:
        task_queue: Joinable queue of (newick_file, json_file,
            json_avg_mq_file) tuples.
        annotation_dict: Annotation data frames keyed by chromosome, or
            None.
        timeout: Seconds to wait for a task before the worker stops.
    """
    while True:
        try:
            tup = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            break
        newick_file, json_file, json_avg_mq_file = tup
        avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split')
        # Map quality to dataframe.
        mqdf = avg_mq_series.to_frame(name='MQ').T
        # Get the group.
        group = get_base_file_name(newick_file)
        snps_df = pandas.read_json(json_file, orient='split')
        # Reduce the newick text to one sample name per line by
        # dropping the tree punctuation and the branch lengths.  Only
        # the last line read from the file is used below.
        with open(newick_file, 'r') as fh:
            for line in fh:
                line = re.sub('[:,]', '\n', line)
                line = re.sub('[)(]', '', line)
                line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
                line = re.sub('root\n', '', line)
        sample_order = [name for name in line.split('\n') if name]
        sample_order.insert(0, 'root')
        tree_order = snps_df.loc[sample_order]
        # Count number of SNPs in each column, i.e. the cells that
        # differ from the first row.  The first row is read with .iloc
        # because the rows are labelled with sample names; integer keys
        # on such a Series are deprecated in pandas.
        snp_per_column = []
        for column_header in tree_order:
            column = tree_order[column_header]
            reference = column.iloc[0]
            snp_per_column.append(sum(1 for element in column if element != reference))
        row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column")
        # Count, from the top of each column, how many cells below the
        # first row match it before the first SNP.
        snp_from_top = []
        for column_header in tree_order:
            column = tree_order[column_header]
            reference = column.iloc[0]
            count = 0
            for element in column.iloc[1:]:
                if element != reference:
                    break
                count += 1
            snp_from_top.append(count)
        row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top")
        # Attach both counts as temporary rows so the columns can be
        # sorted by them.  pandas.concat is used because
        # DataFrame.append was removed in pandas 2.0.
        tree_order = pandas.concat([tree_order, row1.to_frame().T, row2.to_frame().T])
        tree_order = tree_order.T
        tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False])
        tree_order = tree_order.T
        # Remove snp_per_column and snp_from_top rows.
        cascade_order = tree_order[:-2]
        # Output the cascade table.
        output_cascade_table(cascade_order, mqdf, group, annotation_dict)
        # Output the sorted table.
        output_sort_table(cascade_order, mqdf, group, annotation_dict)
        task_queue.task_done()
297
298
def set_num_cpus(num_files, processes):
    """Return the number of worker processes to start.

    Args:
        num_files: Number of inputs to process.
        processes: Number of processes requested by the user.

    Returns:
        num_files when it is smaller than both the CPU count and the
        requested number of processes; otherwise the requested number
        of processes, reduced to half of the CPUs (capped by num_files)
        when more processes than CPUs were requested.  Apart from the
        num_files == 0 case the result is never below 1, because the
        caller would wait forever on its queue with zero workers.
    """
    num_cpus = multiprocessing.cpu_count()
    if num_files < num_cpus and num_files < processes:
        return num_files
    if num_cpus < processes:
        # Half of a single CPU rounds down to 0, so clamp it to 1.
        half_cpus = max(1, num_cpus // 2)
        return min(num_files, half_cpus)
    return max(1, processes)
309
310
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', required=False, default=None, help='Average MQ json file')
    parser.add_argument('--input_newick', action='store', dest='input_newick', required=False, default=None, help='Newick file')
    parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', required=False, default=None, help='SNPs json file')
    parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file')
    # The number of processes is needed to size the worker pool, so a
    # missing value is reported by argparse instead of failing later
    # with a TypeError.
    parser.add_argument('--processes', action='store', dest='processes', type=int, required=True, help='User-selected number of processes to use for job splitting')

    args = parser.parse_args()

    if args.gbk_file is not None:
        # Create the annotation_dict for annotating
        # the Excel tables.
        annotation_dict = get_annotation_dict(args.gbk_file)
    else:
        annotation_dict = None

    # The assumption here is that the list of files
    # in both INPUT_NEWICK_DIR and INPUT_JSON_DIR are
    # named such that they are properly matched if
    # the directories contain more than 1 file (i.e.,
    # hopefully the newick file names and json file names
    # will be something like Mbovis-01D6_* so they can be
    # sorted and properly associated with each other).
    if args.input_newick is not None:
        newick_files = [args.input_newick]
    else:
        newick_files = [os.path.abspath(os.path.join(INPUT_NEWICK_DIR, file_name)) for file_name in sorted(os.listdir(INPUT_NEWICK_DIR))]
    if args.input_snps_json is not None:
        json_files = [args.input_snps_json]
    else:
        json_files = [os.path.abspath(os.path.join(INPUT_JSON_DIR, file_name)) for file_name in sorted(os.listdir(INPUT_JSON_DIR))]
    if args.input_avg_mq_json is not None:
        json_avg_mq_files = [args.input_avg_mq_json]
    else:
        json_avg_mq_files = [os.path.abspath(os.path.join(INPUT_JSON_AVG_MQ_DIR, file_name)) for file_name in sorted(os.listdir(INPUT_JSON_AVG_MQ_DIR))]

    num_files = len(newick_files)
    # Every newick file is paired by position with a json file from
    # each of the other two lists, so fail before any work is queued
    # when one of those lists is too short.
    if len(json_files) < num_files or len(json_avg_mq_files) < num_files:
        raise SystemExit("Found %d newick files but only %d SNPs json files and %d average MQ json files" % (num_files, len(json_files), len(json_avg_mq_files)))

    multiprocessing.set_start_method('spawn')
    queue1 = multiprocessing.JoinableQueue()
    cpus = set_num_cpus(num_files, args.processes)
    if num_files > 0 and cpus < 1:
        # Without at least one worker the queued tasks would never be
        # marked as done and the join below would block forever.
        cpus = 1
    # Set a timeout for get()s in the queue.
    timeout = 0.05

    for i, newick_file in enumerate(newick_files):
        queue1.put((newick_file, json_files[i], json_avg_mq_files[i]))

    # Complete the preprocess_tables task.
    processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(cpus)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # A worker that died never marks its task as done, so joining the
    # queue would hang; report the failure instead.
    failed = [p.name for p in processes if p.exitcode != 0]
    if failed:
        raise SystemExit("Worker process(es) failed: %s" % ", ".join(failed))
    queue1.join()

    if queue1.empty():
        queue1.close()
        queue1.join_thread()