Mercurial > repos > iuc > vsnp_build_tables
diff vsnp_build_tables.py @ 11:6b3b0f5858e6 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit c38fd63f7980c70390d104a73ba4c72b266444c3
author | iuc |
---|---|
date | Fri, 10 Jun 2022 06:11:08 +0000 |
parents | 152716f90b84 |
children |
line wrap: on
line diff
--- a/vsnp_build_tables.py Mon Dec 06 18:28:04 2021 +0000 +++ b/vsnp_build_tables.py Fri Jun 10 06:11:08 2022 +0000 @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +import itertools import multiprocessing import os import queue @@ -153,11 +154,6 @@ return base_file_name -def output_cascade_table(cascade_order, mqdf, group, annotation_dict): - cascade_order_mq = pandas.concat([cascade_order, mqdf], join='inner') - output_table(cascade_order_mq, "cascade", group, annotation_dict) - - def output_excel(df, type_str, group, annotation_dict, count=None): # Output the temporary json file that # is used by the excel_formatter. @@ -183,19 +179,6 @@ excel_formatter(json_file_name, excel_file_name, group, annotation_dict) -def output_sort_table(cascade_order, mqdf, group, annotation_dict): - sort_df = cascade_order.T - sort_df['abs_value'] = sort_df.index - sort_df[['chrom', 'pos']] = sort_df['abs_value'].str.split(':', expand=True) - sort_df = sort_df.drop(['abs_value', 'chrom'], axis=1) - sort_df.pos = sort_df.pos.astype(int) - sort_df = sort_df.sort_values(by=['pos']) - sort_df = sort_df.drop(['pos'], axis=1) - sort_df = sort_df.T - sort_order_mq = pandas.concat([sort_df, mqdf], join='inner') - output_table(sort_order_mq, "sort", group, annotation_dict) - - def output_table(df, type_str, group, annotation_dict): if isinstance(group, str) and group.startswith("dataset"): # Inputs are single files, not collections, @@ -233,10 +216,6 @@ except queue.Empty: break newick_file, json_file, json_avg_mq_file = tup - avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split') - # Map quality to dataframe. - mqdf = avg_mq_series.to_frame(name='MQ') - mqdf = mqdf.T # Get the group. group = get_sample_name(newick_file) snps_df = pandas.read_json(json_file, orient='split') @@ -245,67 +224,53 @@ line = re.sub('[:,]', '\n', line) line = re.sub('[)(]', '', line) line = re.sub(r'[0-9].*\.[0-9].*\n', '', line) + line = re.sub("'", '', line) line = re.sub('root\n', '', line) sample_order = line.split('\n') - sample_order = list([_f for _f in sample_order if _f]) + sample_order = list(filter(None, sample_order)) sample_order.insert(0, 'root') - tree_order = snps_df.loc[sample_order] + tree_order_df = snps_df.loc[sample_order] + # Output the sorted table. + output_table(tree_order_df, "sort", group, annotation_dict) # Count number of SNPs in each column. snp_per_column = [] - for column_header in tree_order: + for column_header in tree_order_df: count = 0 - column = tree_order[column_header] + column = tree_order_df[column_header] for element in column: - if element != column[0]: + # column[0] is top row/root/reference, + # element is everything below it. + if element != column[0] and element != '-': count = count + 1 snp_per_column.append(count) - row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column") + row1 = pandas.Series(snp_per_column, tree_order_df.columns, name="snp_per_column") # Count number of SNPS from the # top of each column in the table. snp_from_top = [] - for column_header in tree_order: + for column_header in tree_order_df: count = 0 - column = tree_order[column_header] - # for each element in the column - # skip the first element - for element in column[1:]: - if element == column[0]: - count = count + 1 - else: - break - snp_from_top.append(count) - row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top") - tree_order = tree_order.append([row1]) - tree_order = tree_order.append([row2]) - # In pandas=0.18.1 even this does not work: - # abc = row1.to_frame() - # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18) - # tree_order.append(abc) - # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions" - tree_order = tree_order.T - tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False]) - tree_order = tree_order.T + column = tree_order_df[column_header] + index_list_of_ref_differences = [] + for ind, list_item in enumerate(column[1:].to_list()): + if list_item not in [column[0], '-']: + index_list_of_ref_differences.append(ind) + c = itertools.count() + val = max((list(g) for _, g in itertools.groupby(index_list_of_ref_differences, lambda x: x - next(c))), key=len) + # Starting row number with longest continous SNPs in column + snp_from_top.append(val[0]) + row2 = pandas.Series(snp_from_top, tree_order_df.columns, name="snp_from_top") + tree_order_df = tree_order_df.append([row1]) + tree_order_df = tree_order_df.append([row2]) + tree_order_df = tree_order_df.T + tree_order_df = tree_order_df.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False]) + tree_order_df = tree_order_df.T # Remove snp_per_column and snp_from_top rows. - cascade_order = tree_order[:-2] + cascade_order_df = tree_order_df[:-2] # Output the cascade table. - output_cascade_table(cascade_order, mqdf, group, annotation_dict) - # Output the sorted table. - output_sort_table(cascade_order, mqdf, group, annotation_dict) + output_table(cascade_order_df, "cascade", group, annotation_dict) task_queue.task_done() -def set_num_cpus(num_files, processes): - num_cpus = len(os.sched_getaffinity(0)) - if num_files < num_cpus and num_files < processes: - return num_files - if num_cpus < processes: - half_cpus = int(num_cpus / 2) - if num_files < half_cpus: - return num_files - return half_cpus - return processes - - if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -357,7 +322,6 @@ queue1 = multiprocessing.JoinableQueue() queue2 = multiprocessing.JoinableQueue() num_files = len(newick_files) - cpus = set_num_cpus(num_files, args.processes) # Set a timeout for get()s in the queue. timeout = 0.05 @@ -367,7 +331,7 @@ queue1.put((newick_file, json_file, json_avg_mq_file)) # Complete the preprocess_tables task. - processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(cpus)] + processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(args.processes)] for p in processes: p.start() for p in processes: