Mercurial > repos > iuc > concoct
diff merge_cut_up_clustering.py @ 1:031f84cb2fd3 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
author | iuc |
---|---|
date | Sun, 13 Mar 2022 08:45:05 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_cut_up_clustering.py Sun Mar 13 08:45:05 2022 +0000 @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +import argparse +import re +import sys +from collections import Counter +from collections import defaultdict + + +CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)') + + +def original_contig_name_special(contig_id): + try: + original_id, part_index = CONTIG_PART_EXPR.match(contig_id).group(1, 2) + return original_id, part_index + except AttributeError: + # No matches for concoct_part regex. + return contig_id, 0 + + +parser = argparse.ArgumentParser() +parser.add_argument("--input", action="store", dest="input", help="Tabular file with cut up clusters") +parser.add_argument("--output", action="store", dest="output", help="Output file with merged clusters") + +args = parser.parse_args() + +# Get cut up clusters +all_seqs = {} +all_originals = defaultdict(dict) +with open(args.input, 'r') as ifh: + for i, line in enumerate(ifh): + if i == 0: + if 'contig_id' not in line: + sys.stderr.write("ERROR nvalid clustering file, 'contig_id' is not found in the header.") + sys.exit(-1) + # Skip header. + continue + line = line.rstrip('\r\n') + contig_id, cluster_id = line.split('\t') + original_contig_name, part_id = original_contig_name_special(contig_id) + all_originals[original_contig_name][part_id] = cluster_id + +# Merge cut up clusters. +with open(args.output, 'w') as ofh: + ofh.write("contig_id\tcluster_id\n") + for original_contig_id, part_ids_d in all_originals.items(): + if len(part_ids_d) > 1: + c = Counter(part_ids_d.values()) + cluster_id = c.most_common(1)[0][0] + c_string = [(a, b) for a, b in c.items()] + # Here if len(c.values()) > 1, + # then no cluster for contig. + else: + cluster_id = list(part_ids_d.values())[0] + ofh.write(f"{original_contig_id}\t{cluster_id}\n")