Mercurial > repos > iuc > concoct_merge_cut_up_clustering
comparison merge_cut_up_clustering.py @ 0:b546422c9128 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
| author | iuc |
|---|---|
| date | Sun, 13 Mar 2022 08:44:34 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b546422c9128 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 import re | |
| 5 import sys | |
| 6 from collections import Counter | |
| 7 from collections import defaultdict | |
| 8 | |
| 9 | |
# Matches "<original id>.concoct_part_<digits>" anchored at the start of the id.
CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)')


def original_contig_name_special(contig_id):
    """Split a cut-up contig id into its original contig name and part index.

    Returns a ``(original_id, part_index)`` tuple. When ``contig_id`` matches
    ``CONTIG_PART_EXPR`` the part index is the captured digit string; when it
    does not match, the id is returned unchanged together with the integer
    ``0``.
    """
    found = CONTIG_PART_EXPR.match(contig_id)
    if found is None:
        # No matches for concoct_part regex.
        return contig_id, 0
    return found.group(1), found.group(2)
| 20 | |
| 21 | |
def _read_cut_up_clusters(path):
    """Read a tabular cut-up clustering file.

    The first line is treated as a header and must contain ``contig_id``.
    Every following non-blank line must hold exactly two tab-separated
    columns: the (possibly cut-up) contig id and its cluster id.

    Returns a mapping ``{original_contig_name: {part_id: cluster_id}}``.
    An empty file yields an empty mapping.

    Raises ``ValueError`` when the header lacks ``contig_id`` or a data line
    does not have exactly two columns.
    """
    all_originals = defaultdict(dict)
    with open(path, 'r') as ifh:
        header = next(ifh, None)
        if header is None:
            # Empty input: nothing to merge, only the output header is written.
            return all_originals
        if 'contig_id' not in header:
            raise ValueError("Invalid clustering file, 'contig_id' is not found in the header.")
        # Data starts on the second line of the file.
        for line_number, line in enumerate(ifh, start=2):
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"Invalid clustering file, line {line_number} has "
                    f"{len(fields)} tab-separated columns, expected 2."
                )
            contig_id, cluster_id = fields
            original_contig_name, part_id = original_contig_name_special(contig_id)
            all_originals[original_contig_name][part_id] = cluster_id
    return all_originals


def _write_merged_clusters(path, all_originals):
    """Write one ``contig_id<TAB>cluster_id`` line per original contig.

    Each original contig is assigned the cluster that occurs most often
    among its parts; a contig with a single part simply keeps that part's
    cluster.
    """
    with open(path, 'w') as ofh:
        ofh.write("contig_id\tcluster_id\n")
        for original_contig_id, part_ids_d in all_originals.items():
            # Majority vote over the clusters assigned to the contig's parts.
            cluster_id = Counter(part_ids_d.values()).most_common(1)[0][0]
            ofh.write(f"{original_contig_id}\t{cluster_id}\n")


def main():
    """Parse the command line and merge the cut-up clustering into the output file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", action="store", dest="input", help="Tabular file with cut up clusters")
    parser.add_argument("--output", action="store", dest="output", help="Output file with merged clusters")

    args = parser.parse_args()

    # Get cut up clusters.
    try:
        all_originals = _read_cut_up_clusters(args.input)
    except ValueError as err:
        sys.stderr.write(f"ERROR: {err}\n")
        sys.exit(-1)

    # Merge cut up clusters.
    _write_merged_clusters(args.output, all_originals)


if __name__ == "__main__":
    main()
