Mercurial > repos > iuc > concoct_merge_cut_up_clustering
annotate merge_cut_up_clustering.py @ 0:b546422c9128 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
| author | iuc |
|---|---|
| date | Sun, 13 Mar 2022 08:44:34 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
2 |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
3 import argparse |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
4 import re |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
5 import sys |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
6 from collections import Counter |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
7 from collections import defaultdict |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
8 |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
9 |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
# Matches contig ids of the form "<original_id>.concoct_part_<digits>".
# The leading group is greedy, so only the last ".concoct_part_" suffix is
# treated as the part marker.
CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)')


def original_contig_name_special(contig_id):
    """Split a cut-up contig id into its original contig name and part index.

    Args:
        contig_id: Contig identifier, optionally carrying a
            ``.concoct_part_<N>`` suffix.

    Returns:
        A ``(original_id, part_index)`` tuple. When the suffix is present,
        ``part_index`` is the matched digit string (a ``str``). When it is
        absent, the id is returned unchanged together with the integer ``0``.
    """
    match = CONTIG_PART_EXPR.match(contig_id)
    if match is None:
        # No matches for concoct_part regex: the contig was never cut up.
        # Checking explicitly avoids raising an exception per unsplit contig.
        return contig_id, 0
    return match.group(1, 2)
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
20 |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
21 |
|
b546422c9128
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff
changeset
|
# Command-line interface. Both paths are mandatory: without them the script
# would otherwise fail later with an opaque TypeError from open(None).
parser = argparse.ArgumentParser()
parser.add_argument("--input", action="store", dest="input", required=True, help="Tabular file with cut up clusters")
parser.add_argument("--output", action="store", dest="output", required=True, help="Output file with merged clusters")

args = parser.parse_args()

# Get cut up clusters.
# Maps original contig name -> {part id -> cluster id assigned to that part}.
all_originals = defaultdict(dict)
with open(args.input, 'r') as ifh:
    for i, line in enumerate(ifh):
        if i == 0:
            if 'contig_id' not in line:
                sys.stderr.write("ERROR: Invalid clustering file, 'contig_id' is not found in the header.\n")
                sys.exit(-1)
            # Skip header.
            continue
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines (e.g. an empty line at the end of the file)
            # instead of failing on the two-column unpack below.
            continue
        contig_id, cluster_id = line.split('\t')
        original_contig_name, part_id = original_contig_name_special(contig_id)
        all_originals[original_contig_name][part_id] = cluster_id

# Merge cut up clusters.
with open(args.output, 'w') as ofh:
    ofh.write("contig_id\tcluster_id\n")
    for original_contig_id, part_ids_d in all_originals.items():
        # Majority vote over the parts of this contig. With a single part this
        # is simply that part's cluster; on a tie, the cluster encountered
        # first in the input wins.
        cluster_id = Counter(part_ids_d.values()).most_common(1)[0][0]
        ofh.write(f"{original_contig_id}\t{cluster_id}\n")
