annotate format_cd_hit_output.py @ 1:64da677bcee2 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
author bgruening
date Wed, 19 Oct 2022 14:42:33 +0000
parents 4015e9d6d277
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
1 #!/usr/bin/env python
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
2 # -*- coding: utf-8 -*-
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
3
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
4 import argparse
1
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
5
0
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
6
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
def extract_mapping_info(input_mapping_filepath):
    """Read a tab-separated mapping file into a lookup table.

    The first column of each line is used as key and the second column as
    its category. When a key appears several times, the first category
    seen for it is kept, but every category encountered is still recorded
    in the returned set.

    Args:
        input_mapping_filepath: path to the tab-separated mapping file.

    Returns:
        A ``(mapping_info, categories)`` tuple where ``mapping_info`` is a
        dict mapping first-column values to second-column values and
        ``categories`` is the set of all second-column values.

    Raises:
        ValueError: if a non-empty line has fewer than two columns.
    """
    mapping_info = {}
    categories = set()

    with open(input_mapping_filepath, 'r') as mapping_file:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in mapping_file:
            # Only strip the newline: slicing with [:-1] would eat the last
            # character of a final line that has no trailing newline.
            line = line.rstrip('\n')
            if not line:
                continue
            split_line = line.split('\t')
            if len(split_line) < 2:
                raise ValueError(
                    "Malformed mapping line (expected 2 columns): " + line)
            mapping_info.setdefault(split_line[0], split_line[1])
            categories.add(split_line[1])

    return mapping_info, categories
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
18
1
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
19
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
def init_category_distribution(categories=None):
    """Return a fresh per-category counter with every count set to zero.

    Args:
        categories: iterable of category names, or None.

    Returns:
        A new dict mapping each category to 0; an empty dict when
        ``categories`` is None.
    """
    if categories is None:
        return {}
    # dict.fromkeys builds a new dict on every call, so each cluster gets
    # its own independent counter.
    return dict.fromkeys(categories, 0)
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
26
0
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
27
1
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
                       cluster_categ_distri, categories,
                       output_category_distribution_file, cluster_seq_number):
    """Record the information gathered for one finished cluster.

    Does nothing when ``cluster_name`` is the empty string (no cluster has
    been started yet).

    Args:
        cluster_name: name of the cluster being closed.
        cluster_ref_seq: name of the cluster's reference sequence, or ''
            when none was found.
        ref_seq_cluster: dict updated in place, mapping reference sequence
            name to cluster name (an existing entry is not overwritten).
        cluster_categ_distri: dict mapping category to its count for this
            cluster.
        categories: iterable of categories giving the column order, or
            None when no category information is available.
        output_category_distribution_file: writable file-like object for
            the distribution table, or None when no table is requested.
        cluster_seq_number: number of sequences counted in the cluster.

    Raises:
        ValueError: if the cluster has no reference sequence.
    """
    if cluster_name == '':
        return

    # The output file is None when no distribution table was requested even
    # though categories may be known; writing must be skipped in that case
    # instead of failing on None.write().
    if (categories is not None
            and output_category_distribution_file is not None):
        fields = [cluster_name, str(cluster_seq_number)]
        fields.extend(
            str(cluster_categ_distri[category]) for category in categories)
        output_category_distribution_file.write('\t'.join(fields) + '\n')

    if cluster_ref_seq == '':
        raise ValueError("No reference sequence found for " + cluster_name)

    ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name)
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
47
1
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
48
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
def extract_cluster_info(args, mapping_info=None, categories=None):
    """Parse the cluster description file and collect per-cluster data.

    Lines starting with '>' open a new cluster; every other non-empty line
    describes one member sequence. For each cluster the reference sequence
    (the member whose last space-separated token is '*') is recorded, and,
    when a category distribution file is requested, one row of per-category
    counts is written.

    Args:
        args: parsed arguments; reads ``input_cluster_info``,
            ``output_category_distribution`` and ``number_sum``.
        mapping_info: dict mapping sequence name to category, or None.
        categories: iterable of categories, or None.

    Returns:
        A dict mapping reference sequence name to cluster name.

    Raises:
        ValueError: if a distribution file is requested without mapping
            information, if a sequence is missing from the mapping, if a
            cluster has several reference sequences, or (via
            ``flush_cluster_info``) if a cluster has none.
    """
    ref_seq_cluster = {}
    output_cat_distri_file = None

    if args.output_category_distribution is not None:
        if mapping_info is None or categories is None:
            string = "A file with category distribution is expected but "
            string += "no mapping information are available"
            raise ValueError(string)
        output_cat_distri_file = open(args.output_category_distribution, 'w')

    # try/finally guarantees the distribution file is closed even when a
    # ValueError is raised while parsing.
    try:
        if output_cat_distri_file is not None:
            # NOTE(review): column order follows the iteration order of
            # `categories`; rows written by flush_cluster_info iterate the
            # same object, so header and rows stay aligned.
            header = ['Cluster', 'Sequence_number']
            header.extend(categories)
            output_cat_distri_file.write('\t'.join(header) + '\n')

        cluster_name = ''
        cluster_categ_distri = init_category_distribution(categories)
        cluster_ref_seq = ''
        cluster_seq_number = 0

        with open(args.input_cluster_info, 'r') as cluster_info_file:
            # Stream the file rather than loading it with readlines().
            for line in cluster_info_file:
                # Strip only the newline so that a last line without one is
                # not truncated; blank lines carry no information.
                line = line.rstrip('\n')
                if not line:
                    continue

                if line[0] == '>':
                    # A new cluster starts: store the previous one first.
                    flush_cluster_info(
                        cluster_name,
                        cluster_ref_seq,
                        ref_seq_cluster,
                        cluster_categ_distri,
                        categories,
                        output_cat_distri_file,
                        cluster_seq_number)
                    cluster_name = line[1:].replace(' ', '_')
                    cluster_categ_distri = init_category_distribution(
                        categories)
                    cluster_ref_seq = ''
                    cluster_seq_number = 0
                    continue

                # Member line: second tab-separated field, split on spaces;
                # the second token holds the name, minus its first
                # character and its last three characters.
                # Presumably these are the '>' prefix and '...' suffix of a
                # CD-HIT .clstr member line -- TODO confirm.
                seq_info = line.split('\t')[1].split(' ')
                seq_name = seq_info[1][1:-3]
                cluster_seq_number += 1

                if categories is not None:
                    seq_count = 1
                    if args.number_sum is not None:
                        size_pos = seq_name.find('size')
                        if size_pos != -1:
                            # The last character of the name is dropped
                            # before reading the value after '='.
                            # Assumes names end with a separator, e.g.
                            # ';size=12;' -- TODO confirm.
                            substring = seq_name[size_pos:-1]
                            seq_count = int(substring.split('=')[1])
                    if seq_name not in mapping_info:
                        raise ValueError(seq_name + " not found in mapping")
                    category = mapping_info[seq_name]
                    cluster_categ_distri[category] += seq_count

                if seq_info[-1] == '*':
                    if cluster_ref_seq != '':
                        string = "A reference sequence (" + cluster_ref_seq
                        string += ") already found for cluster "
                        string += cluster_name
                        string += " (" + seq_name + ")"
                        raise ValueError(string)
                    cluster_ref_seq = seq_name

        # Store the last cluster of the file.
        flush_cluster_info(
            cluster_name,
            cluster_ref_seq,
            ref_seq_cluster,
            cluster_categ_distri,
            categories,
            output_cat_distri_file,
            cluster_seq_number)
    finally:
        if output_cat_distri_file is not None:
            output_cat_distri_file.close()

    return ref_seq_cluster
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
124
1
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
125
0
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
def rename_representative_sequences(args, ref_seq_cluster):
    """Copy the representative sequences, renaming each header line.

    Every line starting with '>' is replaced by '>' followed by the cluster
    name registered for that sequence name; all other lines are copied
    unchanged.

    Args:
        args: parsed arguments; reads ``input_representative_sequences``
            (input path) and ``output_representative_sequences`` (output
            path).
        ref_seq_cluster: dict mapping reference sequence name to cluster
            name.

    Raises:
        ValueError: if a header's sequence name is not a key of
            ``ref_seq_cluster``.
    """
    with open(args.input_representative_sequences, 'r') as input_sequences, \
            open(args.output_representative_sequences, 'w') as output_seq:
        # Stream line by line: sequence files can be large and readlines()
        # would hold the whole file in memory.
        for line in input_sequences:
            if line.startswith('>'):
                # Strip only the newline so a header without one is not
                # truncated by a blind [:-1] slice.
                seq_name = line[1:].rstrip('\n')
                if seq_name not in ref_seq_cluster:
                    raise ValueError(
                        seq_name + " not found as reference sequence")
                output_seq.write('>' + ref_seq_cluster[seq_name] + '\n')
            else:
                output_seq.write(line)
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
139
0
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
140
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
def format_cd_hit_outputs(args):
    """Run the formatting steps selected by the parsed arguments.

    The mapping file is read only when ``args.input_mapping`` is set;
    otherwise both the mapping information and the categories are passed on
    as ``None``. The cluster information is always extracted, and the
    representative sequences are renamed only when
    ``args.input_representative_sequences`` is set.

    Args:
        args: parsed command-line arguments; forwarded as-is to
            ``extract_cluster_info`` and ``rename_representative_sequences``.
    """
    # Default to "no mapping supplied" and override when a file is given.
    mapping_info = categories = None
    if args.input_mapping is not None:
        mapping_info, categories = extract_mapping_info(args.input_mapping)

    ref_seq_cluster = extract_cluster_info(args, mapping_info, categories)

    # Renaming is optional: nothing to do without a sequences file.
    if args.input_representative_sequences is None:
        return
    rename_representative_sequences(args, ref_seq_cluster)
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
152
1
64da677bcee2 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
bgruening
parents: 0
diff changeset
153
0
4015e9d6d277 planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
bebatut
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: only the cluster info file is mandatory;
    # every other option defaults to None when omitted.
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument('--input_cluster_info', required=True)
    for optional_flag in (
        '--input_representative_sequences',
        '--output_representative_sequences',
        '--input_mapping',
        '--output_category_distribution',
        '--number_sum',
    ):
        cli_parser.add_argument(optional_flag)
    args = cli_parser.parse_args()

    format_cd_hit_outputs(args)