comparison format_cd_hit_output.py @ 1:64da677bcee2 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
author bgruening
date Wed, 19 Oct 2022 14:42:33 +0000
parents 4015e9d6d277
children
comparison
equal deleted inserted replaced
0:4015e9d6d277 1:64da677bcee2
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 3
4 import sys
5 import os
6 import argparse 4 import argparse
7 import copy 5
8 import operator
9 from sets import Set
10 6
11 def extract_mapping_info(input_mapping_filepath): 7 def extract_mapping_info(input_mapping_filepath):
12 mapping_info = {} 8 mapping_info = {}
13 categories = Set([]) 9 categories = set([])
14 10
15 with open(input_mapping_filepath,'r') as mapping_file: 11 with open(input_mapping_filepath, 'r') as mapping_file:
16 for line in mapping_file.readlines(): 12 for line in mapping_file.readlines():
17 split_line = line[:-1].split('\t') 13 split_line = line[:-1].split('\t')
18 mapping_info.setdefault(split_line[0],split_line[1]) 14 mapping_info.setdefault(split_line[0], split_line[1])
19 categories.add(split_line[1]) 15 categories.add(split_line[1])
20 16
21 return mapping_info, categories 17 return mapping_info, categories
22 18
23 def init_category_distribution(categories = None): 19
24 cluster_category_distribution = {} 20 def init_category_distribution(categories=None):
25 if categories != None: 21 cluster_categ_distri = {}
22 if categories is not None:
26 for category in categories: 23 for category in categories:
27 cluster_category_distribution[category] = 0 24 cluster_categ_distri[category] = 0
28 return cluster_category_distribution 25 return cluster_categ_distri
29 26
30 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, 27
31 cluster_category_distribution, categories, output_category_distribution_file, 28 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
32 cluster_seq_number): 29 cluster_categ_distri, categories,
30 output_category_distribution_file, cluster_seq_number):
33 if cluster_name != '': 31 if cluster_name != '':
34 if categories != None: 32 if categories is not None:
35 output_category_distribution_file.write(cluster_name) 33 string = cluster_name
36 output_category_distribution_file.write('\t' + str(cluster_seq_number)) 34 string += '\t' + str(cluster_seq_number)
37 for category in categories: 35 for category in categories:
38 output_category_distribution_file.write('\t') 36 string += '\t'
39 output_category_distribution_file.write(str(cluster_category_distribution[category])) 37 string += str(cluster_categ_distri[category])
40 output_category_distribution_file.write('\n') 38 string += '\n'
39 output_category_distribution_file.write(string)
41 40
42 if cluster_ref_seq == '': 41 if cluster_ref_seq == '':
43 string = "No reference sequence found for " 42 string = "No reference sequence found for "
44 string += cluster_name 43 string += cluster_name
45 raise ValueError(string) 44 raise ValueError(string)
46 45
47 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name) 46 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name)
48 47
49 def extract_cluster_info(args, mapping_info = None, categories = None): 48
49 def extract_cluster_info(args, mapping_info=None, categories=None):
50 ref_seq_cluster = {} 50 ref_seq_cluster = {}
51 51
52 if args.output_category_distribution != None: 52 if args.output_category_distribution is not None:
53 if mapping_info == None or categories == None: 53 if mapping_info is None or categories is None:
54 string = "A file with category distribution is expected but " 54 string = "A file with category distribution is expected but "
55 string += "no mapping information are available" 55 string += "no mapping information are available"
56 raise ValueError(string) 56 raise ValueError(string)
57 output_cat_distri_file = open(args.output_category_distribution, 'w') 57 output_cat_distri_file = open(args.output_category_distribution, 'w')
58 output_cat_distri_file.write('Cluster\tSequence_number') 58 output_cat_distri_file.write('Cluster\tSequence_number')
61 61
62 output_cat_distri_file.write('\n') 62 output_cat_distri_file.write('\n')
63 else: 63 else:
64 output_cat_distri_file = None 64 output_cat_distri_file = None
65 65
66 with open(args.input_cluster_info,'r') as cluster_info_file: 66 with open(args.input_cluster_info, 'r') as cluster_info_file:
67 cluster_name = '' 67 cluster_name = ''
68 cluster_category_distribution = init_category_distribution(categories) 68 cluster_categ_distri = init_category_distribution(categories)
69 cluster_ref_seq = '' 69 cluster_ref_seq = ''
70 cluster_seq_number = 0 70 cluster_seq_number = 0
71 for line in cluster_info_file.readlines(): 71 for line in cluster_info_file.readlines():
72 if line[0] == '>': 72 if line[0] == '>':
73 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, 73 flush_cluster_info(
74 cluster_category_distribution, categories, 74 cluster_name,
75 output_cat_distri_file, cluster_seq_number) 75 cluster_ref_seq,
76 ref_seq_cluster,
77 cluster_categ_distri,
78 categories,
79 output_cat_distri_file,
80 cluster_seq_number)
76 cluster_name = line[1:-1] 81 cluster_name = line[1:-1]
77 cluster_name = cluster_name.replace(' ','_') 82 cluster_name = cluster_name.replace(' ', '_')
78 cluster_category_distribution = init_category_distribution(categories) 83 cluster_categ_distri = init_category_distribution(categories)
79 cluster_ref_seq = '' 84 cluster_ref_seq = ''
80 cluster_seq_number = 0 85 cluster_seq_number = 0
81 else: 86 else:
82 seq_info = line[:-1].split('\t')[1].split(' ') 87 seq_info = line[:-1].split('\t')[1].split(' ')
83 seq_name = seq_info[1][1:-3] 88 seq_name = seq_info[1][1:-3]
84 cluster_seq_number += 1 89 cluster_seq_number += 1
85 90
86 if categories != None: 91 if categories is not None:
87 seq_count = 1 92 seq_count = 1
88 if args.number_sum != None: 93 if args.number_sum is not None:
89 if seq_name.find('size') != -1: 94 if seq_name.find('size') != -1:
90 substring = seq_name[seq_name.find('size'):-1] 95 substring = seq_name[seq_name.find('size'):-1]
91 seq_count = int(substring.split('=')[1]) 96 seq_count = int(substring.split('=')[1])
92 if not mapping_info.has_key(seq_name): 97 if seq_name not in mapping_info:
93 string = seq_name + " not found in mapping" 98 string = seq_name + " not found in mapping"
94 raise ValueError(string) 99 raise ValueError(string)
95 category = mapping_info[seq_name] 100 category = mapping_info[seq_name]
96 cluster_category_distribution[category] += seq_count 101 cluster_categ_distri[category] += seq_count
97 102
98 if seq_info[-1] == '*': 103 if seq_info[-1] == '*':
99 if cluster_ref_seq != '': 104 if cluster_ref_seq != '':
100 string = "A reference sequence (" + cluster_ref_seq 105 string = "A reference sequence (" + cluster_ref_seq
101 string += ") already found for cluster " + cluster_name 106 string += ") already found for cluster " + cluster_name
102 string += " (" + seq_name + ")" 107 string += " (" + seq_name + ")"
103 raise ValueError(string) 108 raise ValueError(string)
104 cluster_ref_seq = seq_name 109 cluster_ref_seq = seq_name
105 110
106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, 111 flush_cluster_info(
107 cluster_category_distribution, categories, output_cat_distri_file, 112 cluster_name,
113 cluster_ref_seq,
114 ref_seq_cluster,
115 cluster_categ_distri,
116 categories,
117 output_cat_distri_file,
108 cluster_seq_number) 118 cluster_seq_number)
109 119
110 if args.output_category_distribution != None: 120 if args.output_category_distribution is not None:
111 output_cat_distri_file.close() 121 output_cat_distri_file.close()
112 122
113 return ref_seq_cluster 123 return ref_seq_cluster
114 124
125
115 def rename_representative_sequences(args, ref_seq_cluster): 126 def rename_representative_sequences(args, ref_seq_cluster):
116 with open(args.input_representative_sequences,'r') as input_sequences: 127 with open(args.input_representative_sequences, 'r') as input_sequences:
117 with open(args.output_representative_sequences,'w') as output_sequences: 128 with open(args.output_representative_sequences, 'w') as output_seq:
118 for line in input_sequences.readlines(): 129 for line in input_sequences.readlines():
119 if line[0] == '>': 130 if line[0] == '>':
120 seq_name = line[1:-1] 131 seq_name = line[1:-1]
121 if not ref_seq_cluster.has_key(seq_name): 132 if seq_name not in ref_seq_cluster:
122 string = seq_name + " not found as reference sequence" 133 string = seq_name + " not found as reference sequence"
123 raise ValueError(string) 134 raise ValueError(string)
124 output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n') 135 string = '>' + ref_seq_cluster[seq_name] + '\n'
136 output_seq.write(string)
125 else: 137 else:
126 output_sequences.write(line) 138 output_seq.write(line)
139
127 140
128 def format_cd_hit_outputs(args): 141 def format_cd_hit_outputs(args):
129 if args.input_mapping != None: 142 if args.input_mapping is not None:
130 mapping_info, categories = extract_mapping_info(args.input_mapping) 143 mapping_info, categories = extract_mapping_info(args.input_mapping)
131 else: 144 else:
132 mapping_info = None 145 mapping_info = None
133 categories = None 146 categories = None
134 147
135 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories) 148 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories)
136 149
137 if args.input_representative_sequences != None: 150 if args.input_representative_sequences is not None:
138 rename_representative_sequences(args, ref_seq_cluster) 151 rename_representative_sequences(args, ref_seq_cluster)
152
139 153
140 if __name__ == "__main__": 154 if __name__ == "__main__":
141 parser = argparse.ArgumentParser() 155 parser = argparse.ArgumentParser()
142 parser.add_argument('--input_cluster_info', required=True) 156 parser.add_argument('--input_cluster_info', required=True)
143 parser.add_argument('--input_representative_sequences') 157 parser.add_argument('--input_representative_sequences')