comparison iscc_similarity_parse_output.py @ 1:7d2c95a58897 draft default tip

planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
author imgteam
date Fri, 19 Dec 2025 15:03:29 +0000
parents
children
comparison
equal deleted inserted replaced
0:b9caa783059f 1:7d2c95a58897
1 #!/usr/bin/env python
2 """
3 Parse ISCC similarity output into tabular format with unique identifiers.
4
5 Input format (from iscc-sum --similar):
6 ISCC:K4AOMG... *file1.txt
7 ~08 ISCC:K4AOMG... *file2.txt
8 ~10 ISCC:K4AOMG... *file3.txt
9 ISCC:K4AGSPO... *file4.txt
10
11 Output format (tabular with 7 columns, bidirectional):
12 file_id filename iscc_code match_id match_filename match_iscc_code distance
13 23 file1.txt K4AOMG... 24 file2.txt K4AOMG... 8
14 24 file2.txt K4AOMG... 23 file1.txt K4AOMG... 8
15 25 file4.txt K4AGSPO... -1
16 """
17 import argparse
18
19
def clean_filename(filename):
    """Strip a leading 'input_files/' directory component, if there is one.

    Only a prefix at the very start of the name is removed, and only once;
    names that do not start with it are returned unchanged.
    """
    prefix = 'input_files/'
    return filename[len(prefix):] if filename.startswith(prefix) else filename
27
28
def load_id_mapping(mapping_file):
    """Read the filename -> element_identifier table from a TSV file.

    Every line is stripped of surrounding whitespace and split on tabs.
    Lines that do not yield exactly two fields are ignored. The first field
    is normalised with clean_filename() so that the keys agree with the
    filenames produced by parse_iscc_line(). If a cleaned filename occurs
    more than once, the last occurrence wins.

    Returns: dict mapping cleaned filename -> element_identifier
    """
    id_by_name = {}
    with open(mapping_file, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                continue
            raw_name, element_id = fields
            id_by_name[clean_filename(raw_name)] = element_id
    return id_by_name
44
45
def parse_iscc_line(line):
    """Split an ISCC output line into its code and its cleaned filename.

    Format: "ISCC:CODE *filename" or " ~NN ISCC:CODE *filename"

    The line is cut at the first ' *'. What follows is the filename (stripped
    and passed through clean_filename()); what precedes it is searched for
    the 'ISCC:' marker, and the stripped text after the first marker is the
    code. Anything in front of the marker is ignored.

    Returns: (code, filename). (None, None) when the line has no ' *'
    separator; code is '' when the part before the separator has no 'ISCC:'
    marker.
    """
    head, separator, tail = line.partition(' *')
    if not separator:
        return None, None

    filename = clean_filename(tail.strip())

    # partition() leaves the marker slot empty when 'ISCC:' is absent.
    _, marker, after_marker = head.strip().partition('ISCC:')
    code = after_marker.strip() if marker else ''

    return code, filename
68
69
def _parse_similarity(similarity_path):
    """Read raw `iscc-sum --similar` output from *similarity_path*.

    Lines starting with 'ISCC:' open a new reference group. Lines starting
    with a space are similar-file entries (" ~NN ISCC:CODE *filename") and
    are attached to the most recent reference; they are ignored while no
    reference has been seen. Blank lines and lines whose code or filename
    cannot be extracted are skipped.

    Returns: (file_codes, matches) where file_codes maps filename -> code for
    every parsed file and matches is a list of
    (ref_file, ref_code, file, code, distance) tuples in input order.

    Raises: ValueError if the distance token of a similar-file line is not
    an integer once '~' has been removed.
    """
    file_codes = {}
    matches = []
    current_ref = None
    current_code = None

    with open(similarity_path, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue

            if line.startswith('ISCC:'):
                # Reference file: "ISCC:CODE *filename"
                code, filename = parse_iscc_line(line)
                if code and filename:
                    current_ref = filename
                    current_code = code
                    file_codes[filename] = code
                continue

            if not (line.startswith(' ') and current_ref):
                continue

            # Similar file: " ~NN ISCC:CODE *filename"
            parts = line.strip().split(None, 1)  # distance token, remainder
            if len(parts) != 2:
                continue
            distance = int(parts[0].replace('~', ''))

            code, filename = parse_iscc_line(parts[1])
            if code and filename:
                matches.append(
                    (current_ref, current_code, filename, code, distance)
                )
                file_codes[filename] = code

    return file_codes, matches


def _file_id(filename):
    """Return the part of *filename* before its first underscore."""
    return filename.split('_', 1)[0]


def _write_table(output_path, id_map, file_codes, matches):
    """Write the 7-column tab-separated table to *output_path*.

    A header row is written first. Every match then produces two rows
    (A -> B and B -> A). Finally every file in *file_codes* that took part in
    no match gets one row with empty match columns and distance -1, in
    sorted filename order.

    Raises: KeyError if a filename has no entry in *id_map*.
    """
    with open(output_path, 'w') as out:
        out.write("file_id\tfilename\tiscc_code\tmatch_id\tmatch_filename\tmatch_iscc_code\tdistance\n")

        files_with_matches = set()

        for file1, code1, file2, code2, distance in matches:
            # The "filename" columns carry the element identifier, while the
            # "*_id" columns carry the prefix of the on-disk filename.
            file1_name = id_map[file1]
            file2_name = id_map[file2]
            file1_id = _file_id(file1)
            file2_id = _file_id(file2)

            out.write(f"{file1_id}\t{file1_name}\t{code1}\t{file2_id}\t{file2_name}\t{code2}\t{distance}\n")
            out.write(f"{file2_id}\t{file2_name}\t{code2}\t{file1_id}\t{file1_name}\t{code1}\t{distance}\n")

            files_with_matches.add(file1)
            files_with_matches.add(file2)

        for filename in sorted(file_codes):
            if filename in files_with_matches:
                continue
            file_id = _file_id(filename)
            element_name = id_map[filename]
            code_val = file_codes[filename]
            out.write(f"{file_id}\t{element_name}\t{code_val}\t\t\t\t-1\n")


def main():
    """Command-line entry point.

    Takes three positional arguments: the raw similarity output, the TSV
    mapping of filenames to element identifiers, and the path of the tabular
    file to write. Loads the mapping, parses the similarity output and writes
    the resulting table.
    """
    parser = argparse.ArgumentParser(
        description='Parse ISCC similarity output into tabular format'
    )
    parser.add_argument(
        'similarity_raw',
        help='Raw similarity output from iscc-sum --similar'
    )
    parser.add_argument(
        'id_mapping',
        help='TSV file mapping filenames to element identifiers'
    )
    parser.add_argument(
        'output_file',
        help='Tabular output file'
    )
    args = parser.parse_args()

    id_map = load_id_mapping(args.id_mapping)
    file_codes, matches = _parse_similarity(args.similarity_raw)
    _write_table(args.output_file, id_map, file_codes, matches)
155
156
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()