Mercurial > repos > imgteam > iscc_sum
comparison iscc_similarity_parse_output.py @ 1:7d2c95a58897 draft default tip
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
| author | imgteam |
|---|---|
| date | Fri, 19 Dec 2025 15:03:29 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:b9caa783059f | 1:7d2c95a58897 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Parse ISCC similarity output into tabular format with unique identifiers. | |
| 4 | |
| 5 Input format (from iscc-sum --similar): | |
| 6 ISCC:K4AOMG... *file1.txt | |
| 7 ~08 ISCC:K4AOMG... *file2.txt | |
| 8 ~10 ISCC:K4AOMG... *file3.txt | |
| 9 ISCC:K4AGSPO... *file4.txt | |
| 10 | |
| 11 Output format (tabular with 7 columns, bidirectional): | |
| 12 file_id filename iscc_code match_id match_filename match_iscc_code distance | |
| 13 23 file1.txt K4AOMG... 24 file2.txt K4AOMG... 8 | |
| 14 24 file2.txt K4AOMG... 23 file1.txt K4AOMG... 8 | |
| 15 25 file4.txt K4AGSPO... -1 | |
| 16 """ | |
| 17 import argparse | |
| 18 | |
| 19 | |
def clean_filename(filename):
    """Return *filename* without a leading 'input_files/' directory prefix.

    Only a prefix at the very start of the string is removed, and only once;
    any other filename is returned unchanged.
    """
    prefix = 'input_files/'
    return filename[len(prefix):] if filename.startswith(prefix) else filename
| 27 | |
| 28 | |
def load_id_mapping(mapping_file):
    """Read the filename -> element_identifier table from a TSV file.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    that do not yield exactly two fields are ignored. Filenames are passed
    through clean_filename() so they match the names produced by
    parse_iscc_line(). If a cleaned filename occurs more than once, the last
    occurrence wins.

    Returns: dict mapping cleaned filename -> element_identifier
    """
    with open(mapping_file, 'r') as handle:
        rows = (raw.strip().split('\t') for raw in handle)
        return {
            clean_filename(fields[0]): fields[1]
            for fields in rows
            if len(fields) == 2
        }
| 44 | |
| 45 | |
def parse_iscc_line(line):
    """Split one ISCC output line into its code and cleaned filename.

    Accepted shape: "ISCC:CODE *filename", optionally preceded by other text
    (such as a "~NN" distance) before the "ISCC:" marker.

    Returns: (code, filename). The result is (None, None) when the line has
    no ' *' separator; code is '' when the part before the separator does not
    contain 'ISCC:'.
    """
    # Everything before the first ' *' describes the code, everything after
    # it is the filename.
    head, marker, tail = line.partition(' *')
    if not marker:
        return None, None

    filename = clean_filename(tail.strip())

    # The code is whatever follows the first 'ISCC:' marker.
    _, tag, after = head.strip().partition('ISCC:')
    code = after.strip() if tag else ''

    return code, filename
| 68 | |
| 69 | |
def _parse_similarity_output(path):
    """Parse raw similarity output into per-file codes and match tuples.

    Lines starting with 'ISCC:' are reference files; lines starting with a
    space that follow a reference are files similar to that reference, in
    the form " ~NN ISCC:CODE *filename".

    Returns: (file_codes, matches) where file_codes maps filename -> code and
    matches is a list of (ref_file, ref_code, file, code, distance) tuples.
    Raises: ValueError if the distance field of a similar line is not an
    integer after removing '~'.
    """
    file_codes = {}
    matches = []
    current_ref = None
    current_code = None

    with open(path, 'r') as f:
        for raw in f:
            line = raw.rstrip()
            if not line:
                continue

            if line.startswith('ISCC:'):
                # Reference file: "ISCC:CODE *filename"
                code, filename = parse_iscc_line(line)
                if code and filename:
                    current_ref, current_code = filename, code
                    file_codes[filename] = code
                else:
                    # Forget the previous reference so that the similar lines
                    # that follow an unparseable reference are not attributed
                    # to the wrong file.
                    current_ref = current_code = None
                continue

            if not (line.startswith(' ') and current_ref):
                continue

            # Similar file: " ~NN ISCC:CODE *filename"
            parts = line.strip().split(None, 1)  # distance, then the rest
            if len(parts) != 2:
                continue
            distance = int(parts[0].replace('~', ''))

            code, filename = parse_iscc_line(parts[1])
            if code and filename:
                matches.append(
                    (current_ref, current_code, filename, code, distance)
                )
                file_codes[filename] = code

    return file_codes, matches


def _file_id(filename):
    """Return the part of *filename* before its first underscore."""
    return filename.split('_', 1)[0]


def _write_table(output_path, id_map, file_codes, matches):
    """Write the 7-column tabular report.

    Every match is written twice (A -> B and B -> A). Files that take part in
    no match get one row with empty match columns and distance -1, in sorted
    filename order.

    Raises: KeyError if a parsed filename is missing from id_map.
    """
    with open(output_path, 'w') as out:
        # Write header (7 columns)
        out.write("file_id\tfilename\tiscc_code\tmatch_id\tmatch_filename\tmatch_iscc_code\tdistance\n")

        files_with_matches = set()

        for file1, code1, file2, code2, distance in matches:
            # The "filename" column holds the element identifier; the
            # "file_id" column holds the prefix of the on-disk filename.
            file1_name = id_map[file1]
            file2_name = id_map[file2]
            file1_id = _file_id(file1)
            file2_id = _file_id(file2)

            # Write A -> B
            out.write(f"{file1_id}\t{file1_name}\t{code1}\t{file2_id}\t{file2_name}\t{code2}\t{distance}\n")
            # Write B -> A (bidirectional)
            out.write(f"{file2_id}\t{file2_name}\t{code2}\t{file1_id}\t{file1_name}\t{code1}\t{distance}\n")

            files_with_matches.add(file1)
            files_with_matches.add(file2)

        # Files with no matches (distance = -1, empty match columns)
        for filename in sorted(file_codes):
            if filename in files_with_matches:
                continue
            file_id = _file_id(filename)
            element_name = id_map[filename]
            code_val = file_codes[filename]
            out.write(f"{file_id}\t{element_name}\t{code_val}\t\t\t\t-1\n")


def main():
    """Command-line entry point: convert raw similarity output to a table.

    Positional arguments: similarity_raw (raw similarity output),
    id_mapping (TSV of filename -> element identifier) and output_file
    (tabular file to write).
    """
    parser = argparse.ArgumentParser(
        description='Parse ISCC similarity output into tabular format'
    )
    parser.add_argument(
        'similarity_raw',
        help='Raw similarity output from iscc-sum --similar'
    )
    parser.add_argument(
        'id_mapping',
        help='TSV file mapping filenames to element identifiers'
    )
    parser.add_argument(
        'output_file',
        help='Tabular output file'
    )
    args = parser.parse_args()

    id_map = load_id_mapping(args.id_mapping)
    file_codes, matches = _parse_similarity_output(args.similarity_raw)
    _write_table(args.output_file, id_map, file_codes, matches)
| 155 | |
| 156 | |
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
