Mercurial > repos > imgteam > iscc_sum_verify
annotate iscc_similarity_parse_output.py @ 0:fe0c92fe6f66 draft default tip
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
| author | imgteam |
|---|---|
| date | Fri, 19 Dec 2025 15:03:21 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
2 """ |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
3 Parse ISCC similarity output into tabular format with unique identifiers. |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
4 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
5 Input format (from iscc-sum --similar): |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
6 ISCC:K4AOMG... *file1.txt |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
7 ~08 ISCC:K4AOMG... *file2.txt |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
8 ~10 ISCC:K4AOMG... *file3.txt |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
9 ISCC:K4AGSPO... *file4.txt |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
10 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
11 Output format (tabular with 7 columns, bidirectional): |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
12 file_id filename iscc_code match_id match_filename match_iscc_code distance |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
13 23 file1.txt K4AOMG... 24 file2.txt K4AOMG... 8 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
14 24 file2.txt K4AOMG... 23 file1.txt K4AOMG... 8 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
15 25 file4.txt K4AGSPO... -1 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
16 """ |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
17 import argparse |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
18 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
19 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
def clean_filename(filename):
    """Strip one leading 'input_files/' component from a filename.

    Only that exact prefix is removed, and only once; a name that does not
    start with it is returned unchanged.
    """
    prefix = 'input_files/'
    if not filename.startswith(prefix):
        return filename
    return filename[len(prefix):]
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
27 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
28 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
def load_id_mapping(mapping_file):
    """Build the filename -> element_identifier lookup from a TSV file.

    Every line is stripped of surrounding whitespace and split on tabs.
    Lines that do not yield exactly two fields are ignored. The first field
    is normalised with clean_filename() so that keys match the filenames
    produced by parse_iscc_line(). If the same cleaned filename occurs more
    than once, the last line wins.

    Returns: dict mapping cleaned filename -> element_identifier
    """
    with open(mapping_file, 'r') as handle:
        rows = [raw.strip().split('\t') for raw in handle]

    return {
        clean_filename(fields[0]): fields[1]
        for fields in rows
        if len(fields) == 2
    }
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
44 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
45 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
def parse_iscc_line(line):
    """Split an iscc-sum output line into its ISCC code and filename.

    Accepted shapes: "ISCC:CODE *filename" or " ~NN ISCC:CODE *filename".
    The first ' *' in the line separates the code part from the filename.

    Returns:
        (None, None) when the line contains no ' *' separator.
        (code, filename) otherwise, where filename has been stripped and
        passed through clean_filename(), and code is the stripped text
        following the first 'ISCC:' in the code part. If the code part has
        no 'ISCC:' marker, code is the empty string.
    """
    before, separator, after = line.partition(' *')
    if not separator:
        return None, None

    filename = clean_filename(after.strip())

    # Everything after the first 'ISCC:' marker is the code; any distance
    # prefix such as '~08' in front of the marker is discarded.
    _, marker, remainder = before.strip().partition('ISCC:')
    code = remainder.strip() if marker else ''

    return code, filename
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
68 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
69 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
70 def main(): |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
71 parser = argparse.ArgumentParser( |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
72 description='Parse ISCC similarity output into tabular format' |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
73 ) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
74 parser.add_argument( |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
75 'similarity_raw', |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
76 help='Raw similarity output from iscc-sum --similar' |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
77 ) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
78 parser.add_argument( |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
79 'id_mapping', |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
80 help='TSV file mapping filenames to element identifiers' |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
81 ) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
82 parser.add_argument( |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
83 'output_file', |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
84 help='Tabular output file' |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
85 ) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
86 args = parser.parse_args() |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
87 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
88 # Load ID mapping |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
89 id_map = load_id_mapping(args.id_mapping) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
90 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
91 # Parse similarity output |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
92 file_codes = {} # filename -> code mapping |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
93 matches = [] # List of (file1, code1, file2, code2, distance) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
94 current_ref = None |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
95 current_code = None |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
96 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
97 with open(args.similarity_raw, 'r') as f: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
98 for line in f: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
99 line = line.rstrip() |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
100 if not line: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
101 continue |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
102 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
103 if line.startswith('ISCC:'): |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
104 # Reference file: "ISCC:CODE *filename" |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
105 code, filename = parse_iscc_line(line) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
106 if code and filename: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
107 current_ref = filename |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
108 current_code = code |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
109 file_codes[filename] = code |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
110 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
111 elif line.startswith(' ') and current_ref: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
112 # Similar file: " ~NN ISCC:CODE *filename" |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
113 parts = line.strip().split(None, 1) # Split on first whitespace |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
114 if len(parts) == 2: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
115 dist_str = parts[0].replace('~', '') |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
116 distance = int(dist_str) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
117 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
118 # Parse the rest of the line for ISCC and filename |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
119 code, filename = parse_iscc_line(parts[1]) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
120 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
121 if code and filename: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
122 matches.append((current_ref, current_code, filename, code, distance)) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
123 file_codes[filename] = code |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
124 # Write output with identifiers |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
125 with open(args.output_file, 'w') as out: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
126 # Write header (7 columns) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
127 out.write("file_id\tfilename\tiscc_code\tmatch_id\tmatch_filename\tmatch_iscc_code\tdistance\n") |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
128 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
129 # Track which files have matches |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
130 files_with_matches = set() |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
131 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
132 # Write similarity matches in both directions |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
133 for file1, code1, file2, code2, distance in matches: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
134 # Get element identifiers |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
135 file1_name = id_map[file1] |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
136 file2_name = id_map[file2] |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
137 file1_id = str.split(file1, '_', 1)[0] # Extract ID from filename |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
138 file2_id = str.split(file2, '_', 1)[0] # Extract ID from filename |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
139 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
140 # Write A -> B (file_id is the numeric ID, filename is the element_identifier) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
141 out.write(f"{file1_id}\t{file1_name}\t{code1}\t{file2_id}\t{file2_name}\t{code2}\t{distance}\n") |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
142 # Write B -> A (bidirectional) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
143 out.write(f"{file2_id}\t{file2_name}\t{code2}\t{file1_id}\t{file1_name}\t{code1}\t{distance}\n") |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
144 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
145 files_with_matches.add(file1) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
146 files_with_matches.add(file2) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
147 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
148 # Write files with no matches (distance = -1, empty match columns) |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
149 for filename in sorted(file_codes.keys()): |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
150 if filename not in files_with_matches: |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
151 file_id = str.split(filename, '_', 1)[0] # Extract ID from filename |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
152 element_name = id_map[filename] |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
153 code_val = file_codes[filename] |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
154 out.write(f"{file_id}\t{element_name}\t{code_val}\t\t\t\t-1\n") |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
155 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
156 |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
157 if __name__ == '__main__': |
|
fe0c92fe6f66
planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/iscc-sum commit 6db86b8b65a0e05b7f3541d505fbe900633fc72a
imgteam
parents:
diff
changeset
|
158 main() |
