annotate sequence_overview.py @ 95:d63eff357515 draft

planemo upload commit d96a736dcd6da34137f79861fbc6369716c332f1
author rhpvorderman
date Mon, 27 Mar 2023 13:11:53 +0000
parents 84e9e5c8c101
children 385dea3c6cb5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
92
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
1 #!/usr/bin/env/python3
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
2
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
3 """Create a HTML sequence overview"""
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
4
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
5 import argparse
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
6 import os
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
7 import typing
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
8 from collections import defaultdict
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
9 from pathlib import Path
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
10 from typing import Dict, Iterable, List
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
11
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
12
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
13 class SequenceTableRow(typing.NamedTuple):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
14 sequence_id: str
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
15 sequence: str
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
16 best_match: str
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
17 functionality: str
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
18
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
19
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
20 class SequenceStats:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
21 __slots__ = ("counts", "table_rows")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
22
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
23 def __init__(self):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
24 self.counts: Dict[str, int] = {
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
25 "IGA1": 0,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
26 "IGA2": 0,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
27 "IGE": 0,
94
84e9e5c8c101 "planemo upload commit d4be85014b638f1d50b318d4b735be7f6e973140"
rhpvorderman
parents: 93
diff changeset
28 "IGA": 0, # IGA and IGG without subclasses only exist when the
84e9e5c8c101 "planemo upload commit d4be85014b638f1d50b318d4b735be7f6e973140"
rhpvorderman
parents: 93
diff changeset
29 "IGG": 0, # everything is IGA or IGG option is chosen.
92
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
30 "IGG1": 0,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
31 "IGG2": 0,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
32 "IGG3": 0,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
33 "IGG4": 0,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
34 "IGM": 0,
93
8fcf31272f6e planemo upload commit a43893724cc769bed8a1f19a5b19ec1ba20cb63c
rhpvorderman
parents: 92
diff changeset
35 "unmatched": 0,
8fcf31272f6e planemo upload commit a43893724cc769bed8a1f19a5b19ec1ba20cb63c
rhpvorderman
parents: 92
diff changeset
36 "all": 0,
8fcf31272f6e planemo upload commit a43893724cc769bed8a1f19a5b19ec1ba20cb63c
rhpvorderman
parents: 92
diff changeset
37 }
92
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
38 self.table_rows: List[SequenceTableRow] = []
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
39
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
40
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
41 def get_sequence_stats(before_unique: str,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
42 sequence_columns: List[str]):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
43 sequence_statistics = defaultdict(SequenceStats)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
44 with open(before_unique, "rt") as table:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
45 header = next(table)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
46 header_columns = header.strip("\n").split("\t")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
47 for line in table:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
48 values = line.strip("\n").split("\t")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
49 row_dict = dict(zip(header_columns, values))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
50 sequence = " ".join(row_dict[column] for column in sequence_columns)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
51 best_match = row_dict["best_match"]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
52 original_match = best_match
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
53 if best_match.startswith("unmatched"):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
54 best_match = "unmatched"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
55 sequence_statistics[sequence].counts[best_match] += 1
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
56 functionality = row_dict["Functionality"]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
57 sequence_statistics[sequence].table_rows.append(
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
58 SequenceTableRow(row_dict["Sequence.ID"], sequence,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
59 original_match, functionality))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
60 return sequence_statistics
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
61
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
62
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
63 def get_background_color(value: str):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
64 if value in ("TRUE", "T"):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
65 return "#eafaf1"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
66 elif value in ("FALSE", "F"):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
67 return "#f9ebea"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
68 try:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
69 flt = float(value)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
70 except ValueError:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
71 return "white"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
72 if flt > 0:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
73 return "#eaecee"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
74 return "white"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
75
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
76
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
77 def td(val):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
78 return f"<td bgcolor='{get_background_color(val)}'>{val}</td>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
79
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
80
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
81 def tr(val: Iterable[str]):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
82 return f"<tr>{''.join(td(v) for v in val)}</tr>\n"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
83
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
84
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
85 def make_link(link, val):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
86 return f"<a href='{link}'>{val}</a>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
87
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
88
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
89 def tbl(df: Iterable[Iterable[str]]):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
90 return f"<table border='1'>{''.join(tr(v) for v in df)}</table>\n"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
91
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
92
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
93 def to_bool_str(cond):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
94 return "TRUE" if cond else "FALSE"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
95
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
96
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
97 def sequence_overview(before_unique: str,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
98 outdir: str,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
99 empty_region_filter: str):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
100 os.makedirs(outdir, exist_ok=True)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
101 sequence_columns = [
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
102 "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq",
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
103 "FR3.IMGT.seq", "CDR3.IMGT.seq"]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
104 if empty_region_filter == "leader":
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
105 sequence_columns = sequence_columns
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
106 elif empty_region_filter == "FR1":
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
107 sequence_columns = sequence_columns[1:]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
108 elif empty_region_filter == "CDR1":
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
109 sequence_columns = sequence_columns[2:]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
110 elif empty_region_filter == "FR2":
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
111 sequence_columns = sequence_columns[3:]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
112 else:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
113 raise ValueError(f"Unknown region filter: {empty_region_filter}")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
114 main_html_file = os.path.join(outdir, "index.html")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
115 by_id_file = os.path.join(outdir, "by_id.html")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
116 with open(main_html_file, "wt") as main_html, open(by_id_file, "wt") as by_id:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
117 main_html.write("<center><img src='data:image/png;base64,"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
118 "iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQ"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
119 "VQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8X"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
120 "lAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3X"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
121 "n+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgL"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
122 "RLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fL"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
123 "orZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5Vf"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
124 "dATueO1fdLdI0AAAAAElFTkSuQmCC'"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
125 "> Please note that this tab is based on all "
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
126 "sequences before filter unique sequences and the "
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
127 "remove duplicates based on filters are applied. In "
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
128 "this table only sequences occuring more than once "
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
129 "are included. </center>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
130 main_html.write("<table border='1' class='pure-table pure-table-striped'>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
131 main_html.write(f"<caption>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
132 f"{'+'.join(column.split('.')[0] for column in sequence_columns)} sequences "
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
133 f"that show up more than once</caption>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
134 main_html.write("<tr>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
135 main_html.write("<th>Sequence</th><th>Functionality</th><th>IGA1</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
136 "<th>IGA2</th><th>IGG1</th><th>IGG2</th><th>IGG3</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
137 "<th>IGG4</th><th>IGM</th><th>IGE</th><th>UN</th>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
138 main_html.write("<th>total IGA</th><th>total IGG</th><th>total IGM</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
139 "<th>total IGE</th><th>number of subclasses</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
140 "<th>present in both IGA and IGG</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
141 "<th>present in IGA, IGG and IGM</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
142 "<th>present in IGA, IGG and IGE</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
143 "<th>present in IGA, IGG, IGM and IGE</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
144 "<th>IGA1+IGA2</th>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
145 main_html.write("<th>IGG1+IGG2</th><th>IGG1+IGG3</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
146 "<th>IGG1+IGG4</th><th>IGG2+IGG3</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
147 "<th>IGG2+IGG4</th><th>IGG3+IGG4</th>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
148 main_html.write("<th>IGG1+IGG2+IGG3</th><th>IGG2+IGG3+IGG4</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
149 "<th>IGG1+IGG2+IGG4</th><th>IGG1+IGG3+IGG4</th>"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
150 "<th>IGG1+IGG2+IGG3+IGG4</th>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
151 main_html.write("</tr>\n")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
152 sequence_stats = get_sequence_stats(before_unique, sequence_columns)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
153 sorted_sequences = sorted(sequence_stats.keys())
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
154
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
155 single_sequences = 0 # sequence only found once, skipped
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
156 in_multiple = 0 # same sequence across multiple subclasses
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
157 multiple_in_one = 0 # same sequence multiple times in one subclass
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
158 unmatched = 0 # all the sequences are unmatched
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
159 some_unmatched = 0 # one or more sequences in a clone are unmatched
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
160 matched = 0 # should be the same als matched sequences
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
161
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
162 for i, sequence in enumerate(sorted_sequences, start=1):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
163 sequence_stat: SequenceStats = sequence_stats[sequence]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
164 count_dict = sequence_stat.counts
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
165 class_sum = sum(count_dict.values())
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
166 if class_sum == 1:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
167 single_sequences += 1
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
168 continue
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
169 if count_dict["unmatched"] == class_sum:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
170 unmatched += 1
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
171 continue
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
172 in_classes = sum(1 for key, value in count_dict.items()
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
173 if value > 0 and key != "unmatched")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
174 matched += in_classes
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
175 if any(value == class_sum for value in count_dict.values()):
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
176 multiple_in_one += 1
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
177 elif count_dict["unmatched"] > 0:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
178 some_unmatched += 1
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
179 else:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
180 in_multiple += 1
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
181 # Use a dict so we can preserve the order and get all the unique
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
182 # items. With a set the order is not preserved.
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
183 functionality_dict = {row.functionality: None
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
184 for row in sequence_stat.table_rows}
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
185 functionality = ",".join(functionality_dict.keys())
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
186 links: Dict[str, str] = {}
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
187 for key, value in count_dict.items():
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
188 name_key = "un" if key == "unmatched" else key
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
189 html_file = f"{name_key}_{i}.html"
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
190 links[key] = html_file
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
191 if value > 0:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
192 rows = [row for row in sequence_stat.table_rows
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
193 # Startswith to also get unmatched columns
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
194 if row.best_match.startswith(key)]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
195 Path(outdir, html_file).write_text(tbl(rows))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
196 for row in rows:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
197 by_id.write(make_link(html_file, row.sequence_id) + "<br />\n")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
198 iga_count = count_dict["IGA1"] + count_dict["IGA2"]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
199 igg_count = count_dict["IGG1"] + count_dict["IGG2"] + \
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
200 count_dict["IGG3"] + count_dict["IGG4"]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
201
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
202 contained_classes = set(key for key, value
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
203 in count_dict.items() if value > 0)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
204 if iga_count:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
205 contained_classes.add("IGA")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
206 if igg_count:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
207 contained_classes.add("IGG")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
208 main_row = [
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
209 sequence, functionality,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
210 make_link(links["IGA1"], count_dict["IGA1"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
211 make_link(links["IGA2"], count_dict["IGA2"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
212 make_link(links["IGG1"], count_dict["IGG1"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
213 make_link(links["IGG2"], count_dict["IGG2"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
214 make_link(links["IGG3"], count_dict["IGG3"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
215 make_link(links["IGG4"], count_dict["IGG4"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
216 make_link(links["IGM"], count_dict["IGM"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
217 make_link(links["IGE"], count_dict["IGE"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
218 make_link(links["unmatched"], count_dict["unmatched"]),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
219 iga_count,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
220 igg_count,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
221 count_dict["IGM"],
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
222 count_dict["IGE"],
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
223 in_classes,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
224 to_bool_str({"IGA", "IGG"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
225 to_bool_str({"IGA", "IGG", "IGM"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
226 to_bool_str({"IGA", "IGG", "IGE"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
227 to_bool_str({"IGA", "IGG", "IGM", "IGE"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
228 to_bool_str({"IGA1", "IGA2"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
229 to_bool_str({"IGG1", "IGG2"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
230 to_bool_str({"IGG1", "IGG3"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
231 to_bool_str({"IGG1", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
232 to_bool_str({"IGG2", "IGG3"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
233 to_bool_str({"IGG2", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
234 to_bool_str({"IGG3", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
235 to_bool_str({"IGG1", "IGG2", "IGG3"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
236 to_bool_str({"IGG2", "IGG3", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
237 to_bool_str({"IGG1", "IGG2", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
238 to_bool_str({"IGG1", "IGG3", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
239 to_bool_str({"IGG1", "IGG2", "IGG3", "IGG4"}.issubset(contained_classes)),
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
240 ]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
241 main_html.write(tr(main_row))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
242 main_html.write("</table>")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
243
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
244
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
245 def argument_parser() -> argparse.ArgumentParser:
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
246 parser = argparse.ArgumentParser()
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
247 parser.add_argument("--before-unique", help="File with the overview before unique filters")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
248 parser.add_argument("--outdir", help="Output directory")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
249 parser.add_argument("--empty-region-filter")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
250 return parser
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
251
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
252
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
253 def main():
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
254 args = argument_parser().parse_args()
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
255 sequence_overview(args.before_unique,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
256 args.outdir,
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
257 args.empty_region_filter)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
258
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
259
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
260 if __name__ == "__main__":
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
261 main()