Mercurial > repos > iuc > alphagenome_sequence_predictor

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alphagenome_interval_predictor.py	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,281 @@
+#!/usr/bin/env python
+"""
+AlphaGenome Interval Predictor for Galaxy
+
+Predicts regulatory tracks for genomic intervals — no variants, just baseline
+characterization of the chromatin/expression landscape.
+"""
+
+import argparse
+import csv
+import logging
+import os
+import sys
+
+import numpy as np
+from alphagenome.data import genome
+from alphagenome.models import dna_client
+
+__version__ = "0.6.1"
+
+# Command-line output-type names mapped to the dna_client.OutputType member
+# of the same name.
+OUTPUT_TYPE_MAP = {
+    key: getattr(dna_client.OutputType, key)
+    for key in (
+        "RNA_SEQ",
+        "ATAC",
+        "CAGE",
+        "DNASE",
+        "CHIP_HISTONE",
+        "CHIP_TF",
+        "SPLICE_SITES",
+        "PROCAP",
+    )
+}
+
+# Command-line organism names mapped to dna_client.Organism members.
+ORGANISM_MAP = dict(
+    human=dna_client.Organism.HOMO_SAPIENS,
+    mouse=dna_client.Organism.MUS_MUSCULUS,
+)
+
+# Command-line window labels mapped to the length handed to Interval.resize().
+SEQUENCE_LENGTH_MAP = {
+    "16KB": 2 ** 14,    # 16_384
+    "128KB": 2 ** 17,   # 131_072
+    "512KB": 2 ** 19,   # 524_288
+    "1MB": 2 ** 20,     # 1_048_576
+}
+
+
+def create_model(api_key, local_model=False):
+    """Return the model object used for predictions.
+
+    When ``local_model`` is false a client is created with
+    ``dna_client.create(api_key)``. Otherwise ``alphagenome_research`` is
+    imported on demand and the "all_folds" model is built with
+    ``create_from_huggingface``; ``api_key`` is not used on that path.
+    """
+    if not local_model:
+        return dna_client.create(api_key)
+
+    # Imported here so the package is only needed when a local model is requested.
+    from alphagenome_research.model import dna_model
+    return dna_model.create_from_huggingface("all_folds")
+
+
+def parse_bed(bed_path, max_intervals):
+    """Read up to ``max_intervals`` intervals from a tab-separated BED file.
+
+    Blank lines and lines starting with "#", "track" or "browser" are ignored.
+    Lines with fewer than three fields, or whose start/end columns are not
+    integers (for example a column-header row), are skipped with a warning
+    rather than aborting the whole run.
+
+    Args:
+        bed_path: Path of the BED file to read.
+        max_intervals: Maximum number of intervals to return. Reading stops,
+            with a warning, at the first valid interval beyond this cap.
+
+    Returns:
+        A list of ``(chrom, start, end, name)`` tuples. ``name`` is the fourth
+        column when present, otherwise ``"chrom:start-end"``.
+    """
+    intervals = []
+    with open(bed_path) as f:
+        for line_num, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line or line.startswith(("#", "track", "browser")):
+                continue
+            fields = line.split("\t")
+            if len(fields) < 3:
+                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
+                continue
+            chrom = fields[0]
+            try:
+                start = int(fields[1])
+                end = int(fields[2])
+            except ValueError:
+                # One unparsable coordinate must not take down the whole job.
+                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
+                continue
+            name = fields[3] if len(fields) > 3 else f"{chrom}:{start}-{end}"
+            if len(intervals) >= max_intervals:
+                logging.warning("Reached max intervals (%d), skipping remaining", max_intervals)
+                break
+            intervals.append((chrom, start, end, name))
+    return intervals
+
+
+def extract_region_slice(values, interval_start, region_start, region_end):
+    """Crop a prediction array to the original BED region.
+
+    Row offsets are the region coordinates minus ``interval_start``; the lower
+    offset is clamped at 0 and the upper one at ``values.shape[0]`` so a region
+    extending past the resized interval yields only the rows that exist.
+    """
+    first_row = max(region_start - interval_start, 0)
+    last_row = min(region_end - interval_start, values.shape[0])
+    return values[first_row:last_row]
+
+
+def _replay_fixture(fixture_path, output_path):
+    """Write the ``columns``/``rows`` of a JSON fixture as TSV; return the row count."""
+    import json
+    with open(fixture_path) as f:
+        fixture_data = json.load(f)
+    with open(output_path, "w", newline="") as outfile:
+        writer = csv.writer(outfile, delimiter="\t")
+        writer.writerow(fixture_data["columns"])
+        writer.writerows(fixture_data["rows"])
+    return len(fixture_data["rows"])
+
+
+def _track_labels(metadata, track_idx):
+    """Return ``(track_name, ontology_curie)`` for one track, or empty strings."""
+    if metadata is None or len(metadata) <= track_idx:
+        return "", ""
+    row = metadata.iloc[track_idx]
+    # Prefer "track_name" as before; fall back to "name" so the column is not
+    # silently blank when the metadata only carries "name".
+    # NOTE(review): AlphaGenome track metadata appears to use "name" — confirm.
+    track_name = row.get("track_name", row.get("name", ""))
+    return str(track_name), str(row.get("ontology_curie", ""))
+
+
+def run(args):
+    """Predict tracks for every BED interval and write the results as TSV.
+
+    With ``args.test_fixture`` set, the fixture is copied to ``args.output``
+    and the function returns without contacting the model. Otherwise each
+    interval is resized to the requested window, predicted, cropped back to
+    the BED region and written either as one summary row per track
+    (mean and max) or as per-bin means.
+
+    Failures for a single interval are logged and counted; processing
+    continues with the next interval.
+
+    Args:
+        args: Parsed command-line namespace from ``parse_arguments()``.
+
+    Raises:
+        SystemExit: status 1 when no API key is available (and no local model
+            is requested), when the BED file yields no intervals, or when
+            every interval failed.
+    """
+    logging.info("AlphaGenome Interval Predictor v%s", __version__)
+    logging.info("Input: %s", args.input)
+    logging.info("Output types: %s", ", ".join(args.output_types))
+    logging.info("Output mode: %s", args.output_mode)
+    logging.info("Organism: %s", args.organism)
+    logging.info("Sequence length: %s", args.sequence_length)
+    logging.info("Max intervals: %d", args.max_intervals)
+
+    if args.test_fixture:
+        n_rows = _replay_fixture(args.test_fixture, args.output)
+        logging.info("Fixture mode: wrote %d rows to %s", n_rows, args.output)
+        return
+
+    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
+    if not api_key and not args.local_model:
+        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
+        sys.exit(1)
+
+    organism = ORGANISM_MAP[args.organism]
+    seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
+    requested_outputs = [OUTPUT_TYPE_MAP[t] for t in args.output_types]
+    ontology_terms = []
+    if args.ontology_terms:
+        ontology_terms = [t.strip() for t in args.ontology_terms.split(",") if t.strip()]
+
+    intervals = parse_bed(args.input, args.max_intervals)
+    if not intervals:
+        logging.error("No valid intervals found in input BED file")
+        sys.exit(1)
+    logging.info("Loaded %d intervals", len(intervals))
+
+    logging.info("Connecting to AlphaGenome...")
+    model = create_model(api_key, local_model=args.local_model)
+    logging.info("Model ready.")
+
+    stats = {"total": 0, "predicted": 0, "errors": 0}
+    summary_mode = args.output_mode == "summary"
+
+    with open(args.output, "w", newline="") as outfile:
+        writer = csv.writer(outfile, delimiter="\t")
+
+        if summary_mode:
+            writer.writerow([
+                "chrom", "start", "end", "name", "output_type",
+                "track_name", "ontology_curie", "mean_signal", "max_signal",
+            ])
+        else:
+            writer.writerow([
+                "chrom", "bin_start", "bin_end", "region_name", "output_type",
+                "track_name", "ontology_curie", "mean_signal",
+            ])
+
+        for interval_num, (chrom, start, end, name) in enumerate(intervals):
+            stats["total"] += 1
+
+            if interval_num > 0 and interval_num % 10 == 0:
+                logging.info(
+                    "Progress: %d/%d intervals (%d predicted, %d errors)",
+                    interval_num, len(intervals), stats["predicted"], stats["errors"],
+                )
+
+            try:
+                interval = genome.Interval(chrom, start, end).resize(seq_length)
+
+                output = model.predict_interval(
+                    interval, organism=organism,
+                    requested_outputs=requested_outputs,
+                    ontology_terms=ontology_terms,
+                )
+
+                # Genomic coordinate of the first row kept by
+                # extract_region_slice(): when the BED region is wider than the
+                # window, the slice is clamped to the window start, so bins
+                # must be labelled from there rather than from the BED start.
+                region_origin = max(start, interval.start)
+
+                for otype in args.output_types:
+                    track_data = getattr(output, otype.lower(), None)
+                    if track_data is None:
+                        logging.warning("No %s data for %s", otype, name)
+                        continue
+
+                    metadata = track_data.metadata
+                    # NOTE(review): the slice below treats each row of
+                    # ``values`` as one base; confirm this holds for output
+                    # types predicted at a coarser resolution.
+                    region_values = extract_region_slice(
+                        track_data.values, interval.start, start, end,
+                    )
+                    if region_values.ndim == 1:
+                        region_values = region_values.reshape(-1, 1)
+                    region_len = region_values.shape[0]
+                    num_tracks = region_values.shape[1]
+
+                    for track_idx in range(num_tracks):
+                        track_vals = region_values[:, track_idx]
+                        track_name, ontology_curie = _track_labels(metadata, track_idx)
+
+                        if summary_mode:
+                            mean_sig = float(np.mean(track_vals))
+                            max_sig = float(np.max(track_vals))
+                            writer.writerow([
+                                chrom, start, end, name, otype,
+                                track_name, ontology_curie,
+                                f"{mean_sig:.6f}", f"{max_sig:.6f}",
+                            ])
+                            continue
+
+                        # Binned mode: one row per bin; the last bin may be short.
+                        for bin_start_offset in range(0, region_len, args.bin_size):
+                            bin_end_offset = min(bin_start_offset + args.bin_size, region_len)
+                            mean_sig = float(np.mean(track_vals[bin_start_offset:bin_end_offset]))
+                            writer.writerow([
+                                chrom, region_origin + bin_start_offset,
+                                region_origin + bin_end_offset, name, otype,
+                                track_name, ontology_curie,
+                                f"{mean_sig:.6f}",
+                            ])
+
+                stats["predicted"] += 1
+
+            except Exception as e:
+                # Best effort per interval: record the failure and move on.
+                logging.error("Error predicting %s (%s:%d-%d): %s", name, chrom, start, end, e)
+                stats["errors"] += 1
+
+    logging.info("=" * 50)
+    logging.info("DONE — %d total, %d predicted, %d errors",
+                 stats["total"], stats["predicted"], stats["errors"])
+    logging.info("Output: %s", args.output)
+
+    if stats["errors"] > 0 and stats["predicted"] == 0:
+        logging.error("All intervals failed. Check API key and network connectivity.")
+        sys.exit(1)
+
+
+def _positive_int(text):
+    """argparse ``type=`` callable that accepts only integers >= 1."""
+    try:
+        value = int(text)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid integer value: {text!r}") from None
+    if value < 1:
+        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
+    return value
+
+
+def parse_arguments():
+    """Build the command-line parser and parse ``sys.argv``.
+
+    ``--max-intervals`` and ``--bin-size`` must be integers >= 1. A zero bin
+    size would otherwise only surface later as a ``range()`` error on every
+    interval and be reported as a connectivity failure.
+
+    Returns:
+        The ``argparse.Namespace`` holding the parsed options.
+    """
+    parser = argparse.ArgumentParser(
+        description="Predict regulatory tracks for genomic intervals using AlphaGenome",
+    )
+    parser.add_argument("--input", required=True, help="Input BED file")
+    parser.add_argument("--output", required=True, help="Output TSV file")
+    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
+    parser.add_argument(
+        "--organism", choices=["human", "mouse"], default="human",
+    )
+    parser.add_argument(
+        "--output-types", nargs="+", choices=list(OUTPUT_TYPE_MAP),
+        default=["RNA_SEQ"],
+    )
+    parser.add_argument("--ontology-terms", default=None)
+    parser.add_argument(
+        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP), default="1MB",
+    )
+    parser.add_argument("--max-intervals", type=_positive_int, default=50)
+    parser.add_argument(
+        "--output-mode", choices=["summary", "binned"], default="summary",
+    )
+    parser.add_argument("--bin-size", type=_positive_int, default=128)
+    parser.add_argument("--local-model", action="store_true")
+    parser.add_argument("--test-fixture", default=None,
+                        help="Test fixture JSON for CI testing (bypasses API)")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+    return parser.parse_args()
+
+
+def main():
+    """Command-line entry point.
+
+    Parses arguments, sends log records to stderr (DEBUG with ``--verbose``,
+    INFO otherwise) and calls ``run()``. Exits with status 130 on
+    KeyboardInterrupt and 1 on any other exception raised by ``run()``.
+    """
+    args = parse_arguments()
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(sys.stderr)],
+    )
+
+    exit_code = 0
+    try:
+        run(args)
+    except KeyboardInterrupt:
+        logging.error("Interrupted")
+        exit_code = 130
+    except Exception as exc:
+        logging.error("Fatal error: %s", exc)
+        if args.verbose:
+            # Traceback only on request, to keep normal logs short.
+            logging.exception("Details:")
+        exit_code = 1
+
+    if exit_code:
+        sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alphagenome_ism_scanner.py	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,259 @@
+#!/usr/bin/env python
+"""
+AlphaGenome ISM Scanner for Galaxy
+
+In-silico saturation mutagenesis — systematically mutates every position in a
+region to all 3 alt bases and scores each. Uses score_ism_variants() with
+server-side chunking and parallelism.
+"""
+
+import argparse
+import csv
+import logging
+import os
+import sys
+
+import numpy as np
+from alphagenome.data import genome
+from alphagenome.models import dna_client
+from alphagenome.models.variant_scorers import RECOMMENDED_VARIANT_SCORERS
+
+__version__ = "0.6.1"
+
+# Command-line organism names mapped to dna_client.Organism members.
+ORGANISM_MAP = dict(
+    human=dna_client.Organism.HOMO_SAPIENS,
+    mouse=dna_client.Organism.MUS_MUSCULUS,
+)
+
+# Command-line window labels mapped to the length handed to Interval.resize().
+SEQUENCE_LENGTH_MAP = {
+    "16KB": 2 ** 14,    # 16_384
+    "128KB": 2 ** 17,   # 131_072
+    "512KB": 2 ** 19,   # 524_288
+    "1MB": 2 ** 20,     # 1_048_576
+}
+
+
+def create_model(api_key, local_model=False):
+    """Return the model object used for ISM scoring.
+
+    When ``local_model`` is false a client is created with
+    ``dna_client.create(api_key)``. Otherwise ``alphagenome_research`` is
+    imported on demand and the "all_folds" model is built with
+    ``create_from_huggingface``; ``api_key`` is not used on that path.
+    """
+    if not local_model:
+        return dna_client.create(api_key)
+
+    # Imported here so the package is only needed when a local model is requested.
+    from alphagenome_research.model import dna_model
+    return dna_model.create_from_huggingface("all_folds")
+
+
+def parse_bed(bed_path, max_regions, max_region_width):
+    """Read up to ``max_regions`` regions from a tab-separated BED file.
+
+    Blank lines and lines starting with "#", "track" or "browser" are ignored.
+    Lines with fewer than three fields, or whose start/end columns are not
+    integers, are skipped with a warning rather than aborting the run.
+    Regions wider than ``max_region_width`` are trimmed to that many bases
+    around their midpoint (with a warning); the region name is taken before
+    trimming.
+
+    Args:
+        bed_path: Path of the BED file to read.
+        max_regions: Maximum number of regions to return.
+        max_region_width: Widest region, in bases, returned untrimmed.
+
+    Returns:
+        A list of ``(chrom, start, end, name)`` tuples. ``name`` is the fourth
+        column when present, otherwise ``"chrom:start-end"``.
+    """
+    regions = []
+    with open(bed_path) as f:
+        for line_num, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line or line.startswith(("#", "track", "browser")):
+                continue
+            fields = line.split("\t")
+            if len(fields) < 3:
+                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
+                continue
+            chrom = fields[0]
+            try:
+                start = int(fields[1])
+                end = int(fields[2])
+            except ValueError:
+                # One unparsable coordinate must not take down the whole job.
+                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
+                continue
+            name = fields[3] if len(fields) > 3 else f"{chrom}:{start}-{end}"
+            width = end - start
+            if width > max_region_width:
+                logging.warning(
+                    "Region %s is %dbp, exceeding max width %dbp — trimming to center %dbp",
+                    name, width, max_region_width, max_region_width,
+                )
+                center = (start + end) // 2
+                start = center - max_region_width // 2
+                end = start + max_region_width
+            if len(regions) >= max_regions:
+                logging.warning("Reached max regions (%d), skipping remaining", max_regions)
+                break
+            regions.append((chrom, start, end, name))
+    return regions
+
+
+def _replay_fixture(fixture_path, output_path):
+    """Write the ``columns``/``rows`` of a JSON fixture as TSV; return the row count."""
+    import json
+    with open(fixture_path) as f:
+        fixture_data = json.load(f)
+    with open(output_path, "w", newline="") as outfile:
+        writer = csv.writer(outfile, delimiter="\t")
+        writer.writerow(fixture_data["columns"])
+        writer.writerows(fixture_data["rows"])
+    return len(fixture_data["rows"])
+
+
+def run(args):
+    """Run in-silico saturation mutagenesis for every BED region; write TSV.
+
+    With ``args.test_fixture`` set, the fixture is copied to ``args.output``
+    and the function returns without contacting the model. Otherwise each
+    region is scored with ``score_ism_variants`` and one row is written per
+    (variant, scorer, gene, track) whose raw score is not NaN.
+
+    Failures for a single region are logged and counted; processing continues
+    with the next region.
+
+    Args:
+        args: Parsed command-line namespace from ``parse_arguments()``.
+
+    Raises:
+        SystemExit: status 1 when no API key is available (and no local model
+            is requested), when a scorer key is unknown, when the BED file
+            yields no regions, or when every region failed.
+    """
+    logging.info("AlphaGenome ISM Scanner v%s", __version__)
+    logging.info("Input: %s", args.input)
+    logging.info("Scorers: %s", ", ".join(args.scorers))
+    logging.info("Organism: %s", args.organism)
+    logging.info("Sequence length: %s", args.sequence_length)
+    logging.info("Max regions: %d, max region width: %dbp", args.max_regions, args.max_region_width)
+
+    if args.test_fixture:
+        n_rows = _replay_fixture(args.test_fixture, args.output)
+        logging.info("Fixture mode: wrote %d rows to %s", n_rows, args.output)
+        return
+
+    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
+    if not api_key and not args.local_model:
+        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
+        sys.exit(1)
+
+    organism = ORGANISM_MAP[args.organism]
+    seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
+
+    available_keys = list(RECOMMENDED_VARIANT_SCORERS.keys())
+    for key in args.scorers:
+        if key not in RECOMMENDED_VARIANT_SCORERS:
+            logging.error("Unknown scorer key: %s (available: %s)", key, ", ".join(available_keys))
+            sys.exit(1)
+    selected_scorers = [RECOMMENDED_VARIANT_SCORERS[k] for k in args.scorers]
+
+    regions = parse_bed(args.input, args.max_regions, args.max_region_width)
+    if not regions:
+        logging.error("No valid regions found in input BED file")
+        sys.exit(1)
+    logging.info("Loaded %d regions", len(regions))
+
+    logging.info("Connecting to AlphaGenome...")
+    model = create_model(api_key, local_model=args.local_model)
+    logging.info("Model ready.")
+
+    stats = {"regions": 0, "scored": 0, "errors": 0}
+    row_count = 0
+
+    with open(args.output, "w", newline="") as outfile:
+        writer = csv.writer(outfile, delimiter="\t")
+        writer.writerow([
+            "region", "position", "ref_base", "alt_base",
+            "gene_id", "gene_name", "gene_type",
+            "scorer", "track_name", "ontology_curie",
+            "raw_score", "quantile_score",
+        ])
+
+        for region_num, (chrom, start, end, name) in enumerate(regions):
+            stats["regions"] += 1
+            width = end - start
+            logging.info("Region %d/%d: %s (%s:%d-%d, %dbp, %d mutations)",
+                         region_num + 1, len(regions), name, chrom, start, end, width, width * 3)
+
+            try:
+                interval = genome.Interval(chrom, start, end).resize(seq_length)
+                ism_interval = genome.Interval(chrom, start, end, strand="+")
+
+                results = model.score_ism_variants(
+                    interval, ism_interval,
+                    variant_scorers=selected_scorers,
+                    organism=organism,
+                    max_workers=args.max_workers,
+                )
+
+                # results is list[list[AnnData]] — outer=variants (3*width), inner=scorers
+                # Each AnnData has: uns['variant'] with position/ref/alt,
+                # X for raw scores, layers['quantiles'], obs for genes, var for tracks
+                for var_results in results:
+                    for scorer_idx, ad in enumerate(var_results):
+                        variant_obj = ad.uns["variant"]
+                        pos = variant_obj.position
+                        ref_base = variant_obj.reference_bases
+                        alt_base = variant_obj.alternate_bases
+                        scorer_name = args.scorers[scorer_idx] if scorer_idx < len(args.scorers) else f"scorer_{scorer_idx}"
+
+                        raw_scores = ad.X  # shape (n_genes, n_tracks)
+                        quantile_scores = ad.layers.get("quantiles", None)
+
+                        # Track labels do not depend on the gene, so look them
+                        # up once per AnnData instead of once per gene.
+                        track_labels = []
+                        for track_idx in range(ad.n_vars):
+                            track_row = ad.var.iloc[track_idx]
+                            track_labels.append((
+                                str(track_row.get("name", "")),
+                                str(track_row.get("ontology_curie", "")),
+                            ))
+
+                        for gene_idx in range(ad.n_obs):
+                            gene_row = ad.obs.iloc[gene_idx]
+                            gene_id = str(gene_row.get("gene_id", ""))
+                            gene_name = str(gene_row.get("gene_name", ""))
+                            gene_type = str(gene_row.get("gene_type", ""))
+
+                            for track_idx, (track_name, ontology_curie) in enumerate(track_labels):
+                                raw = float(raw_scores[gene_idx, track_idx])
+                                if np.isnan(raw):
+                                    continue
+                                # Quantile column stays blank when absent or NaN.
+                                quant = ""
+                                if quantile_scores is not None:
+                                    q = float(quantile_scores[gene_idx, track_idx])
+                                    if not np.isnan(q):
+                                        quant = f"{q:.6f}"
+
+                                writer.writerow([
+                                    name, pos, ref_base, alt_base,
+                                    gene_id, gene_name, gene_type,
+                                    scorer_name, track_name, ontology_curie,
+                                    f"{raw:.6f}", quant,
+                                ])
+                                row_count += 1
+
+                stats["scored"] += 1
+                logging.info("Region %s: %d ISM variants scored", name, len(results))
+
+            except Exception as e:
+                # Best effort per region: record the failure and move on.
+                logging.error("Error scanning region %s (%s:%d-%d): %s", name, chrom, start, end, e)
+                stats["errors"] += 1
+
+    logging.info("Wrote %d rows to %s", row_count, args.output)
+
+    logging.info("=" * 50)
+    logging.info("DONE — %d regions, %d scored, %d errors",
+                 stats["regions"], stats["scored"], stats["errors"])
+
+    if stats["errors"] > 0 and stats["scored"] == 0:
+        logging.error("All regions failed. Check API key and network connectivity.")
+        sys.exit(1)
+
+
+def _positive_int(text):
+    """argparse ``type=`` callable that accepts only integers >= 1."""
+    try:
+        value = int(text)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid integer value: {text!r}") from None
+    if value < 1:
+        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
+    return value
+
+
+def parse_arguments():
+    """Build the command-line parser and parse ``sys.argv``.
+
+    ``--max-regions``, ``--max-region-width`` and ``--max-workers`` must be
+    integers >= 1, so nonsensical limits are rejected at the command line
+    instead of producing empty or failing scans later.
+
+    Returns:
+        The ``argparse.Namespace`` holding the parsed options.
+    """
+    parser = argparse.ArgumentParser(
+        description="In-silico saturation mutagenesis using AlphaGenome score_ism_variants()",
+    )
+    parser.add_argument("--input", required=True, help="Input BED file")
+    parser.add_argument("--output", required=True, help="Output TSV file")
+    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
+    parser.add_argument(
+        "--organism", choices=["human", "mouse"], default="human",
+    )
+    parser.add_argument(
+        "--scorers", nargs="+", default=["RNA_SEQ", "ATAC"],
+        help="Scorer keys from RECOMMENDED_VARIANT_SCORERS",
+    )
+    parser.add_argument(
+        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP), default="1MB",
+    )
+    parser.add_argument("--max-regions", type=_positive_int, default=10)
+    parser.add_argument("--max-region-width", type=_positive_int, default=200)
+    parser.add_argument("--max-workers", type=_positive_int, default=5)
+    parser.add_argument("--local-model", action="store_true")
+    parser.add_argument("--test-fixture", default=None,
+                        help="Test fixture JSON for CI testing (bypasses API)")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+    return parser.parse_args()
+
+
+def main():
+    """Command-line entry point.
+
+    Parses arguments, sends log records to stderr (DEBUG with ``--verbose``,
+    INFO otherwise) and calls ``run()``. Exits with status 130 on
+    KeyboardInterrupt and 1 on any other exception raised by ``run()``.
+    """
+    args = parse_arguments()
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(sys.stderr)],
+    )
+
+    exit_code = 0
+    try:
+        run(args)
+    except KeyboardInterrupt:
+        logging.error("Interrupted")
+        exit_code = 130
+    except Exception as exc:
+        logging.error("Fatal error: %s", exc)
+        if args.verbose:
+            # Traceback only on request, to keep normal logs short.
+            logging.exception("Details:")
+        exit_code = 1
+
+    if exit_code:
+        sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alphagenome_sequence_predictor.py	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,297 @@
+#!/usr/bin/env python
+"""
+AlphaGenome Sequence Predictor for Galaxy
+
+Predicts regulatory tracks from raw DNA sequence — no genomic coordinates needed.
+For synthetic biology (designed sequences) and non-reference assemblies.
+"""
+
+import argparse
+import csv
+import logging
+import os
+import sys
+
+import numpy as np
+from alphagenome.models import dna_client
+
+__version__ = "0.6.1"
+
+# Command-line output-type names mapped to the dna_client.OutputType member
+# of the same name.
+OUTPUT_TYPE_MAP = {
+    key: getattr(dna_client.OutputType, key)
+    for key in (
+        "RNA_SEQ",
+        "ATAC",
+        "CAGE",
+        "DNASE",
+        "CHIP_HISTONE",
+        "CHIP_TF",
+        "SPLICE_SITES",
+        "PROCAP",
+    )
+}
+
+# Command-line organism names mapped to dna_client.Organism members.
+ORGANISM_MAP = dict(
+    human=dna_client.Organism.HOMO_SAPIENS,
+    mouse=dna_client.Organism.MUS_MUSCULUS,
+)
+
+# Command-line window labels mapped to the length sequences are padded or
+# trimmed to before prediction.
+SEQUENCE_LENGTH_MAP = {
+    "16KB": 2 ** 14,    # 16_384
+    "128KB": 2 ** 17,   # 131_072
+    "512KB": 2 ** 19,   # 524_288
+    "1MB": 2 ** 20,     # 1_048_576
+}
+
+
+def create_model(api_key, local_model=False):
+    """Return the model object used for sequence predictions.
+
+    When ``local_model`` is false a client is created with
+    ``dna_client.create(api_key)``. Otherwise ``alphagenome_research`` is
+    imported on demand and the "all_folds" model is built with
+    ``create_from_huggingface``; ``api_key`` is not used on that path.
+    """
+    if not local_model:
+        return dna_client.create(api_key)
+
+    # Imported here so the package is only needed when a local model is requested.
+    from alphagenome_research.model import dna_model
+    return dna_model.create_from_huggingface("all_folds")
+
+
+def parse_fasta(fasta_path, max_sequences):
+    """Read up to ``max_sequences`` records from a FASTA file.
+
+    The record ID is the first whitespace-delimited token after ">". A header
+    with no ID (a bare ">") gets the generated ID ``seq_N``, where N is the
+    1-based record number, instead of raising IndexError. Blank lines are
+    ignored and sequence lines of a record are concatenated. Sequence lines
+    appearing before the first header are discarded.
+
+    Args:
+        fasta_path: Path of the FASTA file to read.
+        max_sequences: Maximum number of records to return. Reading stops,
+            with a warning, when a further header is found beyond this cap.
+
+    Returns:
+        A list of ``(sequence_id, sequence)`` tuples in file order.
+    """
+    sequences = []
+    current_id = None
+    current_seq = []
+
+    with open(fasta_path) as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith(">"):
+                if current_id is not None:
+                    sequences.append((current_id, "".join(current_seq)))
+                    if len(sequences) >= max_sequences:
+                        logging.warning("Reached max sequences (%d), skipping remaining", max_sequences)
+                        # Cleared so the final flush below does not add a record.
+                        current_id = None
+                        break
+                header_tokens = line[1:].split()
+                current_id = header_tokens[0] if header_tokens else f"seq_{len(sequences) + 1}"
+                current_seq = []
+            else:
+                current_seq.append(line)
+    if current_id is not None:
+        sequences.append((current_id, "".join(current_seq)))
+
+    return sequences
+
+
+def prepare_sequence(seq, target_length):
+    """Upper-case ``seq`` and fit it to exactly ``target_length`` characters.
+
+    Shorter input is padded with "N" on both sides (the extra base, if any,
+    goes on the right); longer input is trimmed around its centre.
+
+    Returns:
+        ``(prepared, content_start, content_end)``, where the two offsets
+        delimit the input within ``prepared``. For trimmed or exact-length
+        input they are ``0`` and ``target_length``.
+    """
+    bases = seq.upper()
+    surplus = len(bases) - target_length
+
+    if surplus == 0:
+        return bases, 0, len(bases)
+
+    if surplus > 0:
+        cut_from = surplus // 2
+        return bases[cut_from:cut_from + target_length], 0, target_length
+
+    missing = -surplus
+    left_pad = missing // 2
+    right_pad = missing - left_pad
+    padded = "N" * left_pad + bases + "N" * right_pad
+    return padded, left_pad, left_pad + len(bases)
+
+
+def _replay_fixture(fixture_path, output_path):
+    """Write the ``columns``/``rows`` of a JSON fixture as TSV; return the row count."""
+    import json
+    with open(fixture_path) as f:
+        fixture_data = json.load(f)
+    with open(output_path, "w", newline="") as outfile:
+        writer = csv.writer(outfile, delimiter="\t")
+        writer.writerow(fixture_data["columns"])
+        writer.writerows(fixture_data["rows"])
+    return len(fixture_data["rows"])
+
+
+def _track_labels(metadata, track_idx):
+    """Return ``(track_name, ontology_curie)`` for one track, or empty strings."""
+    if metadata is None or len(metadata) <= track_idx:
+        return "", ""
+    row = metadata.iloc[track_idx]
+    # Prefer "track_name" as before; fall back to "name" so the column is not
+    # silently blank when the metadata only carries "name".
+    # NOTE(review): AlphaGenome track metadata appears to use "name" — confirm.
+    track_name = row.get("track_name", row.get("name", ""))
+    return str(track_name), str(row.get("ontology_curie", ""))
+
+
+def run(args):
+    """Predict tracks for every FASTA record and write the results as TSV.
+
+    With ``args.test_fixture`` set, the fixture is copied to ``args.output``
+    and the function returns without contacting the model. Otherwise each
+    sequence is padded or trimmed to the requested window, predicted, cropped
+    back to the submitted content and written either as one summary row per
+    track (mean and max) or as per-bin means. Bin coordinates are positions in
+    the submitted sequence, including for sequences that were center-trimmed.
+
+    Failures for a single sequence are logged and counted; processing
+    continues with the next sequence.
+
+    Args:
+        args: Parsed command-line namespace from ``parse_arguments()``.
+
+    Raises:
+        SystemExit: status 1 when no API key is available (and no local model
+            is requested), when the FASTA file yields no sequences, or when
+            every sequence failed.
+    """
+    logging.info("AlphaGenome Sequence Predictor v%s", __version__)
+    logging.info("Input: %s", args.input)
+    logging.info("Output types: %s", ", ".join(args.output_types))
+    logging.info("Output mode: %s", args.output_mode)
+    logging.info("Organism: %s", args.organism)
+    logging.info("Sequence length: %s", args.sequence_length)
+    logging.info("Max sequences: %d", args.max_sequences)
+
+    if args.test_fixture:
+        n_rows = _replay_fixture(args.test_fixture, args.output)
+        logging.info("Fixture mode: wrote %d rows to %s", n_rows, args.output)
+        return
+
+    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
+    if not api_key and not args.local_model:
+        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
+        sys.exit(1)
+
+    organism = ORGANISM_MAP[args.organism]
+    target_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
+    requested_outputs = [OUTPUT_TYPE_MAP[t] for t in args.output_types]
+    ontology_terms = []
+    if args.ontology_terms:
+        ontology_terms = [t.strip() for t in args.ontology_terms.split(",") if t.strip()]
+
+    sequences = parse_fasta(args.input, args.max_sequences)
+    if not sequences:
+        logging.error("No valid sequences found in input FASTA file")
+        sys.exit(1)
+    logging.info("Loaded %d sequences", len(sequences))
+
+    logging.info("Connecting to AlphaGenome...")
+    model = create_model(api_key, local_model=args.local_model)
+    logging.info("Model ready.")
+
+    stats = {"total": 0, "predicted": 0, "errors": 0}
+    summary_mode = args.output_mode == "summary"
+
+    with open(args.output, "w", newline="") as outfile:
+        writer = csv.writer(outfile, delimiter="\t")
+
+        if summary_mode:
+            writer.writerow([
+                "sequence_id", "sequence_length", "output_type",
+                "track_name", "ontology_curie", "mean_signal", "max_signal",
+            ])
+        else:
+            writer.writerow([
+                "sequence_id", "bin_start", "bin_end", "output_type",
+                "track_name", "ontology_curie", "mean_signal",
+            ])
+
+        for seq_num, (seq_id, raw_seq) in enumerate(sequences):
+            stats["total"] += 1
+            orig_length = len(raw_seq)
+
+            if seq_num > 0 and seq_num % 5 == 0:
+                logging.info(
+                    "Progress: %d/%d sequences (%d predicted, %d errors)",
+                    seq_num, len(sequences), stats["predicted"], stats["errors"],
+                )
+
+            logging.info("Sequence %d/%d: %s (%dbp)", seq_num + 1, len(sequences), seq_id, orig_length)
+
+            try:
+                prepared_seq, content_start, content_end = prepare_sequence(raw_seq, target_length)
+
+                # prepare_sequence() reports a trimmed window as starting at 0.
+                # Recompute how many leading bases were cut so binned rows are
+                # labelled with positions in the submitted sequence.
+                trim_offset = max(0, (orig_length - target_length) // 2)
+
+                if orig_length < target_length:
+                    logging.info("  N-padded %dbp -> %dbp", orig_length, target_length)
+                elif orig_length > target_length:
+                    logging.info("  Center-trimmed %dbp -> %dbp", orig_length, target_length)
+
+                output = model.predict_sequence(
+                    prepared_seq, organism=organism,
+                    requested_outputs=requested_outputs,
+                    ontology_terms=ontology_terms,
+                )
+
+                for otype in args.output_types:
+                    track_data = getattr(output, otype.lower(), None)
+                    if track_data is None:
+                        logging.warning("No %s data for %s", otype, seq_id)
+                        continue
+
+                    metadata = track_data.metadata
+
+                    # Slice to the actual content region (non-N portion).
+                    # NOTE(review): this treats each row of ``values`` as one
+                    # base; confirm this holds for output types predicted at a
+                    # coarser resolution.
+                    content_values = track_data.values[content_start:content_end]
+                    if content_values.ndim == 1:
+                        content_values = content_values.reshape(-1, 1)
+                    content_len = content_values.shape[0]
+                    num_tracks = content_values.shape[1]
+
+                    for track_idx in range(num_tracks):
+                        track_vals = content_values[:, track_idx]
+                        track_name, ontology_curie = _track_labels(metadata, track_idx)
+
+                        if summary_mode:
+                            mean_sig = float(np.mean(track_vals))
+                            max_sig = float(np.max(track_vals))
+                            writer.writerow([
+                                seq_id, orig_length, otype,
+                                track_name, ontology_curie,
+                                f"{mean_sig:.6f}", f"{max_sig:.6f}",
+                            ])
+                            continue
+
+                        # Binned mode: one row per bin; the last bin may be short.
+                        for bin_start in range(0, content_len, args.bin_size):
+                            bin_end = min(bin_start + args.bin_size, content_len)
+                            mean_sig = float(np.mean(track_vals[bin_start:bin_end]))
+                            writer.writerow([
+                                seq_id, trim_offset + bin_start,
+                                trim_offset + bin_end, otype,
+                                track_name, ontology_curie,
+                                f"{mean_sig:.6f}",
+                            ])
+
+                stats["predicted"] += 1
+
+            except Exception as e:
+                # Best effort per sequence: record the failure and move on.
+                logging.error("Error predicting %s: %s", seq_id, e)
+                stats["errors"] += 1
+
+    logging.info("=" * 50)
+    logging.info("DONE — %d total, %d predicted, %d errors",
+                 stats["total"], stats["predicted"], stats["errors"])
+    logging.info("Output: %s", args.output)
+
+    if stats["errors"] > 0 and stats["predicted"] == 0:
+        logging.error("All sequences failed. Check API key and network connectivity.")
+        sys.exit(1)
+
+
+def _positive_int(text):
+    """argparse ``type=`` callable that accepts only integers >= 1."""
+    try:
+        value = int(text)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid integer value: {text!r}") from None
+    if value < 1:
+        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
+    return value
+
+
+def parse_arguments():
+    """Build the command-line parser and parse ``sys.argv``.
+
+    ``--max-sequences`` and ``--bin-size`` must be integers >= 1. A zero bin
+    size would otherwise only surface later as a ``range()`` error on every
+    sequence and be reported as a connectivity failure.
+
+    Returns:
+        The ``argparse.Namespace`` holding the parsed options.
+    """
+    parser = argparse.ArgumentParser(
+        description="Predict regulatory tracks from DNA sequence using AlphaGenome",
+    )
+    parser.add_argument("--input", required=True, help="Input FASTA file")
+    parser.add_argument("--output", required=True, help="Output TSV file")
+    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
+    parser.add_argument(
+        "--organism", choices=["human", "mouse"], default="human",
+    )
+    parser.add_argument(
+        "--output-types", nargs="+", choices=list(OUTPUT_TYPE_MAP),
+        default=["RNA_SEQ"],
+    )
+    parser.add_argument("--ontology-terms", default=None)
+    parser.add_argument(
+        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP), default="16KB",
+    )
+    parser.add_argument("--max-sequences", type=_positive_int, default=20)
+    parser.add_argument(
+        "--output-mode", choices=["summary", "binned"], default="summary",
+    )
+    parser.add_argument("--bin-size", type=_positive_int, default=128)
+    parser.add_argument("--local-model", action="store_true")
+    parser.add_argument("--test-fixture", default=None,
+                        help="Test fixture JSON for CI testing (bypasses API)")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+    return parser.parse_args()
+
+
+def main():
+    """Command-line entry point.
+
+    Parses arguments, sends log records to stderr (DEBUG with ``--verbose``,
+    INFO otherwise) and calls ``run()``. Exits with status 130 on
+    KeyboardInterrupt and 1 on any other exception raised by ``run()``.
+    """
+    args = parse_arguments()
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(sys.stderr)],
+    )
+
+    exit_code = 0
+    try:
+        run(args)
+    except KeyboardInterrupt:
+        logging.error("Interrupted")
+        exit_code = 130
+    except Exception as exc:
+        logging.error("Fatal error: %s", exc)
+        if args.verbose:
+            # Traceback only on request, to keep normal logs short.
+            logging.exception("Details:")
+        exit_code = 1
+
+    if exit_code:
+        sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alphagenome_sequence_predictor.xml	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,144 @@
+<tool id="alphagenome_sequence_predictor" name="AlphaGenome Sequence Predictor" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Predict regulatory tracks from DNA sequence</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <version_command>echo @TOOL_VERSION@</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/alphagenome_sequence_predictor.py'
+            --input '$input_fasta'
+            --output '$output_tsv'
+            --organism '$organism'
+            --output-types
+            #for $otype in $output_types
+                '$otype'
+            #end for
+            --sequence-length '$sequence_length'
+            --max-sequences $max_sequences
+            --output-mode '$output_mode.mode'
+            #if str($output_mode.mode) == "binned"
+                --bin-size $output_mode.bin_size
+            #end if
+            @CMD_ONTOLOGY_TERMS@
+            @CMD_TEST_FIXTURE@
+    ]]></command>
+    <inputs>
+        <param name="input_fasta" type="data" format="fasta" label="Input FASTA file" help="DNA sequences to predict regulatory tracks for"/>
+        <expand macro="organism_param"/>
+        <expand macro="output_types_param"/>
+        <expand macro="ontology_terms_param"/>
+        <param name="sequence_length" type="select" label="Prediction window size" help="Window size in bases. Sequences shorter than the window are N-padded; longer sequences are center-trimmed.">
+            <option value="16KB" selected="true">16 kb (recommended for synbio)</option>
+            <option value="128KB">128 kb</option>
+            <option value="512KB">512 kb</option>
+            <option value="1MB">1 Mb</option>
+        </param>
+        <param name="max_sequences" type="integer" value="20" min="1" max="1000" label="Maximum sequences to process" help="API is rate-limited; start small to verify results"/>
+        <conditional name="output_mode">
+            <param name="mode" type="select" label="Output mode">
+                <option value="summary" selected="true">Summary (one row per sequence x track)</option>
+                <option value="binned">Binned (spatial resolution within sequences)</option>
+            </param>
+            <when value="summary"/>
+            <when value="binned">
+                <param name="bin_size" type="integer" value="128" min="1" max="4096" label="Bin size (bp)" help="Divide each sequence into bins of this width for spatial resolution"/>
+            </when>
+        </conditional>
+        <expand macro="test_fixture_param"/>
+    </inputs>
+    <outputs>
+        <data name="output_tsv" format="tabular"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input_fasta" value="test_sequences.fa"/>
+            <param name="organism" value="human"/>
+            <param name="output_types" value="RNA_SEQ"/>
+            <param name="sequence_length" value="16KB"/>
+            <param name="max_sequences" value="2"/>
+            <conditional name="output_mode">
+                <param name="mode" value="summary"/>
+            </conditional>
+            <param name="test_fixture" value="test-data/fixture_sequence_predictor.json"/>
+            <output name="output_tsv">
+                <assert_contents>
+                    <has_text text="sequence_id"/>
+                    <has_text text="mean_signal"/>
+                    <has_text text="max_signal"/>
+                    <has_text text="test_promoter_1"/>
+                    <has_text text="RNA_SEQ"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**AlphaGenome Sequence Predictor**
+
+Predicts regulatory tracks from raw DNA sequence using AlphaGenome's
+``predict_sequence()`` API. No genomic coordinates needed — designed for
+synthetic biology sequences, designed regulatory elements, and
+non-reference assemblies.
+
+-----
+
+**What it does**
+
+For each sequence in the input FASTA file:
+
+1. Pads short sequences with N bases (centered) or center-trims long sequences
+   to fit the prediction window size
+2. Calls ``predict_sequence()`` for selected output types
+3. Computes summary statistics over the actual sequence content (excluding N padding)
+4. Reports per-track values in tabular format
+
+-----
+
+**Sequence handling**
+
+- Sequences **shorter** than the window size are N-padded (centered). The model
+  still makes predictions, and statistics are computed only over the real content.
+- Sequences **longer** than the window are center-trimmed to fit.
+- Sequences **equal** to the window size are used as-is.
+
+For typical synbio applications (promoters, enhancers, etc.), the 16 kb window
+is recommended as sequences are usually short.
+
+-----
+
+**Output modes**
+
+- **Summary**: One row per sequence x track with mean and max signal.
+- **Binned**: Divides each sequence into fixed-width bins (default 128 bp) and
+  reports mean signal per bin for spatial resolution.
+
+-----
+
+**Output columns — summary mode**
+
+- ``sequence_id`` — FASTA header ID
+- ``sequence_length`` — original sequence length before padding/trimming
+- ``output_type`` — RNA_SEQ, ATAC, etc.
+- ``track_name`` — specific experimental track
+- ``ontology_curie`` — tissue/cell type ontology term
+- ``mean_signal`` — mean predicted signal
+- ``max_signal`` — maximum predicted signal
+
+**Output columns — binned mode**
+
+- ``sequence_id`` — FASTA header ID
+- ``bin_start``, ``bin_end`` — bin coordinates within the sequence
+- ``output_type``, ``track_name``, ``ontology_curie`` — as above
+- ``mean_signal`` — mean predicted signal in the bin
+
+-----
+
+**Use cases**
+
+- Evaluate designed promoter/enhancer sequences
+- Predict splicing of synthetic gene constructs
+- Characterize regulatory elements from non-reference genomes
+- Compare predicted activity of sequence variants in synthetic biology
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alphagenome_variant_effect.py	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,302 @@
+#!/usr/bin/env python
+"""
+AlphaGenome Variant Effect Predictor for Galaxy
+
+POC tool that scores genetic variants using the real AlphaGenome API.
+Uses predict_variant() to compute log-fold-change effect scores per output type.
+"""
+
+import argparse
+import logging
+import os
+import sys
+
+import cyvcf2
+import numpy as np
+from alphagenome.data import genome
+from alphagenome.models import dna_client
+
+__version__ = "0.6.1"
+
+# Command-line output-type names -> dna_client.OutputType members requested
+# from the model.
+OUTPUT_TYPE_MAP = {
+    "RNA_SEQ": dna_client.OutputType.RNA_SEQ,
+    "ATAC": dna_client.OutputType.ATAC,
+    "CAGE": dna_client.OutputType.CAGE,
+    "DNASE": dna_client.OutputType.DNASE,
+    "CHIP_HISTONE": dna_client.OutputType.CHIP_HISTONE,
+    "CHIP_TF": dna_client.OutputType.CHIP_TF,
+    "SPLICE_SITES": dna_client.OutputType.SPLICE_SITES,
+    "SPLICE_SITE_USAGE": dna_client.OutputType.SPLICE_SITE_USAGE,
+    "SPLICE_JUNCTIONS": dna_client.OutputType.SPLICE_JUNCTIONS,
+    "CONTACT_MAPS": dna_client.OutputType.CONTACT_MAPS,
+    "PROCAP": dna_client.OutputType.PROCAP,
+}
+
+# Command-line output-type names -> INFO field IDs written to the output VCF.
+# Keys mirror OUTPUT_TYPE_MAP; run() indexes this map with every requested type.
+INFO_FIELD_MAP = {
+    "RNA_SEQ": "AG_RNA_LFC",
+    "ATAC": "AG_ATAC_LFC",
+    "CAGE": "AG_CAGE_LFC",
+    "DNASE": "AG_DNASE_LFC",
+    "CHIP_HISTONE": "AG_HISTONE_LFC",
+    "CHIP_TF": "AG_TF_LFC",
+    "SPLICE_SITES": "AG_SPLICE_LFC",
+    "SPLICE_SITE_USAGE": "AG_SPLICEUSE_LFC",
+    "SPLICE_JUNCTIONS": "AG_SPLICEJNC_LFC",
+    "CONTACT_MAPS": "AG_CONTACT_LFC",
+    "PROCAP": "AG_PROCAP_LFC",
+}
+
+# --organism choices -> dna_client.Organism members.
+ORGANISM_MAP = {
+    "human": dna_client.Organism.HOMO_SAPIENS,
+    "mouse": dna_client.Organism.MUS_MUSCULUS,
+}
+
+# --sequence-length labels -> prediction window size passed to Interval.resize().
+SEQUENCE_LENGTH_MAP = {
+    "16KB": 16_384,
+    "128KB": 131_072,
+    "512KB": 524_288,
+    "1MB": 1_048_576,
+}
+
+
+def create_model(api_key, local_model=False):
+    if local_model:
+        from alphagenome_research.model import dna_model
+        return dna_model.create_from_huggingface("all_folds")
+    return dna_client.create(api_key)
+
+
+def compute_max_abs_lfc(ref_values, alt_values):
+    ref_vals = np.asarray(ref_values, dtype=np.float64)
+    alt_vals = np.asarray(alt_values, dtype=np.float64)
+    epsilon = 1e-6
+    lfc = np.log2((alt_vals + epsilon) / (ref_vals + epsilon))
+    return float(np.max(np.abs(lfc)))
+
+
+def get_track_values(prediction_output, output_type_name):
+    """Attribute name is lowercase OutputType enum (e.g. RNA_SEQ -> output.rna_seq)."""
+    attr_name = output_type_name.lower()
+    track = getattr(prediction_output, attr_name, None)
+    if track is None:
+        return None
+    return track.values
+
+
+def run(args):
+    """Annotate the input VCF with AlphaGenome effect scores and write it out.
+
+    For each record whose index is below ``args.max_variants`` the first ALT
+    allele is scored, either from the fixture file (``args.test_fixture``) or
+    through ``model.predict_variant``. Each requested output type gets its own
+    INFO field (see INFO_FIELD_MAP) and the maximum across types is stored as
+    AG_MAX_EFFECT. Records beyond the limit, and records without an ALT
+    allele, are written through without scores.
+
+    Exits with status 1 when a non-fixture, non-local run has no API key, or
+    when at least one error occurred and no variant was scored.
+    """
+    logging.info("AlphaGenome Variant Effect Predictor v%s", __version__)
+    logging.info("Input: %s", args.input)
+    logging.info("Output types: %s", ", ".join(args.output_types))
+    logging.info("Organism: %s", args.organism)
+    logging.info("Sequence length: %s", args.sequence_length)
+    logging.info("Max variants: %d", args.max_variants)
+
+    # Fixture mode for CI testing (bypasses API)
+    fixture_lookup = None
+    if args.test_fixture:
+        import json
+        with open(args.test_fixture) as f:
+            fixture_data = json.load(f)
+        # Index the pre-computed scores by (chrom, pos, ref, alt).
+        fixture_lookup = {}
+        for v in fixture_data["variants"]:
+            key = (v["chrom"], v["pos"], v["ref"], v["alt"])
+            fixture_lookup[key] = v["scores"]
+        logging.info("Fixture mode: %d pre-computed variants", len(fixture_lookup))
+
+    # Real run: organism, seq_length, requested_outputs, ontology_terms and
+    # model are only defined on this path and only used in the non-fixture
+    # branch of the loop below.
+    if fixture_lookup is None:
+        api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
+        if not api_key and not args.local_model:
+            logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
+            sys.exit(1)
+
+        organism = ORGANISM_MAP[args.organism]
+        seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
+        requested_outputs = [OUTPUT_TYPE_MAP[t] for t in args.output_types]
+        ontology_terms = []
+        if args.ontology_terms:
+            ontology_terms = [t.strip() for t in args.ontology_terms.split(",") if t.strip()]
+
+        logging.info("Connecting to AlphaGenome...")
+        model = create_model(api_key, local_model=args.local_model)
+        logging.info("Model ready.")
+
+    vcf_reader = cyvcf2.VCF(args.input)
+
+    # Declare the new INFO fields on the reader's header before the writer is
+    # created from it below.
+    # NOTE(review): the fields are declared Number=A, but only one value (for
+    # the first ALT allele) is written per record - confirm this is acceptable
+    # for multi-allelic sites.
+    for otype in args.output_types:
+        info_id = INFO_FIELD_MAP[otype]
+        vcf_reader.add_info_to_header({
+            "ID": info_id,
+            "Number": "A",
+            "Type": "Float",
+            "Description": f"AlphaGenome {otype} max absolute log-fold-change",
+        })
+    vcf_reader.add_info_to_header({
+        "ID": "AG_MAX_EFFECT",
+        "Number": "A",
+        "Type": "Float",
+        "Description": "AlphaGenome max effect across all selected output types",
+    })
+    vcf_reader.add_to_header(f"##AlphaGenomeVariantEffectVersion={__version__}")
+
+    vcf_writer = cyvcf2.Writer(args.output, vcf_reader)
+
+    stats = {"total": 0, "scored": 0, "errors": 0, "skipped": 0}
+
+    try:
+        for variant_num, record in enumerate(vcf_reader):
+            stats["total"] += 1
+
+            # Past the limit: pass the record through unscored, but keep
+            # iterating so the totals cover the whole file.
+            if variant_num >= args.max_variants:
+                stats["skipped"] += 1
+                vcf_writer.write_record(record)
+                continue
+
+            if variant_num > 0 and variant_num % 10 == 0:
+                logging.info(
+                    "Progress: %d/%d variants processed (%d scored, %d errors)",
+                    variant_num, args.max_variants, stats["scored"], stats["errors"],
+                )
+
+            chrom = record.CHROM
+            pos = record.POS  # 1-based, matches API expectation
+            ref = record.REF
+
+            # Process first ALT allele only for POC
+            # Records without any ALT are written through and counted only in
+            # "total" (neither scored nor error).
+            if not record.ALT:
+                vcf_writer.write_record(record)
+                continue
+
+            alt = record.ALT[0]
+
+            try:
+                all_scores = []
+                if fixture_lookup is not None:
+                    # Variants absent from the fixture yield an empty dict and
+                    # therefore no scores.
+                    fixture_scores = fixture_lookup.get((chrom, pos, ref, alt), {})
+                    for otype in args.output_types:
+                        if otype in fixture_scores:
+                            score = fixture_scores[otype]
+                            info_id = INFO_FIELD_MAP[otype]
+                            record.INFO[info_id] = round(score, 6)
+                            all_scores.append(score)
+                else:
+                    variant = genome.Variant(
+                        chromosome=chrom,
+                        position=pos,
+                        reference_bases=ref,
+                        alternate_bases=alt,
+                    )
+                    # Prediction window: the variant's reference interval
+                    # resized to the selected sequence length.
+                    interval = variant.reference_interval.resize(seq_length)
+
+                    outputs = model.predict_variant(
+                        interval=interval,
+                        variant=variant,
+                        organism=organism,
+                        ontology_terms=ontology_terms,
+                        requested_outputs=requested_outputs,
+                    )
+
+                    for otype in args.output_types:
+                        ref_vals = get_track_values(outputs.reference, otype)
+                        alt_vals = get_track_values(outputs.alternate, otype)
+                        if ref_vals is not None and alt_vals is not None:
+                            score = compute_max_abs_lfc(ref_vals, alt_vals)
+                            info_id = INFO_FIELD_MAP[otype]
+                            record.INFO[info_id] = round(score, 6)
+                            all_scores.append(score)
+                        else:
+                            logging.warning(
+                                "No %s track in output for %s:%d %s>%s",
+                                otype, chrom, pos, ref, alt,
+                            )
+
+                # A variant with no score for any requested type counts as an
+                # error.
+                if all_scores:
+                    record.INFO["AG_MAX_EFFECT"] = round(max(all_scores), 6)
+                    stats["scored"] += 1
+                else:
+                    stats["errors"] += 1
+
+            except Exception as e:
+                # Per-variant failures are logged and counted; the record is
+                # still written below.
+                logging.error("Error scoring %s:%d %s>%s: %s", chrom, pos, ref, alt, e)
+                stats["errors"] += 1
+
+            vcf_writer.write_record(record)
+
+    finally:
+        vcf_writer.close()
+        vcf_reader.close()
+
+    logging.info("=" * 50)
+    logging.info("DONE — %d total, %d scored, %d errors, %d skipped (over limit)",
+                 stats["total"], stats["scored"], stats["errors"], stats["skipped"])
+    logging.info("Output: %s", args.output)
+
+    # Fail the job only when nothing at all could be scored.
+    if stats["errors"] > 0 and stats["scored"] == 0:
+        logging.error("All variants failed. Check API key and network connectivity.")
+        sys.exit(1)
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Score genetic variants using AlphaGenome predict_variant()",
+    )
+    parser.add_argument("--input", required=True, help="Input VCF file")
+    parser.add_argument("--output", required=True, help="Output VCF file")
+    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
+    parser.add_argument(
+        "--organism", choices=["human", "mouse"], default="human",
+        help="Organism (default: human)",
+    )
+    parser.add_argument(
+        "--output-types", nargs="+", choices=list(OUTPUT_TYPE_MAP.keys()),
+        default=["RNA_SEQ"],
+        help="AlphaGenome output types to predict (default: RNA_SEQ)",
+    )
+    parser.add_argument(
+        "--ontology-terms", default=None,
+        help="Comma-separated ontology terms (e.g. UBERON:0001157,CL:0000746)",
+    )
+    parser.add_argument(
+        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="1MB",
+        help="Prediction window size (default: 1MB)",
+    )
+    parser.add_argument(
+        "--max-variants", type=int, default=100,
+        help="Maximum variants to process (default: 100)",
+    )
+    parser.add_argument(
+        "--local-model", action="store_true",
+        help="Use local HuggingFace model instead of API",
+    )
+    parser.add_argument("--test-fixture", default=None,
+                        help="Test fixture JSON for CI testing (bypasses API)")
+    parser.add_argument("--verbose", action="store_true", help="Debug logging")
+    parser.add_argument(
+        "--version", action="version", version=f"%(prog)s {__version__}",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_arguments()
+
+    level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(sys.stderr)],
+    )
+
+    try:
+        run(args)
+    except KeyboardInterrupt:
+        logging.error("Interrupted")
+        sys.exit(130)
+    except Exception as e:
+        logging.error("Fatal error: %s", e)
+        if args.verbose:
+            logging.exception("Details:")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alphagenome_variant_scorer.py	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+"""
+AlphaGenome Variant Scorer for Galaxy
+
+Uses score_variant() for server-side gene-level aggregation with spatial masking
+and empirical quantile normalization. Outputs structured per-gene, per-track scores
+via tidy_scores().
+"""
+
+import argparse
+import logging
+import os
+import sys
+
+import cyvcf2
+from alphagenome.data import genome
+from alphagenome.models import dna_client
+from alphagenome.models.variant_scorers import RECOMMENDED_VARIANT_SCORERS, tidy_scores
+
+__version__ = "0.6.1"
+
+# --organism choices -> dna_client.Organism members.
+ORGANISM_MAP = {
+    "human": dna_client.Organism.HOMO_SAPIENS,
+    "mouse": dna_client.Organism.MUS_MUSCULUS,
+}
+
+# --sequence-length labels -> prediction window size passed to Interval.resize().
+SEQUENCE_LENGTH_MAP = {
+    "16KB": 16_384,
+    "128KB": 131_072,
+    "512KB": 524_288,
+    "1MB": 1_048_576,
+}
+
+
+def create_model(api_key, local_model=False):
+    if local_model:
+        from alphagenome_research.model import dna_model
+        return dna_model.create_from_huggingface("all_folds")
+    return dna_client.create(api_key)
+
+
+def run(args):
+    """Score VCF variants with ``model.score_variant`` and write a TSV.
+
+    In fixture mode (``args.test_fixture``) the fixture's rows are written
+    straight to ``args.output`` and no model is created. Otherwise the first
+    ALT allele of each record whose index is below ``args.max_variants`` is
+    scored with the selected RECOMMENDED_VARIANT_SCORERS entries, and the
+    per-variant ``tidy_scores`` tables are concatenated into ``args.output``.
+
+    Exits with status 1 when no API key is available for a remote run, when
+    an unknown scorer key is requested, or when at least one error occurred
+    and no variant was scored.
+    """
+    logging.info("AlphaGenome Variant Scorer v%s", __version__)
+    logging.info("Input: %s", args.input)
+    logging.info("Scorers: %s", ", ".join(args.scorers))
+    logging.info("Organism: %s", args.organism)
+    logging.info("Sequence length: %s", args.sequence_length)
+    logging.info("Max variants: %d", args.max_variants)
+
+    # Fixture mode: replay the stored table and return without reading the VCF.
+    if args.test_fixture:
+        import json
+        import pandas as pd
+        with open(args.test_fixture) as f:
+            fixture_data = json.load(f)
+        df = pd.DataFrame(fixture_data["rows"], columns=fixture_data["columns"])
+        df.to_csv(args.output, sep="\t", index=False)
+        logging.info("Fixture mode: wrote %d rows to %s", len(df), args.output)
+        return
+
+    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
+    if not api_key and not args.local_model:
+        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
+        sys.exit(1)
+
+    organism = ORGANISM_MAP[args.organism]
+    seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
+
+    # --scorers has no argparse choices, so the keys are validated here.
+    available_keys = list(RECOMMENDED_VARIANT_SCORERS.keys())
+    selected_keys = args.scorers
+    for key in selected_keys:
+        if key not in RECOMMENDED_VARIANT_SCORERS:
+            logging.error("Unknown scorer key: %s (available: %s)", key, ", ".join(available_keys))
+            sys.exit(1)
+    selected_scorers = [RECOMMENDED_VARIANT_SCORERS[k] for k in selected_keys]
+    logging.info("Using %d scorers", len(selected_scorers))
+
+    logging.info("Connecting to AlphaGenome...")
+    model = create_model(api_key, local_model=args.local_model)
+    logging.info("Model ready.")
+
+    vcf_reader = cyvcf2.VCF(args.input)
+
+    stats = {"total": 0, "scored": 0, "errors": 0, "skipped": 0}
+    all_rows = []  # one tidy_scores() table per successfully scored variant
+
+    try:
+        for variant_num, record in enumerate(vcf_reader):
+            stats["total"] += 1
+
+            # Past the limit: keep iterating only to count the remaining records.
+            if variant_num >= args.max_variants:
+                stats["skipped"] += 1
+                continue
+
+            if variant_num > 0 and variant_num % 10 == 0:
+                logging.info(
+                    "Progress: %d/%d variants processed (%d scored, %d errors)",
+                    variant_num, args.max_variants, stats["scored"], stats["errors"],
+                )
+
+            chrom = record.CHROM
+            pos = record.POS
+            ref = record.REF
+
+            # Records without an ALT allele are counted only in "total".
+            if not record.ALT:
+                continue
+
+            # Only the first ALT allele is scored.
+            alt = record.ALT[0]
+            variant_id = f"{chrom}:{pos}:{ref}>{alt}"
+
+            try:
+                variant = genome.Variant(
+                    chromosome=chrom,
+                    position=pos,
+                    reference_bases=ref,
+                    alternate_bases=alt,
+                )
+                # Scoring window: the variant's reference interval resized to
+                # the selected sequence length.
+                interval = variant.reference_interval.resize(seq_length)
+
+                scores = model.score_variant(
+                    interval, variant, selected_scorers, organism=organism,
+                )
+
+                df = tidy_scores(scores)
+                all_rows.append(df)
+                stats["scored"] += 1
+
+                logging.debug("Scored %s: %d rows", variant_id, len(df))
+
+            except Exception as e:
+                # Per-variant failures are logged and counted; the loop goes on.
+                logging.error("Error scoring %s: %s", variant_id, e)
+                stats["errors"] += 1
+
+    finally:
+        vcf_reader.close()
+
+    if all_rows:
+        import pandas as pd
+        combined = pd.concat(all_rows, ignore_index=True)
+        combined.to_csv(args.output, sep="\t", index=False)
+        logging.info("Wrote %d rows to %s", len(combined), args.output)
+    else:
+        # Nothing was scored: write a header-only file with a single
+        # "variant_id" column (presumably so the output file always exists).
+        with open(args.output, "w") as f:
+            f.write("variant_id\n")
+        logging.warning("No variants scored successfully")
+
+    logging.info("=" * 50)
+    logging.info("DONE — %d total, %d scored, %d errors, %d skipped (over limit)",
+                 stats["total"], stats["scored"], stats["errors"], stats["skipped"])
+
+    # Fail the job only when nothing at all could be scored.
+    if stats["errors"] > 0 and stats["scored"] == 0:
+        logging.error("All variants failed. Check API key and network connectivity.")
+        sys.exit(1)
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Score variants using AlphaGenome score_variant() with gene-level aggregation",
+    )
+    parser.add_argument("--input", required=True, help="Input VCF file")
+    parser.add_argument("--output", required=True, help="Output TSV file")
+    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
+    parser.add_argument(
+        "--organism", choices=["human", "mouse"], default="human",
+    )
+    parser.add_argument(
+        "--scorers", nargs="+", default=["RNA_SEQ", "ATAC", "SPLICE_SITES"],
+        help="Scorer keys from RECOMMENDED_VARIANT_SCORERS",
+    )
+    parser.add_argument(
+        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="1MB",
+    )
+    parser.add_argument(
+        "--max-variants", type=int, default=100,
+    )
+    parser.add_argument("--local-model", action="store_true")
+    parser.add_argument("--test-fixture", default=None,
+                        help="Test fixture JSON for CI testing (bypasses API)")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+    return parser.parse_args()
+
+
+def main():
+    args = parse_arguments()
+
+    level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(sys.stderr)],
+    )
+
+    try:
+        run(args)
+    except KeyboardInterrupt:
+        logging.error("Interrupted")
+        sys.exit(130)
+    except Exception as e:
+        logging.error("Fatal error: %s", e)
+        if args.verbose:
+            logging.exception("Details:")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/generate_test_fixtures.sh	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,205 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Regenerate fixture JSON files for all 5 AlphaGenome Galaxy tools.
+# Runs each tool against the real API with the same params as the Galaxy <test>
+# sections, then converts the output to the fixture format.
+#
+# Usage:
+#   export ALPHAGENOME_API_KEY=<your-key>
+#   bash tools/alphagenome/generate_test_fixtures.sh
+
+if [[ -z "${ALPHAGENOME_API_KEY:-}" ]]; then
+    echo "ERROR: ALPHAGENOME_API_KEY is not set" >&2
+    exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+TMPDIR=$(mktemp -d)
+trap 'rm -rf "$TMPDIR"' EXIT
+
+PASS=0
+FAIL=0
+MAX_FIXTURE_ROWS=30
+
+# Run one tool script with python3, sending its stderr to "$TMPDIR/<name>.log".
+#   $1      short tool name, used for the banner and the log file name
+#   $2...   script path followed by its command-line options
+# On success prints the last 3 log lines and increments PASS; on failure
+# prints the last 10 log lines to stderr, increments FAIL and returns 1.
+# NOTE(review): with `set -e` in effect, the non-zero return appears to abort
+# the script at the call site, so the final summary would not be reached
+# after a failure - confirm this is intended.
+run_tool() {
+    local name="$1"
+    shift
+    echo "--- Running $name ---"
+    if python3 "$@" 2>"$TMPDIR/${name}.log"; then
+        tail -3 "$TMPDIR/${name}.log"
+        echo "    OK"
+        PASS=$((PASS + 1))
+    else
+        tail -10 "$TMPDIR/${name}.log" >&2
+        echo "    FAILED (see log above)" >&2
+        FAIL=$((FAIL + 1))
+        return 1
+    fi
+}
+
+# ── 1. variant_effect (VCF output → fixture JSON) ──────────────────────
+
+run_tool "variant_effect" \
+    alphagenome_variant_effect.py \
+    --input  test-data/test_input.vcf \
+    --output "$TMPDIR/variant_effect.vcf" \
+    --api-key "$ALPHAGENOME_API_KEY" \
+    --output-types RNA_SEQ \
+    --sequence-length 128KB \
+    --max-variants 3 \
+    --verbose
+
+python3 -c '
+import json, sys
+
+# Reverse the INFO_FIELD_MAP from the tool script
+FIELD_TO_TYPE = {
+    "AG_RNA_LFC": "RNA_SEQ",
+    "AG_ATAC_LFC": "ATAC",
+    "AG_CAGE_LFC": "CAGE",
+    "AG_DNASE_LFC": "DNASE",
+    "AG_HISTONE_LFC": "CHIP_HISTONE",
+    "AG_TF_LFC": "CHIP_TF",
+    "AG_SPLICE_LFC": "SPLICE_SITES",
+    "AG_SPLICEUSE_LFC": "SPLICE_SITE_USAGE",
+    "AG_SPLICEJNC_LFC": "SPLICE_JUNCTIONS",
+    "AG_CONTACT_LFC": "CONTACT_MAPS",
+    "AG_PROCAP_LFC": "PROCAP",
+}
+
+variants = []
+for line in open(sys.argv[1]):
+    if line.startswith("#"):
+        continue
+    fields = line.strip().split("\t")
+    chrom, pos, ref, alt = fields[0], int(fields[1]), fields[3], fields[4]
+    info = fields[7]
+    scores = {}
+    for kv in info.split(";"):
+        if "=" not in kv:
+            continue
+        key, val = kv.split("=", 1)
+        if key in FIELD_TO_TYPE:
+            scores[FIELD_TO_TYPE[key]] = round(float(val), 6)
+    variants.append({"chrom": chrom, "pos": pos, "ref": ref, "alt": alt, "scores": scores})
+
+with open(sys.argv[2], "w") as f:
+    json.dump({"variants": variants}, f, indent=2)
+    f.write("\n")
+print(f"  -> wrote {len(variants)} variants to {sys.argv[2]}")
+' "$TMPDIR/variant_effect.vcf" \
+  "test-data/fixture_variant_effect.json"
+
+# ── Helper: TSV → fixture JSON ──────────────────────────────────────────
+
+# Convert a tab-separated file into fixture JSON.
+#   $1  input TSV (first line is the header)
+#   $2  output JSON path, written as {"columns": [...], "rows": [...]}
+# At most MAX_FIXTURE_ROWS data rows are kept; later rows are only counted for
+# the status message. Each cell is stored as an int if it parses as one, else
+# as a float if it parses as one, else as the original string.
+tsv_to_fixture() {
+    local tsv_file="$1"
+    local json_file="$2"
+    python3 -c '
+import csv, json, sys
+
+max_rows = int(sys.argv[3])
+
+with open(sys.argv[1]) as f:
+    reader = csv.reader(f, delimiter="\t")
+    columns = next(reader)
+    rows = []
+    total = 0
+    for row in reader:
+        total += 1
+        if len(rows) >= max_rows:
+            continue
+        typed = []
+        for val in row:
+            try:
+                typed.append(int(val))
+            except ValueError:
+                try:
+                    typed.append(float(val))
+                except ValueError:
+                    typed.append(val)
+        rows.append(typed)
+
+with open(sys.argv[2], "w") as f:
+    json.dump({"columns": columns, "rows": rows}, f, indent=2)
+    f.write("\n")
+if total > max_rows:
+    print(f"  -> wrote {len(columns)} columns, {len(rows)}/{total} rows (capped) to {sys.argv[2]}")
+else:
+    print(f"  -> wrote {len(columns)} columns, {len(rows)} rows to {sys.argv[2]}")
+' "$tsv_file" "$json_file" "$MAX_FIXTURE_ROWS"
+}
+
+# ── 2. variant_scorer (TSV output) ─────────────────────────────────────
+
+run_tool "variant_scorer" \
+    alphagenome_variant_scorer.py \
+    --input  test-data/test_input.vcf \
+    --output "$TMPDIR/variant_scorer.tsv" \
+    --api-key "$ALPHAGENOME_API_KEY" \
+    --scorers RNA_SEQ \
+    --sequence-length 16KB \
+    --max-variants 3 \
+    --verbose
+
+tsv_to_fixture "$TMPDIR/variant_scorer.tsv" \
+    "test-data/fixture_variant_scorer.json"
+
+# ── 3. ism_scanner (TSV output) ────────────────────────────────────────
+
+run_tool "ism_scanner" \
+    alphagenome_ism_scanner.py \
+    --input  test-data/test_regions.bed \
+    --output "$TMPDIR/ism_scanner.tsv" \
+    --api-key "$ALPHAGENOME_API_KEY" \
+    --scorers RNA_SEQ \
+    --sequence-length 16KB \
+    --max-regions 1 \
+    --max-region-width 10 \
+    --verbose
+
+tsv_to_fixture "$TMPDIR/ism_scanner.tsv" \
+    "test-data/fixture_ism_scanner.json"
+
+# ── 4. interval_predictor (TSV output) ─────────────────────────────────
+
+run_tool "interval_predictor" \
+    alphagenome_interval_predictor.py \
+    --input  test-data/test_intervals.bed \
+    --output "$TMPDIR/interval_predictor.tsv" \
+    --api-key "$ALPHAGENOME_API_KEY" \
+    --output-types RNA_SEQ \
+    --sequence-length 16KB \
+    --max-intervals 3 \
+    --output-mode summary \
+    --verbose
+
+tsv_to_fixture "$TMPDIR/interval_predictor.tsv" \
+    "test-data/fixture_interval_predictor.json"
+
+# ── 5. sequence_predictor (TSV output) ─────────────────────────────────
+
+run_tool "sequence_predictor" \
+    alphagenome_sequence_predictor.py \
+    --input  test-data/test_sequences.fa \
+    --output "$TMPDIR/sequence_predictor.tsv" \
+    --api-key "$ALPHAGENOME_API_KEY" \
+    --output-types RNA_SEQ \
+    --sequence-length 16KB \
+    --max-sequences 2 \
+    --output-mode summary \
+    --verbose
+
+tsv_to_fixture "$TMPDIR/sequence_predictor.tsv" \
+    "test-data/fixture_sequence_predictor.json"
+
+# ── Summary ─────────────────────────────────────────────────────────────
+
+echo ""
+echo "=== Done: $PASS passed, $FAIL failed ==="
+if [[ $FAIL -gt 0 ]]; then
+    exit 1
+fi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,64 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.6.1</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">25.0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">alphagenome</requirement>
+            <requirement type="package" version="0.31.4">cyvcf2</requirement>
+            <credentials name="alphagenome" version="1.0" label="AlphaGenome API" description="API key from Google DeepMind AlphaGenome">
+                <secret name="api_key" inject_as_env="ALPHAGENOME_API_KEY" label="API Key" description="Your AlphaGenome API key"/>
+            </credentials>
+        </requirements>
+    </xml>
+    <xml name="organism_param">
+        <param name="organism" type="select" label="Organism">
+            <option value="human" selected="true">Human (hg38)</option>
+            <option value="mouse">Mouse (mm10)</option>
+        </param>
+    </xml>
+    <xml name="sequence_length_param">
+        <param name="sequence_length" type="select" label="Prediction window size" help="Genomic context window in bases around the target region (e.g. 1 Mb = 1,048,576 bases, not bytes)">
+            <option value="16KB">16 kb</option>
+            <option value="128KB">128 kb</option>
+            <option value="512KB">512 kb</option>
+            <option value="1MB" selected="true">1 Mb</option>
+        </param>
+    </xml>
+    <xml name="output_types_param">
+        <param name="output_types" type="select" multiple="true" label="Output types to predict">
+            <option value="RNA_SEQ" selected="true">RNA-seq</option>
+            <option value="ATAC">ATAC-seq</option>
+            <option value="CAGE">CAGE</option>
+            <option value="DNASE">DNase</option>
+            <option value="CHIP_HISTONE">ChIP-seq histone</option>
+            <option value="CHIP_TF">ChIP-seq TF</option>
+            <option value="SPLICE_SITES">Splice sites</option>
+            <option value="PROCAP">PRO-cap</option>
+            <validator type="no_options" message="Select at least one output type"/>
+        </param>
+    </xml>
+    <xml name="ontology_terms_param">
+        <param name="ontology_terms" type="text" value="" label="Ontology terms (optional)" help="Comma-separated UBERON/CL terms for tissue context, e.g. UBERON:0002107,CL:0000746">
+            <validator type="regex" message="Only alphanumeric characters, colons, commas, and spaces are allowed">^[A-Za-z0-9:, ]*$</validator><!-- anchored with ^ and $: Galaxy evaluates regex validators with re.match, so the unanchored pattern accepted every value; the empty string still passes, keeping the field optional -->
+        </param>
+    </xml>
+    <xml name="test_fixture_param"><!-- hidden input, empty by default; when a value is set, the CMD_TEST_FIXTURE command token passes it to the script as a path under the tool directory -->
+        <param name="test_fixture" type="hidden" value=""/>
+    </xml>
+    <token name="@CMD_ONTOLOGY_TERMS@"><![CDATA[
+            #if str($ontology_terms).strip()
+                --ontology-terms '$ontology_terms'
+            #end if
+    ]]></token>
+    <token name="@CMD_TEST_FIXTURE@"><![CDATA[
+            #if $test_fixture
+                --test-fixture '$__tool_directory__/$test_fixture'
+            #end if
+    ]]></token>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1038/s41586-025-10014-0</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fixture_interval_predictor.json	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,345 @@
+{
+  "columns": [
+    "chrom",
+    "start",
+    "end",
+    "name",
+    "output_type",
+    "track_name",
+    "ontology_curie",
+    "mean_signal",
+    "max_signal"
+  ],
+  "rows": [
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000047",
+      1.4e-05,
+      5.9e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000062",
+      4.2e-05,
+      0.0001
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000084",
+      2.4e-05,
+      0.000137
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000084",
+      1.5e-05,
+      4.4e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000115",
+      0.000209,
+      0.001122
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000127",
+      6e-06,
+      1.8e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000134",
+      8e-06,
+      3.6e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000137",
+      2e-05,
+      8.4e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000138",
+      1.6e-05,
+      7.2e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000169",
+      0.000116,
+      0.000366
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000182",
+      3e-05,
+      9e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000187",
+      9.1e-05,
+      0.000439
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000192",
+      0.000107,
+      0.000298
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000221",
+      2.7e-05,
+      0.000164
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000222",
+      4.3e-05,
+      0.000254
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000223",
+      2.6e-05,
+      0.000135
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000223",
+      2.8e-05,
+      6.6e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000236",
+      1.1e-05,
+      4.1e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000236",
+      5e-06,
+      1.7e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000307",
+      4.2e-05,
+      8.2e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000312",
+      1e-05,
+      2.2e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000346",
+      8.1e-05,
+      0.000198
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000351",
+      0.000123,
+      0.000305
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000515",
+      2.1e-05,
+      9e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000515",
+      8.4e-05,
+      0.000626
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000594",
+      8e-05,
+      0.00022
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000623",
+      2.6e-05,
+      0.000101
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000623",
+      3.4e-05,
+      9.8e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000624",
+      9e-06,
+      3.4e-05
+    ],
+    [
+      "chr22",
+      36201500,
+      36202500,
+      "test_interval_1",
+      "RNA_SEQ",
+      "",
+      "CL:0000625",
+      9e-06,
+      2.9e-05
+    ]
+  ]
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fixture_ism_scanner.json	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,438 @@
+{
+  "columns": [
+    "region",
+    "position",
+    "ref_base",
+    "alt_base",
+    "gene_id",
+    "gene_name",
+    "gene_type",
+    "scorer",
+    "track_name",
+    "ontology_curie",
+    "raw_score",
+    "quantile_score"
+  ],
+  "rows": [
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000047 polyA plus RNA-seq",
+      "CL:0000047",
+      -1.721732,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000062 total RNA-seq",
+      "CL:0000062",
+      -2.872142,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000084 polyA plus RNA-seq",
+      "CL:0000084",
+      -0.43112,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000084 total RNA-seq",
+      "CL:0000084",
+      -0.323944,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000115 total RNA-seq",
+      "CL:0000115",
+      -1.891316,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000127 total RNA-seq",
+      "CL:0000127",
+      -2.695157,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000134 polyA plus RNA-seq",
+      "CL:0000134",
+      -2.826576,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000137 total RNA-seq",
+      "CL:0000137",
+      -2.560256,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000138 total RNA-seq",
+      "CL:0000138",
+      -2.42312,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000169 total RNA-seq",
+      "CL:0000169",
+      -0.795844,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000182 total RNA-seq",
+      "CL:0000182",
+      -1.609686,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000187 total RNA-seq",
+      "CL:0000187",
+      -3.523364,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000192 total RNA-seq",
+      "CL:0000192",
+      -1.707838,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000221 polyA plus RNA-seq",
+      "CL:0000221",
+      -1.927729,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000222 polyA plus RNA-seq",
+      "CL:0000222",
+      -2.618365,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000223 polyA plus RNA-seq",
+      "CL:0000223",
+      -2.313126,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000223 total RNA-seq",
+      "CL:0000223",
+      -1.154866,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000236 polyA plus RNA-seq",
+      "CL:0000236",
+      -0.299119,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000236 total RNA-seq",
+      "CL:0000236",
+      -0.133313,
+      -0.999941
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000307 total RNA-seq",
+      "CL:0000307",
+      -1.233429,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000312 polyA plus RNA-seq",
+      "CL:0000312",
+      -0.757559,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000346 total RNA-seq",
+      "CL:0000346",
+      -2.658391,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000351 polyA plus RNA-seq",
+      "CL:0000351",
+      -1.029845,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000515 polyA plus RNA-seq",
+      "CL:0000515",
+      -3.428352,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000515 total RNA-seq",
+      "CL:0000515",
+      -3.587678,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000594 total RNA-seq",
+      "CL:0000594",
+      -3.365285,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000623 polyA plus RNA-seq",
+      "CL:0000623",
+      -0.693357,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000623 total RNA-seq",
+      "CL:0000623",
+      -0.57456,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000624 total RNA-seq",
+      "CL:0000624",
+      -0.244827,
+      -0.99998
+    ],
+    [
+      "test_region_1",
+      36201696,
+      "T",
+      "A",
+      "ENSG00000100336.18",
+      "APOL4",
+      "protein_coding",
+      "RNA_SEQ",
+      "CL:0000625 total RNA-seq",
+      "CL:0000625",
+      -0.203565,
+      -0.999972
+    ]
+  ]
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fixture_sequence_predictor.json	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,283 @@
+{
+  "columns": [
+    "sequence_id",
+    "sequence_length",
+    "output_type",
+    "track_name",
+    "ontology_curie",
+    "mean_signal",
+    "max_signal"
+  ],
+  "rows": [
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000047",
+      6.3e-05,
+      8.2e-05
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000062",
+      9.1e-05,
+      0.000117
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000084",
+      0.000152,
+      0.000211
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000084",
+      0.000308,
+      0.000469
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000115",
+      0.000518,
+      0.000832
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000127",
+      8.1e-05,
+      0.000119
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000134",
+      8.5e-05,
+      0.000133
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000137",
+      0.000436,
+      0.000717
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000138",
+      0.000336,
+      0.000553
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000169",
+      0.000233,
+      0.000366
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000182",
+      8.6e-05,
+      0.000111
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000187",
+      9.1e-05,
+      0.000121
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000192",
+      0.00032,
+      0.000454
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000221",
+      0.001081,
+      0.001839
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000222",
+      0.001735,
+      0.00351
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000223",
+      0.00145,
+      0.002502
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000223",
+      0.00069,
+      0.001114
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000236",
+      0.000311,
+      0.000443
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000236",
+      0.000246,
+      0.000418
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000307",
+      0.000283,
+      0.000401
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000312",
+      0.000164,
+      0.000211
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000346",
+      9.6e-05,
+      0.00013
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000351",
+      8.7e-05,
+      0.000128
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000515",
+      9.9e-05,
+      0.000143
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000515",
+      0.000128,
+      0.000172
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000594",
+      0.000181,
+      0.000259
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000623",
+      0.000139,
+      0.00018
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000623",
+      0.000363,
+      0.000565
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000624",
+      0.000282,
+      0.000452
+    ],
+    [
+      "test_promoter_1",
+      180,
+      "RNA_SEQ",
+      "",
+      "CL:0000625",
+      0.00025,
+      0.000406
+    ]
+  ]
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fixture_variant_effect.json	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,31 @@
+{
+  "variants": [
+    {
+      "chrom": "chr22",
+      "pos": 36201698,
+      "ref": "A",
+      "alt": "C",
+      "scores": {
+        "RNA_SEQ": 9.74931
+      }
+    },
+    {
+      "chrom": "chr22",
+      "pos": 36201750,
+      "ref": "G",
+      "alt": "T",
+      "scores": {
+        "RNA_SEQ": 0.794512
+      }
+    },
+    {
+      "chrom": "chr22",
+      "pos": 36202000,
+      "ref": "C",
+      "alt": "A",
+      "scores": {
+        "RNA_SEQ": 0.960974
+      }
+    }
+  ]
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fixture_variant_scorer.json	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,779 @@
+{
+  "columns": [
+    "variant_id",
+    "scored_interval",
+    "gene_id",
+    "gene_name",
+    "gene_type",
+    "gene_strand",
+    "junction_Start",
+    "junction_End",
+    "output_type",
+    "variant_scorer",
+    "track_name",
+    "track_strand",
+    "Assay title",
+    "ontology_curie",
+    "biosample_name",
+    "biosample_type",
+    "biosample_life_stage",
+    "gtex_tissue",
+    "data_source",
+    "endedness",
+    "genetically_modified",
+    "raw_score",
+    "quantile_score"
+  ],
+  "rows": [
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000047 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000047",
+      "neuronal stem cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -1.534143,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000062 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000062",
+      "osteoblast",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -2.4925048,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000084 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000084",
+      "T-cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -0.48444033,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000084 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000084",
+      "T-cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "single",
+      "False",
+      -0.3864045,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000115 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000115",
+      "endothelial cell",
+      "in_vitro_differentiated_cells",
+      "adult",
+      "",
+      "encode",
+      "single",
+      "False",
+      -1.7566066,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000127 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000127",
+      "astrocyte",
+      "primary_cell",
+      "unknown",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -2.2548208,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000134 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000134",
+      "mesenchymal stem cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -2.487202,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000137 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000137",
+      "osteocyte",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "single",
+      "False",
+      -2.1787338,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000138 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000138",
+      "chondrocyte",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "single",
+      "False",
+      -2.0341547,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000169 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000169",
+      "type B pancreatic cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "single",
+      "False",
+      -0.5709431,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000182 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000182",
+      "hepatocyte",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -1.3045998,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000187 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000187",
+      "myocyte",
+      "in_vitro_differentiated_cells",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -3.1106043,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000192 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000192",
+      "smooth muscle cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -1.362958,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000221 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000221",
+      "ectodermal cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -1.7337704,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000222 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000222",
+      "mesodermal cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -2.3969855,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000223 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000223",
+      "endodermal cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -2.0788126,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000223 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000223",
+      "endodermal cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "single",
+      "False",
+      -1.0922494,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000236 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000236",
+      "B cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -0.3181591,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000236 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000236",
+      "B cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "single",
+      "False",
+      -0.17071676,
+      -0.9999627
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000307 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000307",
+      "tracheal epithelial cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -1.0373387,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000312 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000312",
+      "keratinocyte",
+      "primary_cell",
+      "unknown",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -0.59641504,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000346 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000346",
+      "hair follicle dermal papilla cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -2.2779784,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000351 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000351",
+      "trophoblast cell",
+      "in_vitro_differentiated_cells",
+      "embryonic",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -0.8832159,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000515 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000515",
+      "skeletal muscle myoblast",
+      "primary_cell",
+      "unknown",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -3.0128946,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000515 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000515",
+      "skeletal muscle myoblast",
+      "primary_cell",
+      "unknown",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -3.1980705,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000594 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000594",
+      "skeletal muscle satellite cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -3.0219214,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000623 polyA plus RNA-seq",
+      "-",
+      "polyA plus RNA-seq",
+      "CL:0000623",
+      "natural killer cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "paired",
+      "False",
+      -0.789505,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000623 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000623",
+      "natural killer cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "single",
+      "False",
+      -0.672935,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000624 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000624",
+      "CD4-positive, alpha-beta T cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "single",
+      "False",
+      -0.309525,
+      -0.99998
+    ],
+    [
+      "chr22:36201698:A>C",
+      "chr22:36193506-36209890:.",
+      "ENSG00000100336",
+      "APOL4",
+      "protein_coding",
+      "-",
+      "",
+      "",
+      "RNA_SEQ",
+      "GeneMaskLFCScorer(requested_output=RNA_SEQ)",
+      "CL:0000625 total RNA-seq",
+      "-",
+      "total RNA-seq",
+      "CL:0000625",
+      "CD8-positive, alpha-beta T cell",
+      "primary_cell",
+      "adult",
+      "",
+      "encode",
+      "single",
+      "False",
+      -0.26380253,
+      -0.99998
+    ]
+  ]
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input.vcf	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##contig=<ID=chr22,length=50818468>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chr22	36201698	.	A	C	.	PASS	.
+chr22	36201750	.	G	T	.	PASS	.
+chr22	36202000	.	C	A	.	PASS	.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_intervals.bed	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,3 @@
+chr22	36201500	36202500	test_interval_1
+chr22	36210000	36211000	test_interval_2
+chr22	36220000	36220500	test_interval_3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_regions.bed	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,2 @@
+chr22	36201690	36201710	test_region_1
+chr22	36201990	36202010	test_region_2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_sequences.fa	Fri Mar 20 20:09:58 2026 +0000
@@ -0,0 +1,7 @@
+>test_promoter_1
+ATGCGTACGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCATGCATGCATG
+CGATCGATCGATCGTAGCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATCGTAGCT
+AGCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATCGTAGCTAGCTAGCTAGCATGC
+>test_enhancer_1
+GCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATCGTAGCTAGCTAGCTAGCATGCA
+TGCATGCGATCGATCGATCGTAGCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATC