Mercurial > repos > iuc > alphagenome_sequence_predictor
changeset 0:b940a8d384e8 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/alphagenome commit 5c5a853db3db3e6590e2e39d9b4b54c5a3b1a708
| author | iuc |
|---|---|
| date | Fri, 20 Mar 2026 20:09:58 +0000 |
| parents | |
| children | |
| files | alphagenome_interval_predictor.py alphagenome_ism_scanner.py alphagenome_sequence_predictor.py alphagenome_sequence_predictor.xml alphagenome_variant_effect.py alphagenome_variant_scorer.py generate_test_fixtures.sh macros.xml test-data/fixture_interval_predictor.json test-data/fixture_ism_scanner.json test-data/fixture_sequence_predictor.json test-data/fixture_variant_effect.json test-data/fixture_variant_scorer.json test-data/test_input.vcf test-data/test_intervals.bed test-data/test_regions.bed test-data/test_sequences.fa |
| diffstat | 17 files changed, 3653 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""
AlphaGenome Interval Predictor for Galaxy

Predicts regulatory tracks for genomic intervals — no variants, just baseline
characterization of the chromatin/expression landscape.

File: alphagenome_interval_predictor.py
"""

import argparse
import csv
import logging
import os
import sys

import numpy as np
from alphagenome.data import genome
from alphagenome.models import dna_client

__version__ = "0.6.1"

# Command-line output-type keys -> AlphaGenome client enum members.
OUTPUT_TYPE_MAP = {
    "RNA_SEQ": dna_client.OutputType.RNA_SEQ,
    "ATAC": dna_client.OutputType.ATAC,
    "CAGE": dna_client.OutputType.CAGE,
    "DNASE": dna_client.OutputType.DNASE,
    "CHIP_HISTONE": dna_client.OutputType.CHIP_HISTONE,
    "CHIP_TF": dna_client.OutputType.CHIP_TF,
    "SPLICE_SITES": dna_client.OutputType.SPLICE_SITES,
    "PROCAP": dna_client.OutputType.PROCAP,
}

# Command-line organism keys -> AlphaGenome client enum members.
ORGANISM_MAP = {
    "human": dna_client.Organism.HOMO_SAPIENS,
    "mouse": dna_client.Organism.MUS_MUSCULUS,
}

# Command-line window-size keys -> window length in bases.
SEQUENCE_LENGTH_MAP = {
    "16KB": 16_384,
    "128KB": 131_072,
    "512KB": 524_288,
    "1MB": 1_048_576,
}

# Lines starting with any of these prefixes are BED comments/headers.
_BED_SKIP_PREFIXES = ("#", "track", "browser")

_SUMMARY_COLUMNS = [
    "chrom", "start", "end", "name", "output_type",
    "track_name", "ontology_curie", "mean_signal", "max_signal",
]
_BINNED_COLUMNS = [
    "chrom", "bin_start", "bin_end", "region_name", "output_type",
    "track_name", "ontology_curie", "mean_signal",
]


def create_model(api_key, local_model=False):
    """Return a model object: a local research model or the remote API client.

    Args:
        api_key: API key passed to ``dna_client.create`` (unused for a local model).
        local_model: When true, load the model via ``alphagenome_research`` instead.
    """
    if local_model:
        # Imported lazily so the optional package is only needed for local runs.
        from alphagenome_research.model import dna_model
        return dna_model.create_from_huggingface("all_folds")
    return dna_client.create(api_key)


def parse_bed(bed_path, max_intervals):
    """Read up to ``max_intervals`` regions from a tab-separated BED file.

    Blank lines and ``#``/``track``/``browser`` lines are ignored. Lines with
    fewer than three fields, non-integer coordinates, a negative start, or
    ``end <= start`` are skipped with a warning instead of aborting the run.

    Returns:
        List of ``(chrom, start, end, name)`` tuples; ``name`` defaults to
        ``"chrom:start-end"`` when the BED line has no fourth column.
    """
    intervals = []
    with open(bed_path) as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line or line.startswith(_BED_SKIP_PREFIXES):
                continue
            fields = line.split("\t")
            if len(fields) < 3:
                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                # A single bad coordinate must not kill the whole job.
                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
                continue
            if start < 0 or end <= start:
                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
                continue
            name = fields[3] if len(fields) > 3 else f"{chrom}:{start}-{end}"
            if len(intervals) >= max_intervals:
                logging.warning("Reached max intervals (%d), skipping remaining", max_intervals)
                break
            intervals.append((chrom, start, end, name))
    return intervals


def _region_row_bounds(num_rows, interval_start, region_start, region_end, resolution=1):
    """Return ``(first_row, last_row)`` of the region inside a prediction array.

    Base-pair offsets are converted to row indices using ``resolution`` (bases
    per row) and clamped to ``[0, num_rows]``. The result is always a valid,
    possibly empty, half-open row range.
    """
    first_row = max(0, (region_start - interval_start) // resolution)
    # Ceiling division so a partially covered trailing row is included.
    last_row = min(num_rows, -(-(region_end - interval_start) // resolution))
    return first_row, max(first_row, last_row)


def extract_region_slice(values, interval_start, region_start, region_end, resolution=1):
    """Slice prediction array to the original BED region within the resized interval.

    Args:
        values: Prediction array whose first axis runs along the interval.
        interval_start: Genomic start of the resized prediction interval.
        region_start: Genomic start of the BED region.
        region_end: Genomic end of the BED region.
        resolution: Bases represented by one row of ``values`` (default 1,
            which reproduces plain base-pair indexing).
    """
    first_row, last_row = _region_row_bounds(
        values.shape[0], interval_start, region_start, region_end, resolution,
    )
    return values[first_row:last_row]


def _track_resolution(track_data):
    """Return the bases-per-row of ``track_data``, defaulting to 1.

    NOTE(review): relies on AlphaGenome ``TrackData`` exposing ``resolution``
    (coarser than 1 bp for some output types) — confirm against the client docs.
    """
    try:
        resolution = int(getattr(track_data, "resolution", 1))
    except (TypeError, ValueError):
        return 1
    return resolution if resolution > 0 else 1


def _track_labels(metadata, track_idx):
    """Return ``(track_name, ontology_curie)`` strings for one track column.

    Looks for a ``track_name`` column first and falls back to ``name``, the
    column the ISM scanner in this tool suite reads for track labels.
    """
    if metadata is None or len(metadata) <= track_idx:
        return "", ""
    row = metadata.iloc[track_idx]
    track_name = row.get("track_name", row.get("name", ""))
    return str(track_name), str(row.get("ontology_curie", ""))


def _write_fixture(fixture_path, output_path):
    """Copy the columns/rows of a JSON test fixture to ``output_path`` as TSV."""
    import json
    with open(fixture_path) as f:
        fixture_data = json.load(f)
    with open(output_path, "w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(fixture_data["columns"])
        for row in fixture_data["rows"]:
            writer.writerow(row)
    logging.info("Fixture mode: wrote %d rows to %s", len(fixture_data["rows"]), output_path)


def _write_interval_rows(writer, output, args, interval, region):
    """Write all summary or binned rows for one predicted interval.

    Args:
        writer: ``csv.writer`` for the output TSV.
        output: Object returned by ``model.predict_interval``.
        args: Parsed command-line arguments.
        interval: The resized prediction interval (only ``.start`` is read).
        region: ``(chrom, start, end, name)`` of the original BED region.
    """
    chrom, start, end, name = region
    for otype in args.output_types:
        track_data = getattr(output, otype.lower(), None)
        if track_data is None:
            logging.warning("No %s data for %s", otype, name)
            continue

        values = track_data.values
        metadata = track_data.metadata
        resolution = _track_resolution(track_data)

        first_row, last_row = _region_row_bounds(
            values.shape[0], interval.start, start, end, resolution,
        )
        region_values = values[first_row:last_row]
        if region_values.ndim == 1:
            region_values = region_values.reshape(-1, 1)
        num_rows = region_values.shape[0]
        num_tracks = region_values.shape[1]

        # Genomic coordinate of the first sliced row, and the part of the BED
        # region that actually lies inside the prediction window. Reporting
        # from these (not from the raw BED start) keeps binned coordinates
        # correct when the region is wider than the window.
        origin = interval.start + first_row * resolution
        clip_start = max(start, interval.start)
        clip_end = min(end, interval.start + values.shape[0] * resolution)
        rows_per_bin = max(1, args.bin_size // resolution)

        for track_idx in range(num_tracks):
            track_vals = region_values[:, track_idx]
            track_name, ontology_curie = _track_labels(metadata, track_idx)

            if args.output_mode == "summary":
                mean_sig = float(np.mean(track_vals))
                max_sig = float(np.max(track_vals))
                writer.writerow([
                    chrom, start, end, name, otype,
                    track_name, ontology_curie,
                    f"{mean_sig:.6f}", f"{max_sig:.6f}",
                ])
                continue

            # Binned mode
            for row_lo in range(0, num_rows, rows_per_bin):
                row_hi = min(row_lo + rows_per_bin, num_rows)
                mean_sig = float(np.mean(track_vals[row_lo:row_hi]))
                writer.writerow([
                    chrom,
                    max(clip_start, origin + row_lo * resolution),
                    min(clip_end, origin + row_hi * resolution),
                    name, otype,
                    track_name, ontology_curie,
                    f"{mean_sig:.6f}",
                ])


def run(args):
    """Predict tracks for every BED interval and write them to ``args.output``.

    In fixture mode the JSON fixture is copied to the output and no model is
    contacted. Otherwise each interval is resized to the selected window,
    predicted, and written in summary or binned form. Exits with status 1 when
    no API key is available, no valid interval is found, or every interval
    fails.
    """
    logging.info("AlphaGenome Interval Predictor v%s", __version__)
    logging.info("Input: %s", args.input)
    logging.info("Output types: %s", ", ".join(args.output_types))
    logging.info("Output mode: %s", args.output_mode)
    logging.info("Organism: %s", args.organism)
    logging.info("Sequence length: %s", args.sequence_length)
    logging.info("Max intervals: %d", args.max_intervals)

    if args.test_fixture:
        _write_fixture(args.test_fixture, args.output)
        return

    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
    if not api_key and not args.local_model:
        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
        sys.exit(1)

    organism = ORGANISM_MAP[args.organism]
    seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
    requested_outputs = [OUTPUT_TYPE_MAP[t] for t in args.output_types]
    ontology_terms = []
    if args.ontology_terms:
        ontology_terms = [t.strip() for t in args.ontology_terms.split(",") if t.strip()]

    intervals = parse_bed(args.input, args.max_intervals)
    if not intervals:
        logging.error("No valid intervals found in input BED file")
        sys.exit(1)
    logging.info("Loaded %d intervals", len(intervals))

    logging.info("Connecting to AlphaGenome...")
    model = create_model(api_key, local_model=args.local_model)
    logging.info("Model ready.")

    stats = {"total": 0, "predicted": 0, "errors": 0}

    with open(args.output, "w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(_SUMMARY_COLUMNS if args.output_mode == "summary" else _BINNED_COLUMNS)

        for interval_num, (chrom, start, end, name) in enumerate(intervals):
            stats["total"] += 1

            if interval_num > 0 and interval_num % 10 == 0:
                logging.info(
                    "Progress: %d/%d intervals (%d predicted, %d errors)",
                    interval_num, len(intervals), stats["predicted"], stats["errors"],
                )

            try:
                interval = genome.Interval(chrom, start, end).resize(seq_length)

                output = model.predict_interval(
                    interval, organism=organism,
                    requested_outputs=requested_outputs,
                    ontology_terms=ontology_terms,
                )

                _write_interval_rows(writer, output, args, interval, (chrom, start, end, name))
                stats["predicted"] += 1

            except Exception as e:
                # Best effort per interval: one failure must not stop the batch.
                logging.error("Error predicting %s (%s:%d-%d): %s", name, chrom, start, end, e)
                logging.debug("Details:", exc_info=True)
                stats["errors"] += 1

    logging.info("=" * 50)
    logging.info("DONE — %d total, %d predicted, %d errors",
                 stats["total"], stats["predicted"], stats["errors"])
    logging.info("Output: %s", args.output)

    if stats["errors"] > 0 and stats["predicted"] == 0:
        logging.error("All intervals failed. Check API key and network connectivity.")
        sys.exit(1)


def _positive_int(text):
    """argparse ``type`` that accepts only integers >= 1."""
    value = int(text)
    if value < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {text!r}")
    return value


def parse_arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="Predict regulatory tracks for genomic intervals using AlphaGenome",
    )
    parser.add_argument("--input", required=True, help="Input BED file")
    parser.add_argument("--output", required=True, help="Output TSV file")
    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
    parser.add_argument(
        "--organism", choices=["human", "mouse"], default="human",
    )
    parser.add_argument(
        "--output-types", nargs="+", choices=list(OUTPUT_TYPE_MAP.keys()),
        default=["RNA_SEQ"],
    )
    parser.add_argument("--ontology-terms", default=None)
    parser.add_argument(
        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="1MB",
    )
    parser.add_argument("--max-intervals", type=_positive_int, default=50)
    parser.add_argument(
        "--output-mode", choices=["summary", "binned"], default="summary",
    )
    # A zero bin size would make range() raise for every interval.
    parser.add_argument("--bin-size", type=_positive_int, default=128)
    parser.add_argument("--local-model", action="store_true")
    parser.add_argument("--test-fixture", default=None,
                        help="Test fixture JSON for CI testing (bypasses API)")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    return parser.parse_args()


def main():
    """Entry point: configure logging to stderr, run, and map failures to exit codes."""
    args = parse_arguments()

    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stderr)],
    )

    try:
        run(args)
    except KeyboardInterrupt:
        logging.error("Interrupted")
        sys.exit(130)
    except Exception as e:
        logging.error("Fatal error: %s", e)
        if args.verbose:
            logging.exception("Details:")
        sys.exit(1)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""
AlphaGenome ISM Scanner for Galaxy

In-silico saturation mutagenesis — systematically mutates every position in a
region to all 3 alt bases and scores each. Uses score_ism_variants() with
server-side chunking and parallelism.

File: alphagenome_ism_scanner.py
"""

import argparse
import csv
import logging
import os
import sys

import numpy as np
from alphagenome.data import genome
from alphagenome.models import dna_client
from alphagenome.models.variant_scorers import RECOMMENDED_VARIANT_SCORERS

__version__ = "0.6.1"

# Command-line organism keys -> AlphaGenome client enum members.
ORGANISM_MAP = {
    "human": dna_client.Organism.HOMO_SAPIENS,
    "mouse": dna_client.Organism.MUS_MUSCULUS,
}

# Command-line window-size keys -> window length in bases.
SEQUENCE_LENGTH_MAP = {
    "16KB": 16_384,
    "128KB": 131_072,
    "512KB": 524_288,
    "1MB": 1_048_576,
}

# Lines starting with any of these prefixes are BED comments/headers.
_BED_SKIP_PREFIXES = ("#", "track", "browser")

_OUTPUT_COLUMNS = [
    "region", "position", "ref_base", "alt_base",
    "gene_id", "gene_name", "gene_type",
    "scorer", "track_name", "ontology_curie",
    "raw_score", "quantile_score",
]


def create_model(api_key, local_model=False):
    """Return a model object: a local research model or the remote API client.

    Args:
        api_key: API key passed to ``dna_client.create`` (unused for a local model).
        local_model: When true, load the model via ``alphagenome_research`` instead.
    """
    if local_model:
        # Imported lazily so the optional package is only needed for local runs.
        from alphagenome_research.model import dna_model
        return dna_model.create_from_huggingface("all_folds")
    return dna_client.create(api_key)


def parse_bed(bed_path, max_regions, max_region_width):
    """Read up to ``max_regions`` regions from a tab-separated BED file.

    Blank lines and ``#``/``track``/``browser`` lines are ignored. Lines with
    fewer than three fields, non-integer coordinates, a negative start, or
    ``end <= start`` are skipped with a warning instead of aborting the run.
    Regions wider than ``max_region_width`` are trimmed to their central
    ``max_region_width`` bases.

    Returns:
        List of ``(chrom, start, end, name)`` tuples; ``name`` defaults to the
        untrimmed ``"chrom:start-end"`` when the line has no fourth column.
    """
    regions = []
    with open(bed_path) as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line or line.startswith(_BED_SKIP_PREFIXES):
                continue
            fields = line.split("\t")
            if len(fields) < 3:
                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                # A single bad coordinate must not kill the whole job.
                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
                continue
            if start < 0 or end <= start:
                logging.warning("Skipping malformed BED line %d: %s", line_num, line)
                continue
            name = fields[3] if len(fields) > 3 else f"{chrom}:{start}-{end}"
            width = end - start
            if width > max_region_width:
                logging.warning(
                    "Region %s is %dbp, exceeding max width %dbp — trimming to center %dbp",
                    name, width, max_region_width, max_region_width,
                )
                center = (start + end) // 2
                start = center - max_region_width // 2
                end = start + max_region_width
            if len(regions) >= max_regions:
                logging.warning("Reached max regions (%d), skipping remaining", max_regions)
                break
            regions.append((chrom, start, end, name))
    return regions


def _text(value):
    """Render a metadata value as a TSV field; ``None``/NaN become ``""``.

    Keeps missing annotations as empty fields, matching the default used when
    the column is absent, instead of the literal strings "None" or "nan".
    """
    if value is None:
        return ""
    if isinstance(value, float) and np.isnan(value):
        return ""
    return str(value)


def _write_fixture(fixture_path, output_path):
    """Copy the columns/rows of a JSON test fixture to ``output_path`` as TSV."""
    import json
    with open(fixture_path) as f:
        fixture_data = json.load(f)
    with open(output_path, "w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(fixture_data["columns"])
        for row in fixture_data["rows"]:
            writer.writerow(row)
    logging.info("Fixture mode: wrote %d rows to %s", len(fixture_data["rows"]), output_path)


def _write_variant_rows(writer, region_name, var_results, scorer_names):
    """Write one row per (scorer, gene, track) score of a single ISM variant.

    Args:
        writer: ``csv.writer`` for the output TSV.
        region_name: Name of the BED region being scanned.
        var_results: Per-scorer results for one variant; each item is read via
            ``uns["variant"]``, ``X``, ``layers``, ``obs`` and ``var``.
        scorer_names: Scorer keys in the order they were requested.

    Returns:
        Number of rows written (scores that are NaN are skipped).
    """
    rows_written = 0
    for scorer_idx, ad in enumerate(var_results):
        variant_obj = ad.uns["variant"]
        pos = variant_obj.position
        ref_base = variant_obj.reference_bases
        alt_base = variant_obj.alternate_bases
        if scorer_idx < len(scorer_names):
            scorer_name = scorer_names[scorer_idx]
        else:
            scorer_name = f"scorer_{scorer_idx}"

        raw_scores = ad.X  # indexed below as [gene_idx, track_idx]
        quantile_scores = ad.layers.get("quantiles", None)

        # Track labels do not depend on the gene, so resolve them once here
        # rather than once per gene x track pair.
        track_labels = []
        for track_idx in range(ad.n_vars):
            track_row = ad.var.iloc[track_idx]
            track_labels.append((
                _text(track_row.get("name", "")),
                _text(track_row.get("ontology_curie", "")),
            ))

        for gene_idx in range(ad.n_obs):
            gene_row = ad.obs.iloc[gene_idx]
            gene_id = _text(gene_row.get("gene_id", ""))
            gene_name = _text(gene_row.get("gene_name", ""))
            gene_type = _text(gene_row.get("gene_type", ""))

            for track_idx, (track_name, ontology_curie) in enumerate(track_labels):
                raw = float(raw_scores[gene_idx, track_idx])
                if np.isnan(raw):
                    continue
                quant = ""
                if quantile_scores is not None:
                    q = float(quantile_scores[gene_idx, track_idx])
                    if not np.isnan(q):
                        quant = f"{q:.6f}"

                writer.writerow([
                    region_name, pos, ref_base, alt_base,
                    gene_id, gene_name, gene_type,
                    scorer_name, track_name, ontology_curie,
                    f"{raw:.6f}", quant,
                ])
                rows_written += 1
    return rows_written


def run(args):
    """Run ISM over every BED region and write the scores to ``args.output``.

    In fixture mode the JSON fixture is copied to the output and no model is
    contacted. Exits with status 1 when no API key is available, a scorer key
    is unknown, no valid region is found, or every region fails.
    """
    logging.info("AlphaGenome ISM Scanner v%s", __version__)
    logging.info("Input: %s", args.input)
    logging.info("Scorers: %s", ", ".join(args.scorers))
    logging.info("Organism: %s", args.organism)
    logging.info("Sequence length: %s", args.sequence_length)
    logging.info("Max regions: %d, max region width: %dbp", args.max_regions, args.max_region_width)

    if args.test_fixture:
        _write_fixture(args.test_fixture, args.output)
        return

    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
    if not api_key and not args.local_model:
        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
        sys.exit(1)

    organism = ORGANISM_MAP[args.organism]
    seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]

    available_keys = list(RECOMMENDED_VARIANT_SCORERS.keys())
    for key in args.scorers:
        if key not in RECOMMENDED_VARIANT_SCORERS:
            logging.error("Unknown scorer key: %s (available: %s)", key, ", ".join(available_keys))
            sys.exit(1)
    selected_scorers = [RECOMMENDED_VARIANT_SCORERS[k] for k in args.scorers]

    regions = parse_bed(args.input, args.max_regions, args.max_region_width)
    if not regions:
        logging.error("No valid regions found in input BED file")
        sys.exit(1)
    logging.info("Loaded %d regions", len(regions))

    logging.info("Connecting to AlphaGenome...")
    model = create_model(api_key, local_model=args.local_model)
    logging.info("Model ready.")

    stats = {"regions": 0, "scored": 0, "errors": 0}
    row_count = 0

    with open(args.output, "w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(_OUTPUT_COLUMNS)

        for region_num, (chrom, start, end, name) in enumerate(regions):
            stats["regions"] += 1
            width = end - start
            logging.info("Region %d/%d: %s (%s:%d-%d, %dbp, %d mutations)",
                         region_num + 1, len(regions), name, chrom, start, end, width, width * 3)

            try:
                interval = genome.Interval(chrom, start, end).resize(seq_length)
                ism_interval = genome.Interval(chrom, start, end, strand="+")

                results = model.score_ism_variants(
                    interval, ism_interval,
                    variant_scorers=selected_scorers,
                    organism=organism,
                    max_workers=args.max_workers,
                )

                # results: outer list = variants, inner list = one entry per scorer.
                for var_results in results:
                    row_count += _write_variant_rows(writer, name, var_results, args.scorers)

                stats["scored"] += 1
                logging.info("Region %s: %d ISM variants scored", name, len(results))

            except Exception as e:
                # Best effort per region: one failure must not stop the batch.
                logging.error("Error scanning region %s (%s:%d-%d): %s", name, chrom, start, end, e)
                logging.debug("Details:", exc_info=True)
                stats["errors"] += 1

    logging.info("Wrote %d rows to %s", row_count, args.output)

    logging.info("=" * 50)
    logging.info("DONE — %d regions, %d scored, %d errors",
                 stats["regions"], stats["scored"], stats["errors"])

    if stats["errors"] > 0 and stats["scored"] == 0:
        logging.error("All regions failed. Check API key and network connectivity.")
        sys.exit(1)


def _positive_int(text):
    """argparse ``type`` that accepts only integers >= 1."""
    value = int(text)
    if value < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {text!r}")
    return value


def parse_arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="In-silico saturation mutagenesis using AlphaGenome score_ism_variants()",
    )
    parser.add_argument("--input", required=True, help="Input BED file")
    parser.add_argument("--output", required=True, help="Output TSV file")
    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
    parser.add_argument(
        "--organism", choices=["human", "mouse"], default="human",
    )
    parser.add_argument(
        "--scorers", nargs="+", default=["RNA_SEQ", "ATAC"],
        help="Scorer keys from RECOMMENDED_VARIANT_SCORERS",
    )
    parser.add_argument(
        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="1MB",
    )
    parser.add_argument("--max-regions", type=_positive_int, default=10)
    parser.add_argument("--max-region-width", type=_positive_int, default=200)
    parser.add_argument("--max-workers", type=_positive_int, default=5)
    parser.add_argument("--local-model", action="store_true")
    parser.add_argument("--test-fixture", default=None,
                        help="Test fixture JSON for CI testing (bypasses API)")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    return parser.parse_args()


def main():
    """Entry point: configure logging to stderr, run, and map failures to exit codes."""
    args = parse_arguments()

    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stderr)],
    )

    try:
        run(args)
    except KeyboardInterrupt:
        logging.error("Interrupted")
        sys.exit(130)
    except Exception as e:
        logging.error("Fatal error: %s", e)
        if args.verbose:
            logging.exception("Details:")
        sys.exit(1)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""
AlphaGenome Sequence Predictor for Galaxy

Predicts regulatory tracks from raw DNA sequence — no genomic coordinates needed.
For synthetic biology (designed sequences) and non-reference assemblies.

File: alphagenome_sequence_predictor.py
"""

import argparse
import csv
import logging
import os
import sys

import numpy as np
from alphagenome.models import dna_client

__version__ = "0.6.1"

# Command-line output-type keys -> AlphaGenome client enum members.
OUTPUT_TYPE_MAP = {
    "RNA_SEQ": dna_client.OutputType.RNA_SEQ,
    "ATAC": dna_client.OutputType.ATAC,
    "CAGE": dna_client.OutputType.CAGE,
    "DNASE": dna_client.OutputType.DNASE,
    "CHIP_HISTONE": dna_client.OutputType.CHIP_HISTONE,
    "CHIP_TF": dna_client.OutputType.CHIP_TF,
    "SPLICE_SITES": dna_client.OutputType.SPLICE_SITES,
    "PROCAP": dna_client.OutputType.PROCAP,
}

# Command-line organism keys -> AlphaGenome client enum members.
ORGANISM_MAP = {
    "human": dna_client.Organism.HOMO_SAPIENS,
    "mouse": dna_client.Organism.MUS_MUSCULUS,
}

# Command-line window-size keys -> window length in bases.
SEQUENCE_LENGTH_MAP = {
    "16KB": 16_384,
    "128KB": 131_072,
    "512KB": 524_288,
    "1MB": 1_048_576,
}

_SUMMARY_COLUMNS = [
    "sequence_id", "sequence_length", "output_type",
    "track_name", "ontology_curie", "mean_signal", "max_signal",
]
_BINNED_COLUMNS = [
    "sequence_id", "bin_start", "bin_end", "output_type",
    "track_name", "ontology_curie", "mean_signal",
]


def create_model(api_key, local_model=False):
    """Return a model object: a local research model or the remote API client.

    Args:
        api_key: API key passed to ``dna_client.create`` (unused for a local model).
        local_model: When true, load the model via ``alphagenome_research`` instead.
    """
    if local_model:
        # Imported lazily so the optional package is only needed for local runs.
        from alphagenome_research.model import dna_model
        return dna_model.create_from_huggingface("all_folds")
    return dna_client.create(api_key)


def parse_fasta(fasta_path, max_sequences):
    """Read up to ``max_sequences`` records from a FASTA file.

    The record ID is the first whitespace-delimited token of the header. A
    header with no ID (a bare ``>``) gets a generated ``sequence_<n>`` ID
    instead of raising. Text before the first header is ignored.

    Returns:
        List of ``(sequence_id, sequence)`` tuples with sequence lines joined.
    """
    sequences = []
    current_id = None
    current_seq = []

    with open(fasta_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if current_id is not None:
                    sequences.append((current_id, "".join(current_seq)))
                    if len(sequences) >= max_sequences:
                        logging.warning("Reached max sequences (%d), skipping remaining", max_sequences)
                        # Mark "no open record" so nothing is appended after the loop.
                        current_id = None
                        break
                header_tokens = line[1:].split()
                if header_tokens:
                    current_id = header_tokens[0]
                else:
                    current_id = f"sequence_{len(sequences) + 1}"
                current_seq = []
            elif current_id is not None:
                current_seq.append(line)
        if current_id is not None:
            sequences.append((current_id, "".join(current_seq)))

    return sequences


def prepare_sequence(seq, target_length):
    """Pad (N-centered) or center-trim to target_length. Returns (seq, content_start, content_end).

    The sequence is upper-cased. ``content_start``/``content_end`` delimit the
    caller's bases inside the returned string: the unpadded span for short
    input, and the whole window ``(0, target_length)`` for exact-length or
    trimmed input.
    """
    seq = seq.upper()
    length = len(seq)
    if length == target_length:
        return seq, 0, length
    if length < target_length:
        pad_total = target_length - length
        pad_left = pad_total // 2
        prepared = "N" * pad_left + seq + "N" * (pad_total - pad_left)
        return prepared, pad_left, pad_left + length
    trim_start = (length - target_length) // 2
    return seq[trim_start:trim_start + target_length], 0, target_length


def _track_resolution(track_data):
    """Return the bases-per-row of ``track_data``, defaulting to 1.

    NOTE(review): relies on AlphaGenome ``TrackData`` exposing ``resolution``
    (coarser than 1 bp for some output types) — confirm against the client docs.
    """
    try:
        resolution = int(getattr(track_data, "resolution", 1))
    except (TypeError, ValueError):
        return 1
    return resolution if resolution > 0 else 1


def _track_labels(metadata, track_idx):
    """Return ``(track_name, ontology_curie)`` strings for one track column.

    Looks for a ``track_name`` column first and falls back to ``name``, the
    column the ISM scanner in this tool suite reads for track labels.
    """
    if metadata is None or len(metadata) <= track_idx:
        return "", ""
    row = metadata.iloc[track_idx]
    track_name = row.get("track_name", row.get("name", ""))
    return str(track_name), str(row.get("ontology_curie", ""))


def _write_fixture(fixture_path, output_path):
    """Copy the columns/rows of a JSON test fixture to ``output_path`` as TSV."""
    import json
    with open(fixture_path) as f:
        fixture_data = json.load(f)
    with open(output_path, "w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(fixture_data["columns"])
        for row in fixture_data["rows"]:
            writer.writerow(row)
    logging.info("Fixture mode: wrote %d rows to %s", len(fixture_data["rows"]), output_path)


def _write_sequence_rows(writer, output, args, seq_id, orig_length, content_start, content_end):
    """Write all summary or binned rows for one predicted sequence.

    Args:
        writer: ``csv.writer`` for the output TSV.
        output: Object returned by ``model.predict_sequence``.
        args: Parsed command-line arguments.
        seq_id: FASTA record ID.
        orig_length: Sequence length before padding/trimming.
        content_start: Start of the real content in the prepared sequence.
        content_end: End of the real content in the prepared sequence.

    Binned coordinates are relative to the analysed content; for a
    center-trimmed sequence that is the trimmed window, not the original.
    """
    content_len = content_end - content_start
    for otype in args.output_types:
        track_data = getattr(output, otype.lower(), None)
        if track_data is None:
            logging.warning("No %s data for %s", otype, seq_id)
            continue

        values = track_data.values
        metadata = track_data.metadata
        resolution = _track_resolution(track_data)

        # Slice to the actual content region (non-N portion), converting
        # base offsets to row indices so coarser tracks are sliced correctly.
        first_row = content_start // resolution
        last_row = min(values.shape[0], -(-content_end // resolution))
        content_values = values[first_row:max(first_row, last_row)]
        if content_values.ndim == 1:
            content_values = content_values.reshape(-1, 1)
        num_rows = content_values.shape[0]
        num_tracks = content_values.shape[1]

        # Offset of the first sliced row relative to the content start
        # (<= 0 when the content begins part-way through a row).
        origin = first_row * resolution - content_start
        rows_per_bin = max(1, args.bin_size // resolution)

        for track_idx in range(num_tracks):
            track_vals = content_values[:, track_idx]
            track_name, ontology_curie = _track_labels(metadata, track_idx)

            if args.output_mode == "summary":
                mean_sig = float(np.mean(track_vals))
                max_sig = float(np.max(track_vals))
                writer.writerow([
                    seq_id, orig_length, otype,
                    track_name, ontology_curie,
                    f"{mean_sig:.6f}", f"{max_sig:.6f}",
                ])
                continue

            for row_lo in range(0, num_rows, rows_per_bin):
                row_hi = min(row_lo + rows_per_bin, num_rows)
                mean_sig = float(np.mean(track_vals[row_lo:row_hi]))
                writer.writerow([
                    seq_id,
                    max(0, origin + row_lo * resolution),
                    min(content_len, origin + row_hi * resolution),
                    otype,
                    track_name, ontology_curie,
                    f"{mean_sig:.6f}",
                ])


def run(args):
    """Predict tracks for every FASTA sequence and write them to ``args.output``.

    In fixture mode the JSON fixture is copied to the output and no model is
    contacted. Otherwise each sequence is padded/trimmed to the selected
    window, predicted, and written in summary or binned form. Exits with
    status 1 when no API key is available, no sequence is found, or every
    sequence fails.
    """
    logging.info("AlphaGenome Sequence Predictor v%s", __version__)
    logging.info("Input: %s", args.input)
    logging.info("Output types: %s", ", ".join(args.output_types))
    logging.info("Output mode: %s", args.output_mode)
    logging.info("Organism: %s", args.organism)
    logging.info("Sequence length: %s", args.sequence_length)
    logging.info("Max sequences: %d", args.max_sequences)

    if args.test_fixture:
        _write_fixture(args.test_fixture, args.output)
        return

    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
    if not api_key and not args.local_model:
        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
        sys.exit(1)

    organism = ORGANISM_MAP[args.organism]
    target_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
    requested_outputs = [OUTPUT_TYPE_MAP[t] for t in args.output_types]
    ontology_terms = []
    if args.ontology_terms:
        ontology_terms = [t.strip() for t in args.ontology_terms.split(",") if t.strip()]

    sequences = parse_fasta(args.input, args.max_sequences)
    if not sequences:
        logging.error("No valid sequences found in input FASTA file")
        sys.exit(1)
    logging.info("Loaded %d sequences", len(sequences))

    logging.info("Connecting to AlphaGenome...")
    model = create_model(api_key, local_model=args.local_model)
    logging.info("Model ready.")

    stats = {"total": 0, "predicted": 0, "errors": 0}

    with open(args.output, "w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(_SUMMARY_COLUMNS if args.output_mode == "summary" else _BINNED_COLUMNS)

        for seq_num, (seq_id, raw_seq) in enumerate(sequences):
            stats["total"] += 1
            orig_length = len(raw_seq)

            if seq_num > 0 and seq_num % 5 == 0:
                logging.info(
                    "Progress: %d/%d sequences (%d predicted, %d errors)",
                    seq_num, len(sequences), stats["predicted"], stats["errors"],
                )

            logging.info("Sequence %d/%d: %s (%dbp)", seq_num + 1, len(sequences), seq_id, orig_length)

            try:
                prepared_seq, content_start, content_end = prepare_sequence(raw_seq, target_length)

                if orig_length < target_length:
                    logging.info(" N-padded %dbp -> %dbp", orig_length, target_length)
                elif orig_length > target_length:
                    logging.info(" Center-trimmed %dbp -> %dbp", orig_length, target_length)

                output = model.predict_sequence(
                    prepared_seq, organism=organism,
                    requested_outputs=requested_outputs,
                    ontology_terms=ontology_terms,
                )

                _write_sequence_rows(
                    writer, output, args, seq_id, orig_length, content_start, content_end,
                )
                stats["predicted"] += 1

            except Exception as e:
                # Best effort per sequence: one failure must not stop the batch.
                logging.error("Error predicting %s: %s", seq_id, e)
                logging.debug("Details:", exc_info=True)
                stats["errors"] += 1

    logging.info("=" * 50)
    logging.info("DONE — %d total, %d predicted, %d errors",
                 stats["total"], stats["predicted"], stats["errors"])
    logging.info("Output: %s", args.output)

    if stats["errors"] > 0 and stats["predicted"] == 0:
        logging.error("All sequences failed. Check API key and network connectivity.")
        sys.exit(1)


def _positive_int(text):
    """argparse ``type`` that accepts only integers >= 1."""
    value = int(text)
    if value < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {text!r}")
    return value


def parse_arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="Predict regulatory tracks from DNA sequence using AlphaGenome",
    )
    parser.add_argument("--input", required=True, help="Input FASTA file")
    parser.add_argument("--output", required=True, help="Output TSV file")
    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
    parser.add_argument(
        "--organism", choices=["human", "mouse"], default="human",
    )
    parser.add_argument(
        "--output-types", nargs="+", choices=list(OUTPUT_TYPE_MAP.keys()),
        default=["RNA_SEQ"],
    )
    parser.add_argument("--ontology-terms", default=None)
    parser.add_argument(
        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="16KB",
    )
    # Matches the min="1" bounds declared for these inputs in the tool XML.
    parser.add_argument("--max-sequences", type=_positive_int, default=20)
    parser.add_argument(
        "--output-mode", choices=["summary", "binned"], default="summary",
    )
    parser.add_argument("--bin-size", type=_positive_int, default=128)
    parser.add_argument("--local-model", action="store_true")
    parser.add_argument("--test-fixture", default=None,
                        help="Test fixture JSON for CI testing (bypasses API)")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    return parser.parse_args()


def main():
    """Entry point: configure logging to stderr, run, and map failures to exit codes."""
    args = parse_arguments()

    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stderr)],
    )

    try:
        run(args)
    except KeyboardInterrupt:
        logging.error("Interrupted")
        sys.exit(130)
    except Exception as e:
        logging.error("Fatal error: %s", e)
        if args.verbose:
            logging.exception("Details:")
        sys.exit(1)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alphagenome_sequence_predictor.xml Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,144 @@ +<tool id="alphagenome_sequence_predictor" name="AlphaGenome Sequence Predictor" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>Predict regulatory tracks from DNA sequence</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <version_command>echo @TOOL_VERSION@</version_command> + <command detect_errors="exit_code"><![CDATA[ + python '$__tool_directory__/alphagenome_sequence_predictor.py' + --input '$input_fasta' + --output '$output_tsv' + --organism '$organism' + --output-types + #for $otype in $output_types + '$otype' + #end for + --sequence-length '$sequence_length' + --max-sequences $max_sequences + --output-mode '$output_mode.mode' + #if str($output_mode.mode) == "binned" + --bin-size $output_mode.bin_size + #end if + @CMD_ONTOLOGY_TERMS@ + @CMD_TEST_FIXTURE@ + ]]></command> + <inputs> + <param name="input_fasta" type="data" format="fasta" label="Input FASTA file" help="DNA sequences to predict regulatory tracks for"/> + <expand macro="organism_param"/> + <expand macro="output_types_param"/> + <expand macro="ontology_terms_param"/> + <param name="sequence_length" type="select" label="Prediction window size" help="Window size in bases. 
Sequences shorter than the window are N-padded; longer sequences are center-trimmed."> + <option value="16KB" selected="true">16 kb (recommended for synbio)</option> + <option value="128KB">128 kb</option> + <option value="512KB">512 kb</option> + <option value="1MB">1 Mb</option> + </param> + <param name="max_sequences" type="integer" value="20" min="1" max="1000" label="Maximum sequences to process" help="API is rate-limited; start small to verify results"/> + <conditional name="output_mode"> + <param name="mode" type="select" label="Output mode"> + <option value="summary" selected="true">Summary (one row per sequence x track)</option> + <option value="binned">Binned (spatial resolution within sequences)</option> + </param> + <when value="summary"/> + <when value="binned"> + <param name="bin_size" type="integer" value="128" min="1" max="4096" label="Bin size (bp)" help="Divide each sequence into bins of this width for spatial resolution"/> + </when> + </conditional> + <expand macro="test_fixture_param"/> + </inputs> + <outputs> + <data name="output_tsv" format="tabular"/> + </outputs> + <tests> + <test expect_num_outputs="1"> + <param name="input_fasta" value="test_sequences.fa"/> + <param name="organism" value="human"/> + <param name="output_types" value="RNA_SEQ"/> + <param name="sequence_length" value="16KB"/> + <param name="max_sequences" value="2"/> + <conditional name="output_mode"> + <param name="mode" value="summary"/> + </conditional> + <param name="test_fixture" value="test-data/fixture_sequence_predictor.json"/> + <output name="output_tsv"> + <assert_contents> + <has_text text="sequence_id"/> + <has_text text="mean_signal"/> + <has_text text="max_signal"/> + <has_text text="test_promoter_1"/> + <has_text text="RNA_SEQ"/> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ +**AlphaGenome Sequence Predictor** + +Predicts regulatory tracks from raw DNA sequence using AlphaGenome's +``predict_sequence()`` API. 
No genomic coordinates needed — designed for +synthetic biology sequences, designed regulatory elements, and +non-reference assemblies. + +----- + +**What it does** + +For each sequence in the input FASTA file: + +1. Pads short sequences with N bases (centered) or center-trims long sequences + to fit the prediction window size +2. Calls ``predict_sequence()`` for selected output types +3. Computes summary statistics over the actual sequence content (excluding N padding) +4. Reports per-track values in tabular format + +----- + +**Sequence handling** + +- Sequences **shorter** than the window size are N-padded (centered). The model + still makes predictions, and statistics are computed only over the real content. +- Sequences **longer** than the window are center-trimmed to fit. +- Sequences **equal** to the window size are used as-is. + +For typical synbio applications (promoters, enhancers, etc.), the 16 KB window +is recommended as sequences are usually short. + +----- + +**Output modes** + +- **Summary**: One row per sequence x track with mean and max signal. +- **Binned**: Divides each sequence into fixed-width bins (default 128bp) and + reports mean signal per bin for spatial resolution. + +----- + +**Output columns — summary mode** + +- ``sequence_id`` — FASTA header ID +- ``sequence_length`` — original sequence length before padding/trimming +- ``output_type`` — RNA_SEQ, ATAC, etc. 
+- ``track_name`` — specific experimental track +- ``ontology_curie`` — tissue/cell type ontology term +- ``mean_signal`` — mean predicted signal +- ``max_signal`` — maximum predicted signal + +**Output columns — binned mode** + +- ``sequence_id`` — FASTA header ID +- ``bin_start``, ``bin_end`` — bin coordinates within the sequence +- ``output_type``, ``track_name``, ``ontology_curie`` — as above +- ``mean_signal`` — mean predicted signal in the bin + +----- + +**Use cases** + +- Evaluate designed promoter/enhancer sequences +- Predict splicing of synthetic gene constructs +- Characterize regulatory elements from non-reference genomes +- Compare predicted activity of sequence variants in synthetic biology + ]]></help> + <expand macro="citations"/> +</tool>
#!/usr/bin/env python
# alphagenome_variant_effect.py
"""
AlphaGenome Variant Effect Predictor for Galaxy

POC tool that scores genetic variants using the real AlphaGenome API.
Uses predict_variant() to compute log-fold-change effect scores per output type.
"""

import argparse
import logging
import os
import sys

import cyvcf2
import numpy as np
from alphagenome.data import genome
from alphagenome.models import dna_client

__version__ = "0.6.1"

# CLI output-type name -> AlphaGenome OutputType enum member.
OUTPUT_TYPE_MAP = {
    "RNA_SEQ": dna_client.OutputType.RNA_SEQ,
    "ATAC": dna_client.OutputType.ATAC,
    "CAGE": dna_client.OutputType.CAGE,
    "DNASE": dna_client.OutputType.DNASE,
    "CHIP_HISTONE": dna_client.OutputType.CHIP_HISTONE,
    "CHIP_TF": dna_client.OutputType.CHIP_TF,
    "SPLICE_SITES": dna_client.OutputType.SPLICE_SITES,
    "SPLICE_SITE_USAGE": dna_client.OutputType.SPLICE_SITE_USAGE,
    "SPLICE_JUNCTIONS": dna_client.OutputType.SPLICE_JUNCTIONS,
    "CONTACT_MAPS": dna_client.OutputType.CONTACT_MAPS,
    "PROCAP": dna_client.OutputType.PROCAP,
}

# CLI output-type name -> INFO field ID written to the output VCF.
INFO_FIELD_MAP = {
    "RNA_SEQ": "AG_RNA_LFC",
    "ATAC": "AG_ATAC_LFC",
    "CAGE": "AG_CAGE_LFC",
    "DNASE": "AG_DNASE_LFC",
    "CHIP_HISTONE": "AG_HISTONE_LFC",
    "CHIP_TF": "AG_TF_LFC",
    "SPLICE_SITES": "AG_SPLICE_LFC",
    "SPLICE_SITE_USAGE": "AG_SPLICEUSE_LFC",
    "SPLICE_JUNCTIONS": "AG_SPLICEJNC_LFC",
    "CONTACT_MAPS": "AG_CONTACT_LFC",
    "PROCAP": "AG_PROCAP_LFC",
}

ORGANISM_MAP = {
    "human": dna_client.Organism.HOMO_SAPIENS,
    "mouse": dna_client.Organism.MUS_MUSCULUS,
}

# CLI window label -> window size in bases.
SEQUENCE_LENGTH_MAP = {
    "16KB": 16_384,
    "128KB": 131_072,
    "512KB": 524_288,
    "1MB": 1_048_576,
}


def create_model(api_key, local_model=False):
    """Return a model object: the remote API client, or a local model.

    Args:
        api_key: Key passed to ``dna_client.create`` (unused for local models).
        local_model: When true, load the model via
            ``dna_model.create_from_huggingface("all_folds")`` instead.
    """
    if local_model:
        # Imported lazily so the API-only code path does not need this package.
        from alphagenome_research.model import dna_model
        return dna_model.create_from_huggingface("all_folds")
    return dna_client.create(api_key)


def compute_max_abs_lfc(ref_values, alt_values, epsilon=1e-6):
    """Return the largest absolute log2 fold-change between ALT and REF values.

    Args:
        ref_values: Array-like of predicted values for the reference allele.
        alt_values: Array-like of predicted values for the alternate allele;
            must broadcast against ``ref_values``.
        epsilon: Pseudocount added to numerator and denominator so that zero
            signal does not produce a division by zero.

    Returns:
        ``max(|log2((alt + epsilon) / (ref + epsilon))|)`` over all positions
        whose fold-change is finite, as a Python float.

    Raises:
        ValueError: If the inputs are empty or no position yields a finite
            fold-change (the caller logs this and counts the variant as an
            error instead of writing NaN into the VCF).
    """
    ref_vals = np.asarray(ref_values, dtype=np.float64)
    alt_vals = np.asarray(alt_values, dtype=np.float64)
    # Non-finite results are filtered out below, so silence numpy's warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        abs_lfc = np.abs(np.log2((alt_vals + epsilon) / (ref_vals + epsilon)))
    abs_lfc = np.atleast_1d(abs_lfc)
    finite = abs_lfc[np.isfinite(abs_lfc)]
    if finite.size == 0:
        raise ValueError("no finite log-fold-change values (empty or invalid track)")
    return float(finite.max())


def get_track_values(prediction_output, output_type_name):
    """Attribute name is lowercase OutputType enum (e.g. RNA_SEQ -> output.rna_seq).

    Returns the ``.values`` of that attribute, or ``None`` when the attribute
    is missing or ``None`` on ``prediction_output``.
    """
    attr_name = output_type_name.lower()
    track = getattr(prediction_output, attr_name, None)
    if track is None:
        return None
    return track.values


def run(args):
    """Score the variants of ``args.input`` and write an annotated VCF.

    For each record (up to ``args.max_variants``) the first ALT allele is
    scored per requested output type and the scores are stored in INFO fields
    (see ``INFO_FIELD_MAP``) plus ``AG_MAX_EFFECT``. Records beyond the limit
    and records without an ALT allele are written through unchanged.

    In fixture mode (``args.test_fixture``) scores are read from a JSON file
    and neither an API key nor a model is needed.

    Exits with status 1 when no API key is available (outside fixture/local
    mode) or when errors occurred and not a single variant was scored.
    """
    logging.info("AlphaGenome Variant Effect Predictor v%s", __version__)
    logging.info("Input: %s", args.input)
    logging.info("Output types: %s", ", ".join(args.output_types))
    logging.info("Organism: %s", args.organism)
    logging.info("Sequence length: %s", args.sequence_length)
    logging.info("Max variants: %d", args.max_variants)

    # Fixture mode for CI testing (bypasses API)
    fixture_lookup = None
    if args.test_fixture:
        import json
        with open(args.test_fixture) as f:
            fixture_data = json.load(f)
        fixture_lookup = {
            (v["chrom"], v["pos"], v["ref"], v["alt"]): v["scores"]
            for v in fixture_data["variants"]
        }
        logging.info("Fixture mode: %d pre-computed variants", len(fixture_lookup))

    if fixture_lookup is None:
        api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
        if not api_key and not args.local_model:
            logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
            sys.exit(1)

        organism = ORGANISM_MAP[args.organism]
        seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]
        requested_outputs = [OUTPUT_TYPE_MAP[t] for t in args.output_types]
        ontology_terms = []
        if args.ontology_terms:
            ontology_terms = [t.strip() for t in args.ontology_terms.split(",") if t.strip()]

        logging.info("Connecting to AlphaGenome...")
        model = create_model(api_key, local_model=args.local_model)
        logging.info("Model ready.")

    vcf_reader = cyvcf2.VCF(args.input)
    vcf_writer = None

    stats = {"total": 0, "scored": 0, "errors": 0, "skipped": 0, "no_alt": 0}

    # Header and writer setup live inside the try block so the reader is
    # closed even when that setup fails.
    try:
        for otype in args.output_types:
            info_id = INFO_FIELD_MAP[otype]
            vcf_reader.add_info_to_header({
                "ID": info_id,
                "Number": "A",
                "Type": "Float",
                "Description": f"AlphaGenome {otype} max absolute log-fold-change",
            })
        vcf_reader.add_info_to_header({
            "ID": "AG_MAX_EFFECT",
            "Number": "A",
            "Type": "Float",
            "Description": "AlphaGenome max effect across all selected output types",
        })
        vcf_reader.add_to_header(f"##AlphaGenomeVariantEffectVersion={__version__}")

        vcf_writer = cyvcf2.Writer(args.output, vcf_reader)

        for variant_num, record in enumerate(vcf_reader):
            stats["total"] += 1

            if variant_num >= args.max_variants:
                stats["skipped"] += 1
                vcf_writer.write_record(record)
                continue

            if variant_num > 0 and variant_num % 10 == 0:
                logging.info(
                    "Progress: %d/%d variants processed (%d scored, %d errors)",
                    variant_num, args.max_variants, stats["scored"], stats["errors"],
                )

            chrom = record.CHROM
            pos = record.POS  # 1-based, matches API expectation
            ref = record.REF

            # Process first ALT allele only for POC
            if not record.ALT:
                stats["no_alt"] += 1
                vcf_writer.write_record(record)
                continue

            alt = record.ALT[0]

            try:
                all_scores = []
                if fixture_lookup is not None:
                    fixture_scores = fixture_lookup.get((chrom, pos, ref, alt), {})
                    for otype in args.output_types:
                        if otype in fixture_scores:
                            score = fixture_scores[otype]
                            info_id = INFO_FIELD_MAP[otype]
                            record.INFO[info_id] = round(score, 6)
                            all_scores.append(score)
                else:
                    variant = genome.Variant(
                        chromosome=chrom,
                        position=pos,
                        reference_bases=ref,
                        alternate_bases=alt,
                    )
                    interval = variant.reference_interval.resize(seq_length)

                    outputs = model.predict_variant(
                        interval=interval,
                        variant=variant,
                        organism=organism,
                        ontology_terms=ontology_terms,
                        requested_outputs=requested_outputs,
                    )

                    for otype in args.output_types:
                        ref_vals = get_track_values(outputs.reference, otype)
                        alt_vals = get_track_values(outputs.alternate, otype)
                        if ref_vals is not None and alt_vals is not None:
                            score = compute_max_abs_lfc(ref_vals, alt_vals)
                            info_id = INFO_FIELD_MAP[otype]
                            record.INFO[info_id] = round(score, 6)
                            all_scores.append(score)
                        else:
                            logging.warning(
                                "No %s track in output for %s:%d %s>%s",
                                otype, chrom, pos, ref, alt,
                            )

                if all_scores:
                    record.INFO["AG_MAX_EFFECT"] = round(max(all_scores), 6)
                    stats["scored"] += 1
                else:
                    stats["errors"] += 1

            except Exception as e:
                # Best effort: a failing variant is logged and still written
                # (possibly partially annotated) so the output stays complete.
                logging.error("Error scoring %s:%d %s>%s: %s", chrom, pos, ref, alt, e)
                stats["errors"] += 1

            vcf_writer.write_record(record)

    finally:
        if vcf_writer is not None:
            vcf_writer.close()
        vcf_reader.close()

    logging.info("=" * 50)
    logging.info("DONE — %d total, %d scored, %d errors, %d skipped (over limit)",
                 stats["total"], stats["scored"], stats["errors"], stats["skipped"])
    if stats["no_alt"]:
        logging.info("%d records without ALT allele passed through unscored", stats["no_alt"])
    logging.info("Output: %s", args.output)

    if stats["errors"] > 0 and stats["scored"] == 0:
        logging.error("All variants failed. Check API key and network connectivity.")
        sys.exit(1)


def parse_arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="Score genetic variants using AlphaGenome predict_variant()",
    )
    parser.add_argument("--input", required=True, help="Input VCF file")
    parser.add_argument("--output", required=True, help="Output VCF file")
    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
    parser.add_argument(
        "--organism", choices=["human", "mouse"], default="human",
        help="Organism (default: human)",
    )
    parser.add_argument(
        "--output-types", nargs="+", choices=list(OUTPUT_TYPE_MAP.keys()),
        default=["RNA_SEQ"],
        help="AlphaGenome output types to predict (default: RNA_SEQ)",
    )
    parser.add_argument(
        "--ontology-terms", default=None,
        help="Comma-separated ontology terms (e.g. UBERON:0001157,CL:0000746)",
    )
    parser.add_argument(
        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="1MB",
        help="Prediction window size (default: 1MB)",
    )
    parser.add_argument(
        "--max-variants", type=int, default=100,
        help="Maximum variants to process (default: 100)",
    )
    parser.add_argument(
        "--local-model", action="store_true",
        help="Use local HuggingFace model instead of API",
    )
    parser.add_argument("--test-fixture", default=None,
                        help="Test fixture JSON for CI testing (bypasses API)")
    parser.add_argument("--verbose", action="store_true", help="Debug logging")
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {__version__}",
    )
    return parser.parse_args()


def main():
    """Entry point: configure stderr logging, run, and map failures to exit codes.

    Exit status is 130 on Ctrl-C and 1 on any other unhandled exception.
    """
    args = parse_arguments()

    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stderr)],
    )

    try:
        run(args)
    except KeyboardInterrupt:
        logging.error("Interrupted")
        sys.exit(130)
    except Exception as e:
        logging.error("Fatal error: %s", e)
        if args.verbose:
            logging.exception("Details:")
        sys.exit(1)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
# alphagenome_variant_scorer.py
"""
AlphaGenome Variant Scorer for Galaxy

Uses score_variant() for server-side gene-level aggregation with spatial masking
and empirical quantile normalization. Outputs structured per-gene, per-track scores
via tidy_scores().
"""

import argparse
import logging
import os
import sys

import cyvcf2
from alphagenome.data import genome
from alphagenome.models import dna_client
from alphagenome.models.variant_scorers import RECOMMENDED_VARIANT_SCORERS, tidy_scores

__version__ = "0.6.1"

ORGANISM_MAP = {
    "human": dna_client.Organism.HOMO_SAPIENS,
    "mouse": dna_client.Organism.MUS_MUSCULUS,
}

# CLI window label -> window size in bases.
SEQUENCE_LENGTH_MAP = {
    "16KB": 16_384,
    "128KB": 131_072,
    "512KB": 524_288,
    "1MB": 1_048_576,
}


def create_model(api_key, local_model=False):
    """Return a model object: the remote API client, or a local model.

    Args:
        api_key: Key passed to ``dna_client.create`` (unused for local models).
        local_model: When true, load the model via
            ``dna_model.create_from_huggingface("all_folds")`` instead.
    """
    if local_model:
        # Imported lazily so the API-only code path does not need this package.
        from alphagenome_research.model import dna_model
        return dna_model.create_from_huggingface("all_folds")
    return dna_client.create(api_key)


def run(args):
    """Score the variants of ``args.input`` and write a combined TSV.

    For each record (up to ``args.max_variants``) the first ALT allele is
    passed to ``model.score_variant()`` with the selected scorers; the result
    is flattened with ``tidy_scores()`` and all per-variant tables are
    concatenated into ``args.output``. When nothing was scored, a file with
    only a ``variant_id`` header line is written.

    In fixture mode (``args.test_fixture``) the fixture's rows are written
    straight to the output and no VCF is read.

    Exits with status 1 on a missing API key, an unknown scorer key, or when
    errors occurred and not a single variant was scored.
    """
    logging.info("AlphaGenome Variant Scorer v%s", __version__)
    logging.info("Input: %s", args.input)
    logging.info("Scorers: %s", ", ".join(args.scorers))
    logging.info("Organism: %s", args.organism)
    logging.info("Sequence length: %s", args.sequence_length)
    logging.info("Max variants: %d", args.max_variants)

    if args.test_fixture:
        import json
        import pandas as pd
        with open(args.test_fixture) as f:
            fixture_data = json.load(f)
        df = pd.DataFrame(fixture_data["rows"], columns=fixture_data["columns"])
        df.to_csv(args.output, sep="\t", index=False)
        logging.info("Fixture mode: wrote %d rows to %s", len(df), args.output)
        return

    api_key = args.api_key or os.environ.get("ALPHAGENOME_API_KEY")
    if not api_key and not args.local_model:
        logging.error("No API key provided. Set ALPHAGENOME_API_KEY or use --api-key")
        sys.exit(1)

    organism = ORGANISM_MAP[args.organism]
    seq_length = SEQUENCE_LENGTH_MAP[args.sequence_length]

    available_keys = list(RECOMMENDED_VARIANT_SCORERS.keys())
    selected_keys = args.scorers
    for key in selected_keys:
        if key not in RECOMMENDED_VARIANT_SCORERS:
            logging.error("Unknown scorer key: %s (available: %s)", key, ", ".join(available_keys))
            sys.exit(1)
    selected_scorers = [RECOMMENDED_VARIANT_SCORERS[k] for k in selected_keys]
    logging.info("Using %d scorers", len(selected_scorers))

    logging.info("Connecting to AlphaGenome...")
    model = create_model(api_key, local_model=args.local_model)
    logging.info("Model ready.")

    vcf_reader = cyvcf2.VCF(args.input)

    stats = {"total": 0, "scored": 0, "errors": 0, "skipped": 0, "no_alt": 0, "empty": 0}
    all_rows = []

    try:
        for variant_num, record in enumerate(vcf_reader):
            stats["total"] += 1

            # Keep iterating past the limit only to report how many were skipped.
            if variant_num >= args.max_variants:
                stats["skipped"] += 1
                continue

            if variant_num > 0 and variant_num % 10 == 0:
                logging.info(
                    "Progress: %d/%d variants processed (%d scored, %d errors)",
                    variant_num, args.max_variants, stats["scored"], stats["errors"],
                )

            chrom = record.CHROM
            pos = record.POS
            ref = record.REF

            if not record.ALT:
                stats["no_alt"] += 1
                continue

            # Only the first ALT allele is scored.
            alt = record.ALT[0]
            variant_id = f"{chrom}:{pos}:{ref}>{alt}"

            try:
                variant = genome.Variant(
                    chromosome=chrom,
                    position=pos,
                    reference_bases=ref,
                    alternate_bases=alt,
                )
                interval = variant.reference_interval.resize(seq_length)

                scores = model.score_variant(
                    interval, variant, selected_scorers, organism=organism,
                )

                df = tidy_scores(scores)
                # NOTE(review): tidy_scores() appears to be annotated as possibly
                # returning None - confirm against the library. Guard here so a
                # None is neither counted as scored nor handed to pd.concat().
                if df is None:
                    logging.warning("No scores returned for %s", variant_id)
                    stats["empty"] += 1
                    continue

                all_rows.append(df)
                stats["scored"] += 1

                logging.debug("Scored %s: %d rows", variant_id, len(df))

            except Exception as e:
                # Best effort: log the failing variant and move on to the next.
                logging.error("Error scoring %s: %s", variant_id, e)
                stats["errors"] += 1

    finally:
        vcf_reader.close()

    if all_rows:
        import pandas as pd
        combined = pd.concat(all_rows, ignore_index=True)
        combined.to_csv(args.output, sep="\t", index=False)
        logging.info("Wrote %d rows to %s", len(combined), args.output)
    else:
        # Still produce an output file so downstream steps find a dataset.
        with open(args.output, "w") as f:
            f.write("variant_id\n")
        logging.warning("No variants scored successfully")

    logging.info("=" * 50)
    logging.info("DONE — %d total, %d scored, %d errors, %d skipped (over limit)",
                 stats["total"], stats["scored"], stats["errors"], stats["skipped"])
    if stats["no_alt"] or stats["empty"]:
        logging.info("%d records without ALT allele, %d variants without scores",
                     stats["no_alt"], stats["empty"])

    if stats["errors"] > 0 and stats["scored"] == 0:
        logging.error("All variants failed. Check API key and network connectivity.")
        sys.exit(1)


def parse_arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="Score variants using AlphaGenome score_variant() with gene-level aggregation",
    )
    parser.add_argument("--input", required=True, help="Input VCF file")
    parser.add_argument("--output", required=True, help="Output TSV file")
    parser.add_argument("--api-key", default=None, help="AlphaGenome API key (or set ALPHAGENOME_API_KEY)")
    parser.add_argument(
        "--organism", choices=["human", "mouse"], default="human",
    )
    parser.add_argument(
        "--scorers", nargs="+", default=["RNA_SEQ", "ATAC", "SPLICE_SITES"],
        help="Scorer keys from RECOMMENDED_VARIANT_SCORERS",
    )
    parser.add_argument(
        "--sequence-length", choices=list(SEQUENCE_LENGTH_MAP.keys()), default="1MB",
    )
    parser.add_argument(
        "--max-variants", type=int, default=100,
    )
    parser.add_argument("--local-model", action="store_true")
    parser.add_argument("--test-fixture", default=None,
                        help="Test fixture JSON for CI testing (bypasses API)")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    return parser.parse_args()


def main():
    """Entry point: configure stderr logging, run, and map failures to exit codes.

    Exit status is 130 on Ctrl-C and 1 on any other unhandled exception.
    """
    args = parse_arguments()

    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stderr)],
    )

    try:
        run(args)
    except KeyboardInterrupt:
        logging.error("Interrupted")
        sys.exit(130)
    except Exception as e:
        logging.error("Fatal error: %s", e)
        if args.verbose:
            logging.exception("Details:")
        sys.exit(1)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/generate_test_fixtures.sh Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,205 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Regenerate fixture JSON files for all 5 AlphaGenome Galaxy tools. +# Runs each tool against the real API with the same params as the Galaxy <test> +# sections, then converts the output to the fixture format. +# +# Usage: +# export ALPHAGENOME_API_KEY=<your-key> +# bash tools/alphagenome/generate_test_fixtures.sh + +if [[ -z "${ALPHAGENOME_API_KEY:-}" ]]; then + echo "ERROR: ALPHAGENOME_API_KEY is not set" >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +TMPDIR=$(mktemp -d) +trap 'rm -rf "$TMPDIR"' EXIT + +PASS=0 +FAIL=0 +MAX_FIXTURE_ROWS=30 + +run_tool() { + local name="$1" + shift + echo "--- Running $name ---" + if python3 "$@" 2>"$TMPDIR/${name}.log"; then + tail -3 "$TMPDIR/${name}.log" + echo " OK" + PASS=$((PASS + 1)) + else + tail -10 "$TMPDIR/${name}.log" >&2 + echo " FAILED (see log above)" >&2 + FAIL=$((FAIL + 1)) + return 1 + fi +} + +# ── 1. 
variant_effect (VCF output → fixture JSON) ────────────────────── + +run_tool "variant_effect" \ + alphagenome_variant_effect.py \ + --input test-data/test_input.vcf \ + --output "$TMPDIR/variant_effect.vcf" \ + --api-key "$ALPHAGENOME_API_KEY" \ + --output-types RNA_SEQ \ + --sequence-length 128KB \ + --max-variants 3 \ + --verbose + +python3 -c ' +import json, sys + +# Reverse the INFO_FIELD_MAP from the tool script +FIELD_TO_TYPE = { + "AG_RNA_LFC": "RNA_SEQ", + "AG_ATAC_LFC": "ATAC", + "AG_CAGE_LFC": "CAGE", + "AG_DNASE_LFC": "DNASE", + "AG_HISTONE_LFC": "CHIP_HISTONE", + "AG_TF_LFC": "CHIP_TF", + "AG_SPLICE_LFC": "SPLICE_SITES", + "AG_SPLICEUSE_LFC": "SPLICE_SITE_USAGE", + "AG_SPLICEJNC_LFC": "SPLICE_JUNCTIONS", + "AG_CONTACT_LFC": "CONTACT_MAPS", + "AG_PROCAP_LFC": "PROCAP", +} + +variants = [] +for line in open(sys.argv[1]): + if line.startswith("#"): + continue + fields = line.strip().split("\t") + chrom, pos, ref, alt = fields[0], int(fields[1]), fields[3], fields[4] + info = fields[7] + scores = {} + for kv in info.split(";"): + if "=" not in kv: + continue + key, val = kv.split("=", 1) + if key in FIELD_TO_TYPE: + scores[FIELD_TO_TYPE[key]] = round(float(val), 6) + variants.append({"chrom": chrom, "pos": pos, "ref": ref, "alt": alt, "scores": scores}) + +with open(sys.argv[2], "w") as f: + json.dump({"variants": variants}, f, indent=2) + f.write("\n") +print(f" -> wrote {len(variants)} variants to {sys.argv[2]}") +' "$TMPDIR/variant_effect.vcf" \ + "test-data/fixture_variant_effect.json" + +# ── Helper: TSV → fixture JSON ────────────────────────────────────────── + +tsv_to_fixture() { + local tsv_file="$1" + local json_file="$2" + python3 -c ' +import csv, json, sys + +max_rows = int(sys.argv[3]) + +with open(sys.argv[1]) as f: + reader = csv.reader(f, delimiter="\t") + columns = next(reader) + rows = [] + total = 0 + for row in reader: + total += 1 + if len(rows) >= max_rows: + continue + typed = [] + for val in row: + try: + typed.append(int(val)) + 
except ValueError: + try: + typed.append(float(val)) + except ValueError: + typed.append(val) + rows.append(typed) + +with open(sys.argv[2], "w") as f: + json.dump({"columns": columns, "rows": rows}, f, indent=2) + f.write("\n") +if total > max_rows: + print(f" -> wrote {len(columns)} columns, {len(rows)}/{total} rows (capped) to {sys.argv[2]}") +else: + print(f" -> wrote {len(columns)} columns, {len(rows)} rows to {sys.argv[2]}") +' "$tsv_file" "$json_file" "$MAX_FIXTURE_ROWS" +} + +# ── 2. variant_scorer (TSV output) ───────────────────────────────────── + +run_tool "variant_scorer" \ + alphagenome_variant_scorer.py \ + --input test-data/test_input.vcf \ + --output "$TMPDIR/variant_scorer.tsv" \ + --api-key "$ALPHAGENOME_API_KEY" \ + --scorers RNA_SEQ \ + --sequence-length 16KB \ + --max-variants 3 \ + --verbose + +tsv_to_fixture "$TMPDIR/variant_scorer.tsv" \ + "test-data/fixture_variant_scorer.json" + +# ── 3. ism_scanner (TSV output) ──────────────────────────────────────── + +run_tool "ism_scanner" \ + alphagenome_ism_scanner.py \ + --input test-data/test_regions.bed \ + --output "$TMPDIR/ism_scanner.tsv" \ + --api-key "$ALPHAGENOME_API_KEY" \ + --scorers RNA_SEQ \ + --sequence-length 16KB \ + --max-regions 1 \ + --max-region-width 10 \ + --verbose + +tsv_to_fixture "$TMPDIR/ism_scanner.tsv" \ + "test-data/fixture_ism_scanner.json" + +# ── 4. interval_predictor (TSV output) ───────────────────────────────── + +run_tool "interval_predictor" \ + alphagenome_interval_predictor.py \ + --input test-data/test_intervals.bed \ + --output "$TMPDIR/interval_predictor.tsv" \ + --api-key "$ALPHAGENOME_API_KEY" \ + --output-types RNA_SEQ \ + --sequence-length 16KB \ + --max-intervals 3 \ + --output-mode summary \ + --verbose + +tsv_to_fixture "$TMPDIR/interval_predictor.tsv" \ + "test-data/fixture_interval_predictor.json" + +# ── 5. 
sequence_predictor (TSV output) ───────────────────────────────── + +run_tool "sequence_predictor" \ + alphagenome_sequence_predictor.py \ + --input test-data/test_sequences.fa \ + --output "$TMPDIR/sequence_predictor.tsv" \ + --api-key "$ALPHAGENOME_API_KEY" \ + --output-types RNA_SEQ \ + --sequence-length 16KB \ + --max-sequences 2 \ + --output-mode summary \ + --verbose + +tsv_to_fixture "$TMPDIR/sequence_predictor.tsv" \ + "test-data/fixture_sequence_predictor.json" + +# ── Summary ───────────────────────────────────────────────────────────── + +echo "" +echo "=== Done: $PASS passed, $FAIL failed ===" +if [[ $FAIL -gt 0 ]]; then + exit 1 +fi
<macros>
    <!-- macros.xml: tokens and XML fragments shared by the AlphaGenome Galaxy tools. -->
    <token name="@TOOL_VERSION@">0.6.1</token>
    <token name="@VERSION_SUFFIX@">0</token>
    <token name="@PROFILE@">25.0</token>
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">alphagenome</requirement>
            <requirement type="package" version="0.31.4">cyvcf2</requirement>
            <!-- The API key is injected as the ALPHAGENOME_API_KEY environment variable. -->
            <credentials name="alphagenome" version="1.0" label="AlphaGenome API" description="API key from Google DeepMind AlphaGenome">
                <secret name="api_key" inject_as_env="ALPHAGENOME_API_KEY" label="API Key" description="Your AlphaGenome API key"/>
            </credentials>
        </requirements>
    </xml>
    <xml name="organism_param">
        <param name="organism" type="select" label="Organism">
            <option value="human" selected="true">Human (hg38)</option>
            <option value="mouse">Mouse (mm10)</option>
        </param>
    </xml>
    <xml name="sequence_length_param">
        <param name="sequence_length" type="select" label="Prediction window size" help="Genomic context window in bases around the target region (e.g. 1 Mb = 1,048,576 bases, not bytes)">
            <option value="16KB">16 kb</option>
            <option value="128KB">128 kb</option>
            <option value="512KB">512 kb</option>
            <option value="1MB" selected="true">1 Mb</option>
        </param>
    </xml>
    <xml name="output_types_param">
        <param name="output_types" type="select" multiple="true" label="Output types to predict">
            <option value="RNA_SEQ" selected="true">RNA-seq</option>
            <option value="ATAC">ATAC-seq</option>
            <option value="CAGE">CAGE</option>
            <option value="DNASE">DNase</option>
            <option value="CHIP_HISTONE">ChIP-seq histone</option>
            <option value="CHIP_TF">ChIP-seq TF</option>
            <option value="SPLICE_SITES">Splice sites</option>
            <option value="PROCAP">PRO-cap</option>
            <validator type="no_options" message="Select at least one output type"/>
        </param>
    </xml>
    <xml name="ontology_terms_param">
        <param name="ontology_terms" type="text" value="" label="Ontology terms (optional)" help="Comma-separated UBERON/CL terms for tissue context, e.g. UBERON:0002107,CL:0000746">
            <!-- Anchored with ^...$ on purpose: Galaxy regex validators only match at the
                 start of the value, so an unanchored "[...]*" pattern matches the empty
                 prefix of any string and never rejects anything. -->
            <validator type="regex" message="Only alphanumeric characters, colons, commas, and spaces are allowed">^[A-Za-z0-9:, ]*$</validator>
        </param>
    </xml>
    <xml name="test_fixture_param">
        <!-- Hidden parameter; only set by tool tests to select a fixture file. -->
        <param name="test_fixture" type="hidden" value=""/>
    </xml>
    <token name="@CMD_ONTOLOGY_TERMS@"><![CDATA[
        #if str($ontology_terms).strip()
            --ontology-terms '$ontology_terms'
        #end if
    ]]></token>
    <token name="@CMD_TEST_FIXTURE@"><![CDATA[
        #if $test_fixture
            --test-fixture '$__tool_directory__/$test_fixture'
        #end if
    ]]></token>
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/s41586-025-10014-0</citation>
        </citations>
    </xml>
</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fixture_interval_predictor.json Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,345 @@ +{ + "columns": [ + "chrom", + "start", + "end", + "name", + "output_type", + "track_name", + "ontology_curie", + "mean_signal", + "max_signal" + ], + "rows": [ + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000047", + 1.4e-05, + 5.9e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000062", + 4.2e-05, + 0.0001 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000084", + 2.4e-05, + 0.000137 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000084", + 1.5e-05, + 4.4e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000115", + 0.000209, + 0.001122 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000127", + 6e-06, + 1.8e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000134", + 8e-06, + 3.6e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000137", + 2e-05, + 8.4e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000138", + 1.6e-05, + 7.2e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000169", + 0.000116, + 0.000366 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000182", + 3e-05, + 9e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000187", + 9.1e-05, + 0.000439 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000192", + 0.000107, + 0.000298 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000221", + 2.7e-05, + 0.000164 + ], + [ 
+ "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000222", + 4.3e-05, + 0.000254 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000223", + 2.6e-05, + 0.000135 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000223", + 2.8e-05, + 6.6e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000236", + 1.1e-05, + 4.1e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000236", + 5e-06, + 1.7e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000307", + 4.2e-05, + 8.2e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000312", + 1e-05, + 2.2e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000346", + 8.1e-05, + 0.000198 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000351", + 0.000123, + 0.000305 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000515", + 2.1e-05, + 9e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000515", + 8.4e-05, + 0.000626 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000594", + 8e-05, + 0.00022 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000623", + 2.6e-05, + 0.000101 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000623", + 3.4e-05, + 9.8e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000624", + 9e-06, + 3.4e-05 + ], + [ + "chr22", + 36201500, + 36202500, + "test_interval_1", + "RNA_SEQ", + "", + "CL:0000625", + 9e-06, + 2.9e-05 + ] + ] +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fixture_ism_scanner.json Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,438 @@ +{ + "columns": [ + "region", + "position", + "ref_base", + "alt_base", + "gene_id", + "gene_name", + "gene_type", + "scorer", + "track_name", + "ontology_curie", + "raw_score", + "quantile_score" + ], + "rows": [ + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000047 polyA plus RNA-seq", + "CL:0000047", + -1.721732, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000062 total RNA-seq", + "CL:0000062", + -2.872142, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000084 polyA plus RNA-seq", + "CL:0000084", + -0.43112, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000084 total RNA-seq", + "CL:0000084", + -0.323944, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000115 total RNA-seq", + "CL:0000115", + -1.891316, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000127 total RNA-seq", + "CL:0000127", + -2.695157, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000134 polyA plus RNA-seq", + "CL:0000134", + -2.826576, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000137 total RNA-seq", + "CL:0000137", + -2.560256, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + 
"protein_coding", + "RNA_SEQ", + "CL:0000138 total RNA-seq", + "CL:0000138", + -2.42312, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000169 total RNA-seq", + "CL:0000169", + -0.795844, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000182 total RNA-seq", + "CL:0000182", + -1.609686, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000187 total RNA-seq", + "CL:0000187", + -3.523364, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000192 total RNA-seq", + "CL:0000192", + -1.707838, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000221 polyA plus RNA-seq", + "CL:0000221", + -1.927729, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000222 polyA plus RNA-seq", + "CL:0000222", + -2.618365, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000223 polyA plus RNA-seq", + "CL:0000223", + -2.313126, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000223 total RNA-seq", + "CL:0000223", + -1.154866, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000236 polyA plus RNA-seq", + "CL:0000236", + -0.299119, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000236 total RNA-seq", 
+ "CL:0000236", + -0.133313, + -0.999941 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000307 total RNA-seq", + "CL:0000307", + -1.233429, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000312 polyA plus RNA-seq", + "CL:0000312", + -0.757559, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000346 total RNA-seq", + "CL:0000346", + -2.658391, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000351 polyA plus RNA-seq", + "CL:0000351", + -1.029845, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000515 polyA plus RNA-seq", + "CL:0000515", + -3.428352, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000515 total RNA-seq", + "CL:0000515", + -3.587678, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000594 total RNA-seq", + "CL:0000594", + -3.365285, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000623 polyA plus RNA-seq", + "CL:0000623", + -0.693357, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000623 total RNA-seq", + "CL:0000623", + -0.57456, + -0.99998 + ], + [ + "test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000624 total RNA-seq", + "CL:0000624", + -0.244827, + -0.99998 + ], + [ + 
"test_region_1", + 36201696, + "T", + "A", + "ENSG00000100336.18", + "APOL4", + "protein_coding", + "RNA_SEQ", + "CL:0000625 total RNA-seq", + "CL:0000625", + -0.203565, + -0.999972 + ] + ] +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fixture_sequence_predictor.json Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,283 @@ +{ + "columns": [ + "sequence_id", + "sequence_length", + "output_type", + "track_name", + "ontology_curie", + "mean_signal", + "max_signal" + ], + "rows": [ + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000047", + 6.3e-05, + 8.2e-05 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000062", + 9.1e-05, + 0.000117 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000084", + 0.000152, + 0.000211 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000084", + 0.000308, + 0.000469 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000115", + 0.000518, + 0.000832 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000127", + 8.1e-05, + 0.000119 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000134", + 8.5e-05, + 0.000133 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000137", + 0.000436, + 0.000717 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000138", + 0.000336, + 0.000553 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000169", + 0.000233, + 0.000366 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000182", + 8.6e-05, + 0.000111 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000187", + 9.1e-05, + 0.000121 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000192", + 0.00032, + 0.000454 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000221", + 0.001081, + 0.001839 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000222", + 0.001735, + 0.00351 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000223", + 0.00145, + 0.002502 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000223", + 0.00069, + 0.001114 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000236", + 0.000311, + 0.000443 + ], 
+ [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000236", + 0.000246, + 0.000418 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000307", + 0.000283, + 0.000401 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000312", + 0.000164, + 0.000211 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000346", + 9.6e-05, + 0.00013 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000351", + 8.7e-05, + 0.000128 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000515", + 9.9e-05, + 0.000143 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000515", + 0.000128, + 0.000172 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000594", + 0.000181, + 0.000259 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000623", + 0.000139, + 0.00018 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000623", + 0.000363, + 0.000565 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000624", + 0.000282, + 0.000452 + ], + [ + "test_promoter_1", + 180, + "RNA_SEQ", + "", + "CL:0000625", + 0.00025, + 0.000406 + ] + ] +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fixture_variant_effect.json Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,31 @@ +{ + "variants": [ + { + "chrom": "chr22", + "pos": 36201698, + "ref": "A", + "alt": "C", + "scores": { + "RNA_SEQ": 9.74931 + } + }, + { + "chrom": "chr22", + "pos": 36201750, + "ref": "G", + "alt": "T", + "scores": { + "RNA_SEQ": 0.794512 + } + }, + { + "chrom": "chr22", + "pos": 36202000, + "ref": "C", + "alt": "A", + "scores": { + "RNA_SEQ": 0.960974 + } + } + ] +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fixture_variant_scorer.json Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,779 @@ +{ + "columns": [ + "variant_id", + "scored_interval", + "gene_id", + "gene_name", + "gene_type", + "gene_strand", + "junction_Start", + "junction_End", + "output_type", + "variant_scorer", + "track_name", + "track_strand", + "Assay title", + "ontology_curie", + "biosample_name", + "biosample_type", + "biosample_life_stage", + "gtex_tissue", + "data_source", + "endedness", + "genetically_modified", + "raw_score", + "quantile_score" + ], + "rows": [ + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000047 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000047", + "neuronal stem cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -1.534143, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000062 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000062", + "osteoblast", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -2.4925048, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000084 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000084", + "T-cell", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -0.48444033, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000084 total RNA-seq", + 
"-", + "total RNA-seq", + "CL:0000084", + "T-cell", + "primary_cell", + "adult", + "", + "encode", + "single", + "False", + -0.3864045, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000115 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000115", + "endothelial cell", + "in_vitro_differentiated_cells", + "adult", + "", + "encode", + "single", + "False", + -1.7566066, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000127 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000127", + "astrocyte", + "primary_cell", + "unknown", + "", + "encode", + "paired", + "False", + -2.2548208, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000134 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000134", + "mesenchymal stem cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -2.487202, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000137 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000137", + "osteocyte", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "single", + "False", + -2.1787338, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000138 total RNA-seq", + "-", + "total 
RNA-seq", + "CL:0000138", + "chondrocyte", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "single", + "False", + -2.0341547, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000169 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000169", + "type B pancreatic cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "single", + "False", + -0.5709431, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000182 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000182", + "hepatocyte", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -1.3045998, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000187 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000187", + "myocyte", + "in_vitro_differentiated_cells", + "adult", + "", + "encode", + "paired", + "False", + -3.1106043, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000192 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000192", + "smooth muscle cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -1.362958, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000221 polyA plus 
RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000221", + "ectodermal cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -1.7337704, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000222 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000222", + "mesodermal cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -2.3969855, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000223 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000223", + "endodermal cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -2.0788126, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000223 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000223", + "endodermal cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "single", + "False", + -1.0922494, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000236 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000236", + "B cell", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -0.3181591, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + 
"GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000236 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000236", + "B cell", + "primary_cell", + "adult", + "", + "encode", + "single", + "False", + -0.17071676, + -0.9999627 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000307 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000307", + "tracheal epithelial cell", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -1.0373387, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000312 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000312", + "keratinocyte", + "primary_cell", + "unknown", + "", + "encode", + "paired", + "False", + -0.59641504, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000346 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000346", + "hair follicle dermal papilla cell", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -2.2779784, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000351 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000351", + "trophoblast cell", + "in_vitro_differentiated_cells", + "embryonic", + "", + "encode", + "paired", + "False", + -0.8832159, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + 
"GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000515 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000515", + "skeletal muscle myoblast", + "primary_cell", + "unknown", + "", + "encode", + "paired", + "False", + -3.0128946, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000515 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000515", + "skeletal muscle myoblast", + "primary_cell", + "unknown", + "", + "encode", + "paired", + "False", + -3.1980705, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000594 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000594", + "skeletal muscle satellite cell", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -3.0219214, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000623 polyA plus RNA-seq", + "-", + "polyA plus RNA-seq", + "CL:0000623", + "natural killer cell", + "primary_cell", + "adult", + "", + "encode", + "paired", + "False", + -0.789505, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000623 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000623", + "natural killer cell", + "primary_cell", + "adult", + "", + "encode", + "single", + "False", + -0.672935, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + 
"GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000624 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000624", + "CD4-positive, alpha-beta T cell", + "primary_cell", + "adult", + "", + "encode", + "single", + "False", + -0.309525, + -0.99998 + ], + [ + "chr22:36201698:A>C", + "chr22:36193506-36209890:.", + "ENSG00000100336", + "APOL4", + "protein_coding", + "-", + "", + "", + "RNA_SEQ", + "GeneMaskLFCScorer(requested_output=RNA_SEQ)", + "CL:0000625 total RNA-seq", + "-", + "total RNA-seq", + "CL:0000625", + "CD8-positive, alpha-beta T cell", + "primary_cell", + "adult", + "", + "encode", + "single", + "False", + -0.26380253, + -0.99998 + ] + ] +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input.vcf Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> +##contig=<ID=chr22,length=50818468> +#CHROM POS ID REF ALT QUAL FILTER INFO +chr22 36201698 . A C . PASS . +chr22 36201750 . G T . PASS . +chr22 36202000 . C A . PASS .
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_intervals.bed Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,3 @@ +chr22 36201500 36202500 test_interval_1 +chr22 36210000 36211000 test_interval_2 +chr22 36220000 36220500 test_interval_3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_regions.bed Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,2 @@ +chr22 36201690 36201710 test_region_1 +chr22 36201990 36202010 test_region_2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_sequences.fa Fri Mar 20 20:09:58 2026 +0000 @@ -0,0 +1,7 @@ +>test_promoter_1 +ATGCGTACGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCATGCATGCATG +CGATCGATCGATCGTAGCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATCGTAGCT +AGCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATCGTAGCTAGCTAGCTAGCATGC +>test_enhancer_1 +GCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATCGTAGCTAGCTAGCTAGCATGCA +TGCATGCGATCGATCGATCGTAGCTAGCTAGCTAGCATGCATGCATGCGATCGATCGATC
