view find_str.py @ 4:5f8efb080f49 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/microsatbed commit 891fc6321cd94c9a63c880d75989d79521f1a9b6
author iuc
date Sat, 14 Sep 2024 12:17:02 +0000
parents 2b970db61912
children
line wrap: on
line source

import argparse
import subprocess

import pytrf  # 1.3.0
from pyfastx import Fastx  # 0.5.2

"""
Allows all STR or those for a subset of motifs to be written to a bed file
Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
Note that there are only four possible types of dinucleotide repeat, because CA = AC = GT = TG, GA = AG = CT = TC, AT = TA, and GC = CG.
"""


def getDensity(name, bed, chrlen, winwidth):
    """
    pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them.
    Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig
    """
    nwin = int(chrlen / winwidth)
    d = [0.0 for x in range(nwin + 1)]
    for b in bed:
        nt = b[5]
        bin = int(b[1] / winwidth)
        d[bin] += nt
    bedg = [
        (name, (x * winwidth), ((x + 1) * winwidth), float(d[x]))
        for x in range(nwin + 1)
        if (x + 1) * winwidth <= chrlen
    ]
    return bedg


def write_ssrs(args):
    """
    The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
    ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
    NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
    The reversed pair STRs are about as common in the documentation sample.
    Sequence read bias might be influenced by GC density or some other specific motif.
    """
    bed = []
    wig = []
    chrlens = {}
    specific = None
    if args.specific:
        specific = args.specific.upper().split(",")
    fa = Fastx(args.fasta, uppercase=True)
    for name, seq in fa:
        chrlen = len(seq)
        chrlens[name] = chrlen
        cbed = []
        for ssr in pytrf.STRFinder(
            name,
            seq,
            args.monomin,
            args.dimin,
            args.trimin,
            args.tetramin,
            args.pentamin,
            args.hexamin,
        ):
            row = (
                ssr.chrom,
                ssr.start,
                ssr.end,
                ssr.motif,
                ssr.repeat,
                ssr.length,
            )
            if args.specific and ssr.motif in specific:
                cbed.append(row)
            elif args.mono and len(ssr.motif) == 1:
                cbed.append(row)
            elif args.di and len(ssr.motif) == 2:
                cbed.append(row)
            elif args.tri and len(ssr.motif) == 3:
                cbed.append(row)
            elif args.tetra and len(ssr.motif) == 4:
                cbed.append(row)
            elif args.penta and len(ssr.motif) == 5:
                cbed.append(row)
            elif args.hexa and len(ssr.motif) == 6:
                cbed.append(row)
        if args.bigwig:
            w = getDensity(name, cbed, chrlen, args.winwidth)
            wig.extend(w)
        bed.extend(cbed)
    if args.bigwig:
        wig.sort()
        with open("temp.bedg", "w") as bw:
            for row in wig:
                bw.write("%s %d %d %.2f\n" % row)
        chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()]
        with open("temp.chromlen", "w") as cl:
            cl.write("\n".join(chroms))
        cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed]
        subprocess.run(cmd)
    else:
        bed.sort()
        with open(args.bed, "w") as outbed:
            for row in bed:
                outbed.write("%s\t%d\t%d\t%s_%d\t%d\n" % row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    a = parser.add_argument
    a("--di", action="store_true")
    a("--tri", action="store_true")
    a("--tetra", action="store_true")
    a("--penta", action="store_true")
    a("--hexa", action="store_true")
    a("--mono", action="store_true")
    a("--dimin", default=2, type=int)
    a("--trimin", default=2, type=int)
    a("--tetramin", default=2, type=int)
    a("--pentamin", default=2, type=int)
    a("--hexamin", default=2, type=int)
    a("--monomin", default=2, type=int)
    a("-f", "--fasta", default="humsamp.fa")
    a("-b", "--bed", default="humsamp.bed")
    a("--bigwig", action="store_true")
    a("--winwidth", default=128, type=int)
    a("--specific", default=None)
    a("--minreps", default=2, type=int)
    args = parser.parse_args()
    write_ssrs(args)