comparison find_str.py @ 0:2b970db61912 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/microsatbed commit 275acb787c01484c6e435c8864090d377c3fde75
author iuc
date Sun, 21 Jul 2024 07:19:00 +0000
parents
children 5f8efb080f49
comparison
equal deleted inserted replaced
-1:000000000000 0:2b970db61912
1 import argparse
2 import subprocess
3
4 import pytrf # 1.3.0
5 from pyfastx import Fastx # 0.5.2
6
7 """
8 Allows all STR or those for a subset of motifs to be written to a bed file
9 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
10 """
11
12
13 def getDensity(name, bed, chrlen, winwidth):
14 """
15 pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them.
16 Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig
17 """
18 nwin = int(chrlen / winwidth)
19 d = [0.0 for x in range(nwin + 1)]
20 for b in bed:
21 nt = b[5]
22 bin = int(b[1] / winwidth)
23 d[bin] += nt
24 bedg = [
25 (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x]))
26 for x in range(nwin + 1)
27 if (x + 1) * winwidth <= chrlen
28 ]
29 return bedg
30
31
32 def write_ssrs(args):
33 """
34 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
35 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
36 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
37 The reversed pair STRs are about as common in the documentation sample.
38 Sequence read bias might be influenced by GC density or some other specific motif.
39 """
40 bed = []
41 wig = []
42 chrlens = {}
43 specific = None
44 if args.specific:
45 specific = args.specific.upper().split(",")
46 fa = Fastx(args.fasta, uppercase=True)
47 for name, seq in fa:
48 chrlen = len(seq)
49 chrlens[name] = chrlen
50 cbed = []
51 for ssr in pytrf.STRFinder(
52 name,
53 seq,
54 args.monomin,
55 args.dimin,
56 args.trimin,
57 args.tetramin,
58 args.pentamin,
59 args.hexamin,
60 ):
61 row = (
62 ssr.chrom,
63 ssr.start,
64 ssr.end,
65 ssr.motif,
66 ssr.repeat,
67 ssr.length,
68 )
69 if args.specific and ssr.motif in specific:
70 cbed.append(row)
71 elif args.mono and len(ssr.motif) == 1:
72 cbed.append(row)
73 elif args.di and len(ssr.motif) == 2:
74 cbed.append(row)
75 elif args.tri and len(ssr.motif) == 3:
76 cbed.append(row)
77 elif args.tetra and len(ssr.motif) == 4:
78 cbed.append(row)
79 elif args.penta and len(ssr.motif) == 5:
80 cbed.append(row)
81 elif args.hexa and len(ssr.motif) == 6:
82 cbed.append(row)
83 if args.bigwig:
84 w = getDensity(name, cbed, chrlen, args.winwidth)
85 wig += w
86 bed += cbed
87 if args.bigwig:
88 wig.sort()
89 bedg = ["%s %d %d %.2f" % x for x in wig]
90 with open("temp.bedg", "w") as bw:
91 bw.write("\n".join(bedg))
92 chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()]
93 with open("temp.chromlen", "w") as cl:
94 cl.write("\n".join(chroms))
95 cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed]
96 subprocess.run(cmd)
97 else:
98 bed.sort()
99 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
100 with open(args.bed, "w") as outbed:
101 outbed.write("\n".join(obed))
102 outbed.write("\n")
103
104
105 if __name__ == "__main__":
106 parser = argparse.ArgumentParser()
107 a = parser.add_argument
108 a("--di", action="store_true")
109 a("--tri", action="store_true")
110 a("--tetra", action="store_true")
111 a("--penta", action="store_true")
112 a("--hexa", action="store_true")
113 a("--mono", action="store_true")
114 a("--dimin", default=2, type=int)
115 a("--trimin", default=2, type=int)
116 a("--tetramin", default=2, type=int)
117 a("--pentamin", default=2, type=int)
118 a("--hexamin", default=2, type=int)
119 a("--monomin", default=2, type=int)
120 a("-f", "--fasta", default="humsamp.fa")
121 a("-b", "--bed", default="humsamp.bed")
122 a("--bigwig", action="store_true")
123 a("--winwidth", default=128, type=int)
124 a("--specific", default=None)
125 a("--minreps", default=2, type=int)
126 args = parser.parse_args()
127 write_ssrs(args)