Mercurial > repos > iuc > microsatbed
comparison find_str.py @ 0:2b970db61912 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/microsatbed commit 275acb787c01484c6e435c8864090d377c3fde75
author | iuc |
---|---|
date | Sun, 21 Jul 2024 07:19:00 +0000 |
parents | |
children | 5f8efb080f49 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2b970db61912 |
---|---|
1 import argparse | |
2 import subprocess | |
3 | |
4 import pytrf # 1.3.0 | |
5 from pyfastx import Fastx # 0.5.2 | |
6 | |
7 """ | |
8 Allows all STR or those for a subset of motifs to be written to a bed file | |
9 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. | |
10 """ | |
11 | |
12 | |
13 def getDensity(name, bed, chrlen, winwidth): | |
14 """ | |
15 pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them. | |
16 Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig | |
17 """ | |
18 nwin = int(chrlen / winwidth) | |
19 d = [0.0 for x in range(nwin + 1)] | |
20 for b in bed: | |
21 nt = b[5] | |
22 bin = int(b[1] / winwidth) | |
23 d[bin] += nt | |
24 bedg = [ | |
25 (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x])) | |
26 for x in range(nwin + 1) | |
27 if (x + 1) * winwidth <= chrlen | |
28 ] | |
29 return bedg | |
30 | |
31 | |
32 def write_ssrs(args): | |
33 """ | |
34 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats | |
35 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3) | |
36 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. | |
37 The reversed pair STRs are about as common in the documentation sample. | |
38 Sequence read bias might be influenced by GC density or some other specific motif. | |
39 """ | |
40 bed = [] | |
41 wig = [] | |
42 chrlens = {} | |
43 specific = None | |
44 if args.specific: | |
45 specific = args.specific.upper().split(",") | |
46 fa = Fastx(args.fasta, uppercase=True) | |
47 for name, seq in fa: | |
48 chrlen = len(seq) | |
49 chrlens[name] = chrlen | |
50 cbed = [] | |
51 for ssr in pytrf.STRFinder( | |
52 name, | |
53 seq, | |
54 args.monomin, | |
55 args.dimin, | |
56 args.trimin, | |
57 args.tetramin, | |
58 args.pentamin, | |
59 args.hexamin, | |
60 ): | |
61 row = ( | |
62 ssr.chrom, | |
63 ssr.start, | |
64 ssr.end, | |
65 ssr.motif, | |
66 ssr.repeat, | |
67 ssr.length, | |
68 ) | |
69 if args.specific and ssr.motif in specific: | |
70 cbed.append(row) | |
71 elif args.mono and len(ssr.motif) == 1: | |
72 cbed.append(row) | |
73 elif args.di and len(ssr.motif) == 2: | |
74 cbed.append(row) | |
75 elif args.tri and len(ssr.motif) == 3: | |
76 cbed.append(row) | |
77 elif args.tetra and len(ssr.motif) == 4: | |
78 cbed.append(row) | |
79 elif args.penta and len(ssr.motif) == 5: | |
80 cbed.append(row) | |
81 elif args.hexa and len(ssr.motif) == 6: | |
82 cbed.append(row) | |
83 if args.bigwig: | |
84 w = getDensity(name, cbed, chrlen, args.winwidth) | |
85 wig += w | |
86 bed += cbed | |
87 if args.bigwig: | |
88 wig.sort() | |
89 bedg = ["%s %d %d %.2f" % x for x in wig] | |
90 with open("temp.bedg", "w") as bw: | |
91 bw.write("\n".join(bedg)) | |
92 chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()] | |
93 with open("temp.chromlen", "w") as cl: | |
94 cl.write("\n".join(chroms)) | |
95 cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed] | |
96 subprocess.run(cmd) | |
97 else: | |
98 bed.sort() | |
99 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | |
100 with open(args.bed, "w") as outbed: | |
101 outbed.write("\n".join(obed)) | |
102 outbed.write("\n") | |
103 | |
104 | |
105 if __name__ == "__main__": | |
106 parser = argparse.ArgumentParser() | |
107 a = parser.add_argument | |
108 a("--di", action="store_true") | |
109 a("--tri", action="store_true") | |
110 a("--tetra", action="store_true") | |
111 a("--penta", action="store_true") | |
112 a("--hexa", action="store_true") | |
113 a("--mono", action="store_true") | |
114 a("--dimin", default=2, type=int) | |
115 a("--trimin", default=2, type=int) | |
116 a("--tetramin", default=2, type=int) | |
117 a("--pentamin", default=2, type=int) | |
118 a("--hexamin", default=2, type=int) | |
119 a("--monomin", default=2, type=int) | |
120 a("-f", "--fasta", default="humsamp.fa") | |
121 a("-b", "--bed", default="humsamp.bed") | |
122 a("--bigwig", action="store_true") | |
123 a("--winwidth", default=128, type=int) | |
124 a("--specific", default=None) | |
125 a("--minreps", default=2, type=int) | |
126 args = parser.parse_args() | |
127 write_ssrs(args) |