Mercurial > repos > fubar > microsatbedfubar
comparison find_str.py @ 8:01c16e8fbc91 draft
planemo upload for repository https://github.com/fubar2/microsatbed commit 80a8c0db54b6e2cab9dfe7178b1e5b3b39592f2c-dirty
author | fubar |
---|---|
date | Tue, 13 Aug 2024 04:50:47 +0000 |
parents | 4ff60fb9ca4d |
children |
comparison
equal
deleted
inserted
replaced
7:f27be15cc58d | 8:01c16e8fbc91 |
---|---|
1 import argparse | 1 import argparse |
2 import subprocess | |
2 | 3 |
3 import pytrf # 1.3.0 | 4 import pytrf # 1.3.0 |
4 from pyfastx import Fastx # 0.5.2 | 5 from pyfastx import Fastx # 0.5.2 |
5 | 6 |
6 """ | 7 """ |
7 Allows all STR or those for a subset of motifs to be written to a bed file | 8 Allows all STR or those for a subset of motifs to be written to a bed file |
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. | 9 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. |
9 """ | 10 """ |
11 | |
12 | |
13 def getDensity(name, bed, chrlen, winwidth): | |
14 """ | |
15 pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them. | |
16 Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig | |
17 """ | |
18 nwin = int(chrlen / winwidth) | |
19 d = [0.0 for x in range(nwin + 1)] | |
20 for b in bed: | |
21 nt = b[5] | |
22 bin = int(b[1] / winwidth) | |
23 d[bin] += nt | |
24 bedg = [ | |
25 (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x])) | |
26 for x in range(nwin + 1) | |
27 if (x + 1) * winwidth <= chrlen | |
28 ] | |
29 return bedg | |
10 | 30 |
11 | 31 |
12 def write_ssrs(args): | 32 def write_ssrs(args): |
13 """ | 33 """ |
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats | 34 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats |
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. | 36 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. |
17 The reversed pair STRs are about as common in the documentation sample. | 37 The reversed pair STRs are about as common in the documentation sample. |
18 Sequence read bias might be influenced by GC density or some other specific motif. | 38 Sequence read bias might be influenced by GC density or some other specific motif. |
19 """ | 39 """ |
20 bed = [] | 40 bed = [] |
41 wig = [] | |
42 chrlens = {} | |
21 specific = None | 43 specific = None |
22 if args.specific: | 44 if args.specific: |
23 specific = args.specific.upper().split(",") | 45 specific = args.specific.upper().split(",") |
24 fa = Fastx(args.fasta, uppercase=True) | 46 fa = Fastx(args.fasta, uppercase=True) |
25 for name, seq in fa: | 47 for name, seq in fa: |
26 if args.specific: | 48 chrlen = len(seq) |
27 ssrs = pytrf.STRFinder( | 49 chrlens[name] = chrlen |
28 name, | 50 cbed = [] |
29 seq, | 51 for ssr in pytrf.STRFinder( |
30 args.minreps, | 52 name, |
31 args.minreps, | 53 seq, |
32 args.minreps, | 54 args.monomin, |
33 args.minreps, | 55 args.dimin, |
34 args.minreps, | 56 args.trimin, |
35 args.minreps, | 57 args.tetramin, |
36 ) | 58 args.pentamin, |
37 else: | 59 args.hexamin, |
38 ssrs = pytrf.STRFinder( | 60 ): |
39 name, | |
40 seq, | |
41 args.monomin, | |
42 args.dimin, | |
43 args.trimin, | |
44 args.tetramin, | |
45 args.pentamin, | |
46 args.hexamin, | |
47 ) | |
48 for ssr in ssrs: | |
49 row = ( | 61 row = ( |
50 ssr.chrom, | 62 ssr.chrom, |
51 ssr.start - 1, | 63 ssr.start, |
52 ssr.end, | 64 ssr.end, |
53 ssr.motif, | 65 ssr.motif, |
54 ssr.repeat, | 66 ssr.repeat, |
55 ssr.length, | 67 ssr.length, |
56 ) | 68 ) |
57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths | |
58 if args.specific and ssr.motif in specific: | 69 if args.specific and ssr.motif in specific: |
59 bed.append(row) | 70 cbed.append(row) |
60 elif args.mono and len(ssr.motif) == 1: | 71 elif args.mono and len(ssr.motif) == 1: |
61 bed.append(row) | 72 cbed.append(row) |
62 elif args.di and len(ssr.motif) == 2: | 73 elif args.di and len(ssr.motif) == 2: |
63 bed.append(row) | 74 cbed.append(row) |
64 elif args.tri and len(ssr.motif) == 3: | 75 elif args.tri and len(ssr.motif) == 3: |
65 bed.append(row) | 76 cbed.append(row) |
66 elif args.tetra and len(ssr.motif) == 4: | 77 elif args.tetra and len(ssr.motif) == 4: |
67 bed.append(row) | 78 cbed.append(row) |
68 elif args.penta and len(ssr.motif) == 5: | 79 elif args.penta and len(ssr.motif) == 5: |
69 bed.append(row) | 80 cbed.append(row) |
70 elif args.hexa and len(ssr.motif) == 6: | 81 elif args.hexa and len(ssr.motif) == 6: |
71 bed.append(row) | 82 cbed.append(row) |
72 bed.sort() | 83 if args.bigwig: |
73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | 84 w = getDensity(name, cbed, chrlen, args.winwidth) |
74 with open(args.bed, "w") as outbed: | 85 wig += w |
75 outbed.write("\n".join(obed)) | 86 bed += cbed |
76 outbed.write("\n") | 87 if args.bigwig: |
88 wig.sort() | |
89 bedg = ["%s %d %d %.2f" % x for x in wig] | |
90 with open("temp.bedg", "w") as bw: | |
91 bw.write("\n".join(bedg)) | |
92 chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()] | |
93 with open("temp.chromlen", "w") as cl: | |
94 cl.write("\n".join(chroms)) | |
95 cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed] | |
96 subprocess.run(cmd) | |
97 else: | |
98 bed.sort() | |
99 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | |
100 with open(args.bed, "w") as outbed: | |
101 outbed.write("\n".join(obed)) | |
102 outbed.write("\n") | |
77 | 103 |
78 | 104 |
79 if __name__ == "__main__": | 105 if __name__ == "__main__": |
80 parser = argparse.ArgumentParser() | 106 parser = argparse.ArgumentParser() |
81 a = parser.add_argument | 107 a = parser.add_argument |
91 a("--pentamin", default=2, type=int) | 117 a("--pentamin", default=2, type=int) |
92 a("--hexamin", default=2, type=int) | 118 a("--hexamin", default=2, type=int) |
93 a("--monomin", default=2, type=int) | 119 a("--monomin", default=2, type=int) |
94 a("-f", "--fasta", default="humsamp.fa") | 120 a("-f", "--fasta", default="humsamp.fa") |
95 a("-b", "--bed", default="humsamp.bed") | 121 a("-b", "--bed", default="humsamp.bed") |
122 a("--bigwig", action="store_true") | |
123 a("--winwidth", default=128, type=int) | |
96 a("--specific", default=None) | 124 a("--specific", default=None) |
97 a("--minreps", default=2, type=int) | 125 a("--minreps", default=2, type=int) |
98 args = parser.parse_args() | 126 args = parser.parse_args() |
99 write_ssrs(args) | 127 write_ssrs(args) |