comparison find_str.py @ 8:01c16e8fbc91 draft

planemo upload for repository https://github.com/fubar2/microsatbed commit 80a8c0db54b6e2cab9dfe7178b1e5b3b39592f2c-dirty
author fubar
date Tue, 13 Aug 2024 04:50:47 +0000
parents 4ff60fb9ca4d
children
comparison
equal deleted inserted replaced
7:f27be15cc58d 8:01c16e8fbc91
1 import argparse 1 import argparse
2 import subprocess
2 3
3 import pytrf # 1.3.0 4 import pytrf # 1.3.0
4 from pyfastx import Fastx # 0.5.2 5 from pyfastx import Fastx # 0.5.2
5 6
6 """ 7 """
7 Allows all STR or those for a subset of motifs to be written to a bed file 8 Allows all STR or those for a subset of motifs to be written to a bed file
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. 9 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
9 """ 10 """
11
12
13 def getDensity(name, bed, chrlen, winwidth):
14 """
15 pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them.
16 Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig
17 """
18 nwin = int(chrlen / winwidth)
19 d = [0.0 for x in range(nwin + 1)]
20 for b in bed:
21 nt = b[5]
22 bin = int(b[1] / winwidth)
23 d[bin] += nt
24 bedg = [
25 (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x]))
26 for x in range(nwin + 1)
27 if (x + 1) * winwidth <= chrlen
28 ]
29 return bedg
10 30
11 31
12 def write_ssrs(args): 32 def write_ssrs(args):
13 """ 33 """
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats 34 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. 36 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
17 The reversed pair STRs are about as common in the documentation sample. 37 The reversed pair STRs are about as common in the documentation sample.
18 Sequence read bias might be influenced by GC density or some other specific motif. 38 Sequence read bias might be influenced by GC density or some other specific motif.
19 """ 39 """
20 bed = [] 40 bed = []
41 wig = []
42 chrlens = {}
21 specific = None 43 specific = None
22 if args.specific: 44 if args.specific:
23 specific = args.specific.upper().split(",") 45 specific = args.specific.upper().split(",")
24 fa = Fastx(args.fasta, uppercase=True) 46 fa = Fastx(args.fasta, uppercase=True)
25 for name, seq in fa: 47 for name, seq in fa:
26 if args.specific: 48 chrlen = len(seq)
27 ssrs = pytrf.STRFinder( 49 chrlens[name] = chrlen
28 name, 50 cbed = []
29 seq, 51 for ssr in pytrf.STRFinder(
30 args.minreps, 52 name,
31 args.minreps, 53 seq,
32 args.minreps, 54 args.monomin,
33 args.minreps, 55 args.dimin,
34 args.minreps, 56 args.trimin,
35 args.minreps, 57 args.tetramin,
36 ) 58 args.pentamin,
37 else: 59 args.hexamin,
38 ssrs = pytrf.STRFinder( 60 ):
39 name,
40 seq,
41 args.monomin,
42 args.dimin,
43 args.trimin,
44 args.tetramin,
45 args.pentamin,
46 args.hexamin,
47 )
48 for ssr in ssrs:
49 row = ( 61 row = (
50 ssr.chrom, 62 ssr.chrom,
51 ssr.start - 1, 63 ssr.start,
52 ssr.end, 64 ssr.end,
53 ssr.motif, 65 ssr.motif,
54 ssr.repeat, 66 ssr.repeat,
55 ssr.length, 67 ssr.length,
56 ) 68 )
57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
58 if args.specific and ssr.motif in specific: 69 if args.specific and ssr.motif in specific:
59 bed.append(row) 70 cbed.append(row)
60 elif args.mono and len(ssr.motif) == 1: 71 elif args.mono and len(ssr.motif) == 1:
61 bed.append(row) 72 cbed.append(row)
62 elif args.di and len(ssr.motif) == 2: 73 elif args.di and len(ssr.motif) == 2:
63 bed.append(row) 74 cbed.append(row)
64 elif args.tri and len(ssr.motif) == 3: 75 elif args.tri and len(ssr.motif) == 3:
65 bed.append(row) 76 cbed.append(row)
66 elif args.tetra and len(ssr.motif) == 4: 77 elif args.tetra and len(ssr.motif) == 4:
67 bed.append(row) 78 cbed.append(row)
68 elif args.penta and len(ssr.motif) == 5: 79 elif args.penta and len(ssr.motif) == 5:
69 bed.append(row) 80 cbed.append(row)
70 elif args.hexa and len(ssr.motif) == 6: 81 elif args.hexa and len(ssr.motif) == 6:
71 bed.append(row) 82 cbed.append(row)
72 bed.sort() 83 if args.bigwig:
73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] 84 w = getDensity(name, cbed, chrlen, args.winwidth)
74 with open(args.bed, "w") as outbed: 85 wig += w
75 outbed.write("\n".join(obed)) 86 bed += cbed
76 outbed.write("\n") 87 if args.bigwig:
88 wig.sort()
89 bedg = ["%s %d %d %.2f" % x for x in wig]
90 with open("temp.bedg", "w") as bw:
91 bw.write("\n".join(bedg))
92 chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()]
93 with open("temp.chromlen", "w") as cl:
94 cl.write("\n".join(chroms))
95 cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed]
96 subprocess.run(cmd)
97 else:
98 bed.sort()
99 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
100 with open(args.bed, "w") as outbed:
101 outbed.write("\n".join(obed))
102 outbed.write("\n")
77 103
78 104
79 if __name__ == "__main__": 105 if __name__ == "__main__":
80 parser = argparse.ArgumentParser() 106 parser = argparse.ArgumentParser()
81 a = parser.add_argument 107 a = parser.add_argument
91 a("--pentamin", default=2, type=int) 117 a("--pentamin", default=2, type=int)
92 a("--hexamin", default=2, type=int) 118 a("--hexamin", default=2, type=int)
93 a("--monomin", default=2, type=int) 119 a("--monomin", default=2, type=int)
94 a("-f", "--fasta", default="humsamp.fa") 120 a("-f", "--fasta", default="humsamp.fa")
95 a("-b", "--bed", default="humsamp.bed") 121 a("-b", "--bed", default="humsamp.bed")
122 a("--bigwig", action="store_true")
123 a("--winwidth", default=128, type=int)
96 a("--specific", default=None) 124 a("--specific", default=None)
97 a("--minreps", default=2, type=int) 125 a("--minreps", default=2, type=int)
98 args = parser.parse_args() 126 args = parser.parse_args()
99 write_ssrs(args) 127 write_ssrs(args)