Mercurial > repos > fubar > microsatbed
changeset 20:410144c7b2d6 draft
planemo upload for repository https://github.com/fubar2/microsatbed commit d952bc313f408735456747c3d33e09a3170c8f59-dirty
| author | fubar | 
|---|---|
| date | Wed, 17 Jul 2024 12:08:15 +0000 | 
| parents | db5523378e5c | 
| children | 8406413cb4aa | 
| files | find_str.py humsamp.bed microsatbed.xml test-data/bed_sample test-data/dibed_sample test-data/dibed_wig_sample test.bw | 
| diffstat | 7 files changed, 18751 insertions(+), 18678 deletions(-) [+] | 
line wrap: on
 line diff
--- a/find_str.py Wed Jul 17 07:40:00 2024 +0000 +++ b/find_str.py Wed Jul 17 12:08:15 2024 +0000 @@ -1,4 +1,7 @@ import argparse +import shutil + +import pybigtools import pytrf # 1.3.0 from pyfastx import Fastx # 0.5.2 @@ -8,6 +11,15 @@ Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. """ +def getDensity(name, bed, len, winwidth): + nwin = int(len / winwidth) + d = [0.0 for x in range(nwin+1)] + for b in bed: + nt = b[5] + bin = int(b[1]/winwidth) + d[bin] += nt + dw = [(name,x*winwidth,(x+1)*winwidth,float(d[x])) for x in range(nwin+1) if (x+1)*winwidth <= len] + return dw def write_ssrs(args): """ @@ -18,11 +30,14 @@ Sequence read bias might be influenced by GC density or some other specific motif. """ bed = [] + wig = [] + chrlens = {} specific = None if args.specific: specific = args.specific.upper().split(",") fa = Fastx(args.fasta, uppercase=True) for name, seq in fa: + cbed = [] for ssr in pytrf.STRFinder( name, seq, @@ -43,24 +58,35 @@ ) # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths if args.specific and ssr.motif in specific: - bed.append(row) + cbed.append(row) elif args.mono and len(ssr.motif) == 1: - bed.append(row) + cbed.append(row) elif args.di and len(ssr.motif) == 2: - bed.append(row) + cbed.append(row) elif args.tri and len(ssr.motif) == 3: - bed.append(row) + cbed.append(row) elif args.tetra and len(ssr.motif) == 4: - bed.append(row) + cbed.append(row) elif args.penta and len(ssr.motif) == 5: - bed.append(row) + cbed.append(row) elif args.hexa and len(ssr.motif) == 6: - bed.append(row) - bed.sort() - obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] - with open(args.bed, "w") as outbed: - outbed.write("\n".join(obed)) - outbed.write("\n") + cbed.append(row) + bed += cbed + if args.bigwig: + chrlens[name] = len(seq) + w = getDensity(name, cbed, len(seq), args.winwidth) + wig += w + if args.bigwig: + wig.sort() + bw = pybigtools.open("temp.bw", 'w') + bw.write(chrlens,wig) + shutil.move("temp.bw", args.bed) + else: + bed.sort() + obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] + with open(args.bed, "w") as outbed: + outbed.write("\n".join(obed)) + outbed.write("\n") if __name__ == "__main__": @@ -80,6 +106,8 @@ a("--monomin", default=2, type=int) a("-f", "--fasta", default="humsamp.fa") a("-b", "--bed", default="humsamp.bed") + a("--bigwig", action="store_true") + a("--winwidth", default=128, type=int) a("--specific", default=None) a("--minreps", default=2, type=int) args = parser.parse_args()
--- a/microsatbed.xml Wed Jul 17 07:40:00 2024 +0000 +++ b/microsatbed.xml Wed Jul 17 12:08:15 2024 +0000 @@ -4,6 +4,7 @@ <requirement version="3.12.3" type="package">python</requirement> <requirement version="2.1.0" type="package">pyfastx</requirement> <requirement version="1.3.0" type="package">pytrf</requirement> + <requirement type="package" version="0.1.4">pybigtools</requirement> </requirements> <required_files> <include path="find_str.py"/> @@ -26,6 +27,10 @@ --bed '$bed' #if $mode_cond.mode == "SPECIFIC": --specific '$mode_cond.specific' + #elif $mode_cond.mode == "SPECIFICBW": + --bigwig + --winwidth '$mode_cond.winwidth' + --specific '$mode_cond.specific' #else: #if "MONO" in $mode_cond.subset: --mono @@ -52,6 +57,10 @@ --tetramin '$tetramin' --pentamin '$pentamin' --hexamin '$hexamin' + #if $mode_cond.mode == "SPECIFICBW": + --bigwig + --winwidth '$mode_cond.winwidth' + #end if #end if ]]></command> <inputs> @@ -73,24 +82,41 @@ <conditional name="mode_cond"> <param name="mode" type="select" label="Select patterns by motif length; or provide a specific motif pattern to report?" help="Choose *By length:* or *By pattern:* to configure STR selection mode"> <option selected="True" value="ALL">By length: Report all motifs of one or more specified lengths (1-6nt) as bed features</option> + <option value="ALLBW">By length as windowed bigwig: Report all motifs of one or more specified lengths (1-6nt) as windowed density</option> <option value="SPECIFIC">By motif: Report one or more specific motifs (such as TCA,GC) as bed features</option> + <option value="SPECIFICBW">By motif as windowed bigwig: Report one or more specific motifs (such as TCA,GC) as windowed density</option> <option value="NATIVE">All exact STR: use the pytrf findstr native command to a create csv, tsv or gtf output</option> </param> <when value="ALL"> <param name="subset" type="select" multiple="true" optional="false" label="Select at least 1 specific motif length to report" help="Bed features will be output for every motif of the selected length(s) with the minimum required repeats or more"> <option value="DI" selected="true">All dimers (AC,AG,AT,...)</option> - <option value="TRI">All trimers (ACG,..)</option> - <option value="TETRA">All tetramers (ACGT,..)</option> + <option value="TRI">All trimers (ACG,..)</option> <option value="PENTA">All pentamers (ACGTC,..)</option> <option value="HEXA">All hexamers (ACGTCG,..)</option> <option value="MONO">All monomers (A,C...). Warning! Can produce overwhelming numbers of bed features</option> </param> </when> + <when value="ALLBW"> + <param name="subset" type="select" multiple="true" optional="false" label="Select at least 1 specific motif length to report" + help="Bed features will be output for every motif of the selected length(s) with the minimum required repeats or more"> + <option value="DI" selected="true">All dimers (AC,AG,AT,...)</option> + <option value="TRI">All trimers (ACG,..)</option> + <option value="PENTA">All pentamers (ACGTC,..)</option> + <option value="HEXA">All hexamers (ACGTCG,..)</option> + <option value="MONO">All monomers (A,C...). Warning! Can produce overwhelming numbers of bed features</option> + </param> + <param type="integer" min="5" name="winwidth" label="Window with for estimating STR bigwig density" value="128"/> + </when> <when value="SPECIFIC"> <param name="specific" type="text" label="Supply a specific motif pattern. Separate multiple patterns with commas such as GA,GC" help="Make bed features only for the nominated specific motifs." optional="false"/> </when> + <when value="SPECIFICBW"> + <param name="specific" type="text" label="Supply a specific motif pattern. Separate multiple patterns with commas such as GA,GC" + help="Make bed features only for the nominated specific motifs." optional="false"/> + <param type="integer" min="5" name="winwidth" label="Window with for estimating STR bigwig density" value="128"/> + </when> <when value="NATIVE"> <param name="outformat" type="select" optional="false" label="Select the required output format" help="Pytrf can create GFF, CSV or TSV output files. Documentation is linked in the help section below"> @@ -113,6 +139,8 @@ <when input="mode_cond.outformat" value="gff" format="gff" /> <when input="mode_cond.outformat" value="csv" format="csv" /> <when input="mode_cond.outformat" value="tsv" format="tabular" /> + <when input="mode_cond.mode" value="ALLBW" format="bigwig" /> + <when input="mode_cond.mode" value="SPECIFICBW" format="bigwig" /> </change_format> </data> </outputs> @@ -168,6 +196,23 @@ <param name="hexamin" value="2"/> <output name="bed" value="nativegff_sample" compare="diff" lines_diff="0"/> </test> + <test expect_num_outputs="1"> + <conditional name="reference_genome"> + <param name="genome_type_select" value="history"/> + <param name="fasta" value="humsamp.fa"/> + </conditional> + <conditional name="mode_cond"> + <param name="mode" value="SPECIFICBW"/> + <param name="specific" value="GC"/> + </conditional> + <param name="monomin" value="20"/> + <param name="dimin" value="1"/> + <param name="trimin" value="20"/> + <param name="tetramin" value="20"/> + <param name="pentamin" value="20"/> + <param name="hexamin" value="20"/> + <output name="bed" value="dibed_wig_sample" compare="sim_size" delta="10"/> + </test> </tests> <help><