Mercurial > repos > lldelisle > fromgtftobed12
changeset 0:418e4d0fe0bd draft
planemo upload for repository https://github.com/lldelisle/tools-lldelisle/tree/master/tools/fromgtfTobed12 commit 1aaffda5b95e0389e315179345642c0d005867c1
author | lldelisle |
---|---|
date | Fri, 04 Nov 2022 15:37:12 +0000 |
parents | |
children | 6fd4b3b90220 |
files | fromgtfTobed12.py fromgtfTobed12.xml test-data/Homo_sapiens.GRCh38.95_491firstLines.gtf.gz test-data/test.bed test-data/testMergeExons.bed test-data/testMergeNotUCSC.bed test-data/testWithGenes.bed |
diffstat | 7 files changed, 528 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fromgtfTobed12.py Fri Nov 04 15:37:12 2022 +0000 @@ -0,0 +1,150 @@
"""Convert a GTF file to BED12 with one line per transcript or per gene."""
import argparse
import sys
import warnings

import gffutils

# gffutils warns when the GTF already contains gene/transcript lines.
# ``message`` is a regular expression matched against the START of the
# warning text, so this common prefix covers both the wording used before
# gffutils v0.10 ("`disable_infer_genes` option") and the one used since
# ("`disable_infer_genes=True` option").
for _feature_kind in ("gene", "transcript"):
    warnings.filterwarnings(
        "ignore",
        message="It appears you have a %s feature in your GTF file"
                % _feature_kind)


def _attribute_or_none(feature, key):
    """Return the first value of attribute ``key`` of ``feature`` or None."""
    try:
        return feature.attributes[key][0]
    except (KeyError, IndexError):
        return None


def _feature_name(feature, exons, key):
    """Return the display name of ``feature``.

    The attribute ``key`` is looked up on the feature itself, then on its
    exons (in start order); the feature id is used when none of them
    carries the attribute.
    """
    for candidate in [feature] + exons:
        name = _attribute_or_none(candidate, key)
        if name is not None:
            return name
    return feature.id


def _merge_overlapping_exons(exons):
    """Merge overlapping exons into ``(start0, length)`` blocks.

    ``exons`` must be sorted by start. Coordinates are converted from the
    1-based closed GTF convention to 0-based half-open BED starts.
    Exons that merely abut (no shared base) are kept as separate blocks.
    """
    spans = []
    for exon in exons:
        # spans[-1][1] is the 1-based inclusive end of the current block.
        if spans and exon.start <= spans[-1][1]:
            spans[-1][1] = max(spans[-1][1], exon.end)
        else:
            spans.append([exon.start - 1, exon.end])
    return [(start, end - start) for start, end in spans]


def convert_gtf_to_bed(fn, fo, useGene, mergeTranscripts,
                       mergeTranscriptsAndOverlappingExons, ucsc):
    """Write the content of a GTF file as BED12.

    Parameters:
        fn: path of the input GTF, passed to ``gffutils.create_db``.
        fo: writable text file object receiving the BED12 lines.
        useGene: name the lines with ``gene_name`` instead of
            ``transcript_name``.
        mergeTranscripts: write one line per gene, with the exons of all
            its transcripts as blocks (blocks may overlap or be repeated).
        mergeTranscriptsAndOverlappingExons: write one line per gene and
            merge overlapping exons into single blocks.
        ucsc: prefix chromosome names with 'chr' when they do not already
            start with it.
    """
    db = gffutils.create_db(fn, ':memory:')
    one_line_per_gene = (mergeTranscripts
                         or mergeTranscriptsAndOverlappingExons)
    if useGene or one_line_per_gene:
        preferred_name = "gene_name"
    else:
        preferred_name = "transcript_name"
    item_type = "gene" if one_line_per_gene else "transcript"
    for tr in db.features_of_type(item_type, order_by='start'):
        # Query the exons once: they are used for the name fallback and
        # for the blocks.
        exons = list(db.children(tr, featuretype='exon', order_by='start'))
        trName = _feature_name(tr, exons, preferred_name)
        # The gtf uses 1-based closed intervals and bed uses 0-based
        # half-open ones, so one is removed from each start.
        bed_start = tr.start - 1
        # If the CDS is defined in the gtf, it gives the thick start/end.
        # With one entry per gene, CDS segments of different transcripts
        # can be nested, so the thick end is the largest CDS end and not
        # the end of the CDS that starts last.
        cds = list(db.children(tr, featuretype='CDS'))
        if cds:
            cds_start = min(c.start for c in cds) - 1
            cds_end = max(c.end for c in cds)
        else:
            # If the CDS is not defined, both are set to the start
            # as proposed here:
            # https://genome.ucsc.edu/FAQ/FAQformat.html#format1
            cds_start = bed_start
            cds_end = bed_start
        if mergeTranscriptsAndOverlappingExons:
            blocks = _merge_overlapping_exons(exons)
        else:
            blocks = [(e.start - 1, len(e)) for e in exons]
        if not blocks:
            # BED12 needs at least one block: without any exon the whole
            # feature is reported as a single block.
            blocks = [(bed_start, tr.end - bed_start)]
        # Rewrite the chromosome name if needed:
        chrom = tr.chrom
        if ucsc and not chrom.startswith('chr'):
            chrom = 'chr' + chrom
        fo.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n" %
                 (chrom, bed_start, tr.end, trName, 0, tr.strand,
                  cds_start, cds_end, "0", len(blocks),
                  ",".join([str(length) for _, length in blocks]),
                  ",".join([str(start - bed_start) for start, _ in blocks])))


def _build_parser():
    """Build the command line parser."""
    argp = argparse.ArgumentParser(
        description=("Convert a gtf to a bed12 with one entry"
                     " per transcript/gene"))
    argp.add_argument('input', default=None,
                      help="Input gtf file (can be gzip).")
    argp.add_argument('--output', default=sys.stdout,
                      type=argparse.FileType('w'),
                      help="Output bed12 file.")
    argp.add_argument('--useGene', action="store_true",
                      help="Use the gene name instead of the "
                      "transcript name.")
    argp.add_argument('--ucscformat', action="store_true",
                      help="If you want that all chromosome names "
                      "begin with 'chr'.")
    group = argp.add_mutually_exclusive_group()
    group.add_argument('--mergeTranscripts', action="store_true",
                       help="Merge all transcripts into a single "
                       "entry to have one line per gene.")
    group.add_argument('--mergeTranscriptsAndOverlappingExons',
                       action="store_true",
                       help="Merge all transcripts into a single "
                       "entry to have one line per gene and merge"
                       " overlapping exons.")
    return argp


def main(argv=None):
    """Parse ``argv`` (default: ``sys.argv[1:]``) and run the conversion."""
    args = _build_parser().parse_args(argv)
    try:
        convert_gtf_to_bed(args.input, args.output, args.useGene,
                           args.mergeTranscripts,
                           args.mergeTranscriptsAndOverlappingExons,
                           args.ucscformat)
    finally:
        # Make sure the bed file is flushed and closed; stdout is left open.
        if args.output is not sys.stdout:
            args.output.close()


if __name__ == "__main__":
    main()
<!-- Diff header: /dev/null (Thu Jan 01 00:00:00 1970 +0000) to b/fromgtfTobed12.xml (Fri Nov 04 15:37:12 2022 +0000), @@ -0,0 +1,76 @@ -->
<tool id="fromgtfTobed12" name="fromgtftobed12" version="0.11.1+galaxy0">
    <description>Convert a gtf to a bed12.</description>
    <requirements>
        <requirement type="package" version="0.11.1">gffutils</requirement>
    </requirements>
    <stdio>
        <!-- Anything other than zero is an error -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
        <!-- In case the return code has not been set properly, check stderr too -->
        <regex match="Error:" />
        <regex match="Exception:" />
    </stdio>
    <command>
<![CDATA[
        ## Paths are quoted so that directories containing spaces or
        ## shell metacharacters cannot break the command line.
        python3 '$__tool_directory__/fromgtfTobed12.py'
            $useGene
            $mergeTranscripts
            $ucscformat
            --output '$output'
            '$input'
]]>
    </command>
    <inputs>
        <param name="input" multiple="false" type="data" format="gtf" label="Select the gtf to convert."/>
        <param argument="--useGene" type="boolean" checked="False" truevalue="--useGene" falsevalue="" label="Uses the gene name instead of the transcript name."/>
        <!-- The two merge modes are mutually exclusive in the script, hence a single select. -->
        <param name="mergeTranscripts" type="select" label="Do you want to merge all transcripts of a gene in a single line?">
            <option value="" selected="true">No</option>
            <option value="--mergeTranscripts">Yes</option>
            <option value="--mergeTranscriptsAndOverlappingExons">Yes and merge overlapping exons</option>
        </param>
        <param argument="--ucscformat" type="boolean" checked="True" truevalue="--ucscformat" falsevalue="" label="If you want that all chromosome names begin with 'chr'."/>
    </inputs>

    <outputs>
        <data format="bed" name="output" label="$input.name as bed12"/>
    </outputs>

    <tests>
        <!-- Boolean parameters are set with true/false in tests, not with their truevalue/falsevalue strings. -->
        <test>
            <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
            <param name="ucscformat" value="true"/>
            <output name="output" file="test.bed"/>
        </test>
        <test>
            <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
            <param name="ucscformat" value="true"/>
            <param name="useGene" value="true"/>
            <output name="output" file="testWithGenes.bed"/>
        </test>
        <test>
            <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
            <param name="mergeTranscripts" value="--mergeTranscripts"/>
            <param name="useGene" value="true"/>
            <param name="ucscformat" value="false"/>
            <output name="output" file="testMergeNotUCSC.bed"/>
        </test>
        <test>
            <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
            <param name="mergeTranscripts" value="--mergeTranscriptsAndOverlappingExons"/>
            <param name="useGene" value="true"/>
            <param name="ucscformat" value="false"/>
            <output name="output" file="testMergeExons.bed"/>
        </test>
    </tests>
    <help><![CDATA[
    This tool uses gffutils to convert gtf to bed12.
    It writes one line per transcript, or one line per gene when transcripts are merged.
    It will use as names transcript_name or gene_name when available, and the feature id otherwise.
]]>    </help>
    <citations>
        <citation type="bibtex">@online{gffutils,
  url = {https://pythonhosted.org/gffutils/contents.html}
  }
        </citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.bed Fri Nov 04 15:37:12 2022 +0000 @@ -0,0 +1,105 @@ +chr1 11868 14409 DDX11L1-202 0 + 11868 11868 0 3 359,109,1189 0,744,1352 +chr1 12009 13670 DDX11L1-201 0 + 12009 12009 0 6 48,49,85,78,154,218 0,169,603,965,1211,1443 +chr1 14403 29570 WASH7P-201 0 - 14403 14403 0 11 98,34,152,159,198,136,137,147,99,154,37 0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130 +chr1 17368 17436 MIR6859-1-201 0 - 17368 17368 0 1 68 0 +chr1 29553 31097 MIR1302-2HG-202 0 + 29553 29553 0 3 486,104,122 0,1010,1422 +chr1 30266 31109 MIR1302-2HG-201 0 + 30266 30266 0 2 401,134 0,709 +chr1 30365 30503 MIR1302-2-201 0 + 30365 30365 0 1 138 0 +chr1 34553 36081 FAM138A-201 0 - 34553 34553 0 3 621,205,361 0,723,1167 +chr1 35244 36073 FAM138A-202 0 - 35244 35244 0 2 237,353 0,476 +chr1 52472 53312 OR4G4P-201 0 + 52472 52472 0 1 840 0 +chr1 57597 64116 OR4G11P-202 0 + 57597 57597 0 3 56,157,1201 0,1102,5318 +chr1 62948 63887 OR4G11P-201 0 + 62948 62948 0 1 939 0 +chr1 65418 71585 OR4F5-202 0 + 65564 70005 0 3 15,54,2549 0,101,3618 +chr1 69054 70108 OR4F5-201 0 + 69090 70005 0 1 1054 0 +chr1 89294 120932 AL627309.1-201 0 - 89294 89294 0 4 2335,150,105,158 0,2796,23405,31480 +chr1 89550 91105 AL627309.3-201 0 - 89550 89550 0 2 500,819 0,736 +chr1 92229 129217 AL627309.1-202 0 - 92229 92229 0 4 11,105,212,163 0,20470,28491,36825 +chr1 110952 129173 AL627309.1-203 0 - 110952 110952 0 3 405,105,119 0,1747,18102 +chr1 120724 133723 AL627309.1-205 0 - 120724 120724 0 4 145,59,169,350 0,149,8330,12649 +chr1 129080 133566 AL627309.1-204 0 - 129080 129080 0 2 143,193 0,4293 +chr1 131024 134836 CICP27-201 0 + 131024 131024 0 1 3812 0 +chr1 135140 135895 AL627309.6-201 0 - 135140 135140 0 1 755 0 +chr1 137681 137965 AL627309.7-201 0 - 137681 137681 0 1 284 0 +chr1 139789 140339 AL627309.2-201 0 - 139789 139789 0 2 58,265 0,285 +chr1 141473 149707 AL627309.5-201 0 - 141473 141473 0 2 1538,3322 0,4912 +chr1 142807 146831 AL627309.5-202 0 - 
142807 142807 0 3 204,124,190 0,3578,3834 +chr1 146385 173862 AL627309.5-203 0 - 146385 146385 0 8 124,65,529,59,66,216,132,110 0,9381,17877,19498,21714,22663,26171,27367 +chr1 157783 157887 RNU6-1100P-201 0 - 157783 157783 0 1 104 0 +chr1 160445 161525 AL627309.4-201 0 + 160445 160445 0 2 245,212 0,868 +chr1 165888 168767 AL627309.5-204 0 - 165888 165888 0 3 54,66,158 0,2211,2721 +chr1 182695 184174 FO538757.1-201 0 + 182695 182695 0 5 51,85,78,162,194 0,436,798,1044,1285 +chr1 185216 195411 WASH9P-201 0 - 185216 185216 0 10 134,69,153,159,202,136,137,146,112,149 0,274,1100,1912,2159,2538,2913,3222,3574,10046 +chr1 187890 187958 MIR6859-2-201 0 - 187890 187890 0 1 68 0 +chr1 257863 264733 AP006222.1-201 0 - 257863 257863 0 2 1162,130 0,6740 +chr1 257912 268816 AP006222.1-202 0 - 257912 257912 0 4 1113,85,902,150 0,3637,9390,10754 +chr1 258143 359681 AP006222.1-203 0 - 258143 258143 0 4 882,902,135,337 0,98541,99905,101201 +chr1 258523 268816 AP006222.1-204 0 - 258523 258523 0 3 502,902,150 0,8779,10143 +chr1 258567 259024 AP006222.1-205 0 - 258567 258567 0 1 457 0 +chr1 263014 297502 AP006222.1-206 0 - 263014 263014 0 4 5190,150,105,158 0,5652,26251,34330 +chr1 347981 348366 RPL23AP24-201 0 - 347981 347981 0 1 385 0 +chr1 358856 365704 AL732372.1-201 0 + 358856 358856 0 2 73,534 0,6314 +chr1 358871 365510 AL732372.1-202 0 + 358871 358871 0 2 86,340 0,6299 +chr1 360056 366052 AL732372.1-203 0 + 360056 360056 0 2 112,882 0,5114 +chr1 365388 366151 AL732372.2-201 0 - 365388 365388 0 2 304,133 0,630 +chr1 365394 368450 AL732372.2-202 0 - 365394 365394 0 2 298,200 0,2856 +chr1 365614 379972 AL732372.2-203 0 - 365614 365614 0 3 78,180,204 0,7529,14154 +chr1 373181 485208 AL732372.2-204 0 - 373181 373181 0 3 142,102,169 0,6587,111858 +chr1 439869 440232 WBP1LP7-201 0 + 439869 439869 0 1 363 0 +chr1 450702 451697 OR4F29-201 0 - 450742 451678 0 1 995 0 +chr1 476363 497259 AL732372.2-205 0 - 476363 476363 0 3 582,169,151 0,8676,20745 +chr1 484831 495476 AL732372.2-206 0 - 
484831 484831 0 3 377,58,200 0,10160,10445 +chr1 485025 485208 AL732372.2-207 0 - 485025 485025 0 1 183 0 +chr1 485065 489553 AL732372.2-208 0 - 485065 485065 0 2 143,193 0,4295 +chr1 487100 489906 CICP7-201 0 + 487100 487100 0 2 2287,190 0,2616 +chr1 491224 493241 AL732372.3-201 0 - 491224 491224 0 2 765,474 0,1543 +chr1 494381 496605 AL732372.2-209 0 - 494381 494381 0 2 205,342 0,1882 +chr1 494463 502508 AL732372.2-210 0 - 494463 494463 0 5 435,58,191,65,44 0,528,2645,7092,8001 +chr1 494474 495368 AL732372.2-211 0 - 494474 494474 0 3 424,58,92 0,517,802 +chr1 494610 499175 AL732372.2-212 0 - 494610 494610 0 3 288,58,492 0,381,4073 +chr1 494770 498976 AL732372.2-213 0 - 494770 494770 0 5 128,58,191,58,293 0,221,2338,3628,3913 +chr1 497133 498456 AL732372.2-214 0 - 497133 497133 0 3 166,233,58 0,939,1265 +chr1 497204 502598 AL732372.2-215 0 - 497204 497204 0 6 24,233,58,65,57,134 0,868,1194,4351,4982,5260 +chr1 497209 502873 AL732372.2-216 0 - 497209 497209 0 4 90,58,65,409 0,1189,4346,5255 +chr1 497239 499002 AL732372.2-217 0 - 497239 497239 0 4 60,259,58,319 0,807,1159,1444 +chr1 497244 502598 AL732372.2-218 0 - 497244 497244 0 5 55,233,58,65,134 0,828,1154,4311,5220 +chr1 497274 498976 AL732372.2-219 0 - 497274 497274 0 2 25,578 0,1124 +chr1 498280 499175 AL732372.2-220 0 - 498280 498280 0 3 25,58,492 0,118,403 +chr1 498983 501607 AL732372.2-221 0 - 498983 498983 0 2 386,52 0,2572 +chr1 501587 517252 AL732372.2-222 0 - 501587 501587 0 5 33,94,124,65,68 0,1274,3392,12771,15597 +chr1 501603 517225 AL732372.2-223 0 - 501603 501603 0 5 17,197,124,65,70 0,861,3376,12755,15552 +chr1 504469 514413 AL732372.2-224 0 - 504469 504469 0 2 464,55 0,9889 +chr1 504864 522928 AL732372.2-225 0 - 504864 504864 0 4 239,65,82,70 0,9494,12320,17994 +chr1 516375 516479 RF00026.90-201 0 - 516375 516375 0 1 104 0 +chr1 586070 612813 AL669831.3-201 0 - 586070 586070 0 6 288,135,128,180,102,73 0,750,8558,15327,21884,26670 +chr1 586277 588453 AL669831.3-202 0 - 586277 586277 0 3 
81,135,337 0,543,1839 +chr1 586944 720194 AL669831.3-203 0 - 586944 586944 0 4 11,105,212,163 0,116740,124766,133087 +chr1 587628 594768 AC114498.1-201 0 + 587628 587628 0 2 73,534 0,6606 +chr1 587667 594574 AC114498.1-202 0 + 587667 587667 0 2 62,340 0,6567 +chr1 594190 633129 AL669831.3-204 0 - 594190 594190 0 5 566,180,102,88,86 0,7207,13764,34728,38853 +chr1 594197 631204 AL669831.3-205 0 - 594197 594197 0 6 559,180,102,124,88,74 0,7200,13757,18543,34721,36933 +chr1 594307 598551 AL669831.3-206 0 - 594307 594307 0 2 449,1253 0,2991 +chr1 594307 827769 AL669831.3-207 0 - 594307 594307 0 4 449,180,212,100 0,7090,117403,233362 +chr1 594307 827796 AL669831.3-208 0 - 594307 594307 0 5 449,180,102,33,127 0,7090,13647,104619,233362 +chr1 594457 733064 AL669831.3-209 0 - 594457 594457 0 8 299,180,102,33,158,169,191,84 0,6940,13497,104469,117307,125574,137559,138523 +chr1 601435 720200 AL669831.3-210 0 - 601435 601435 0 3 142,102,169 0,6519,118596 +chr1 627376 631150 AL669831.3-211 0 - 627376 627376 0 4 447,263,88,20 0,584,1542,3754 +chr1 629061 629433 MTND1P23-201 0 + 629061 629061 0 1 372 0 +chr1 629639 630683 MTND2P28-201 0 + 629639 629639 0 1 1044 0 +chr1 631073 632616 MTCO1P12-201 0 + 631073 631073 0 1 1543 0 +chr1 632324 632413 MIR6723-201 0 - 632324 632324 0 1 89 0 +chr1 632756 633438 MTCO2P12-201 0 + 632756 632756 0 1 682 0 +chr1 633534 633741 MTATP8P1-201 0 + 633534 633534 0 1 207 0 +chr1 633695 634376 MTATP6P1-201 0 + 633695 633695 0 1 681 0 +chr1 634375 634922 MTCO3P12-201 0 + 634375 634375 0 1 547 0 +chr1 674841 675265 WBP1LP6-201 0 + 674841 674841 0 1 424 0 +chr1 685678 686673 OR4F16-201 0 - 685718 686654 0 1 995 0 +chr1 701935 720150 AL669831.3-212 0 - 701935 701935 0 3 405,105,119 0,1749,18096 +chr1 711866 732212 AL669831.3-213 0 - 711866 711866 0 3 56,169,196 0,8165,20150 +chr1 720023 720206 AL669831.3-214 0 - 720023 720023 0 1 183 0 +chr1 720052 724564 AL669831.3-215 0 - 720052 720052 0 2 148,207 0,4305 +chr1 722091 724903 CICP3-201 0 + 722091 722091 0 
2 2269,186 0,2626 +chr1 725884 778626 AL669831.1-201 0 - 725884 725884 0 16 3920,58,191,171,197,98,124,65,157,525,59,66,216,132,110,343 0,7422,9538,17295,18310,18843,20810,30192,33082,38838,40444,42663,43612,47091,48286,52399 +chr1 758232 758336 RNU6-1199P-201 0 - 758232 758232 0 1 104 0 +chr1 760910 761989 AL669831.2-201 0 + 760910 760910 0 2 244,212 0,867 +chr1 764722 774280 AL669831.1-202 0 - 764722 764722 0 5 78,104,59,66,110 0,421,1606,3825,9448
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/testMergeExons.bed Fri Nov 04 15:37:12 2022 +0000 @@ -0,0 +1,46 @@ +1 11868 14409 DDX11L1 0 + 11868 11868 0 4 359,109,78,1189 0,744,1106,1352 +1 14403 29570 WASH7P 0 - 14403 14403 0 11 98,34,152,159,198,136,137,147,99,154,37 0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130 +1 17368 17436 MIR6859-1 0 - 17368 17368 0 1 68 0 +1 29553 31109 MIR1302-2HG 0 + 29553 29553 0 3 486,401,134 0,713,1422 +1 30365 30503 MIR1302-2 0 + 30365 30365 0 1 138 0 +1 34553 36081 FAM138A 0 - 34553 34553 0 3 621,237,361 0,691,1167 +1 52472 53312 OR4G4P 0 + 52472 52472 0 1 840 0 +1 57597 64116 OR4G11P 0 + 57597 57597 0 3 56,157,1201 0,1102,5318 +1 65418 71585 OR4F5 0 + 65564 70005 0 3 15,54,2549 0,101,3618 +1 89294 133723 AL627309.1 0 - 89294 89294 0 7 2335,150,405,105,212,169,350 0,2796,21658,23405,31426,39760,44079 +1 89550 91105 AL627309.3 0 - 89550 89550 0 2 500,819 0,736 +1 131024 134836 CICP27 0 + 131024 131024 0 1 3812 0 +1 135140 135895 AL627309.6 0 - 135140 135140 0 1 755 0 +1 137681 137965 AL627309.7 0 - 137681 137681 0 1 284 0 +1 139789 140339 AL627309.2 0 - 139789 139789 0 2 58,265 0,285 +1 141473 173862 AL627309.5 0 - 141473 141473 0 10 1538,3322,65,529,59,66,158,216,132,110 0,4912,14293,22789,24410,26626,27136,27575,31083,32279 +1 157783 157887 RNU6-1100P 0 - 157783 157783 0 1 104 0 +1 160445 161525 AL627309.4 0 + 160445 160445 0 2 245,212 0,868 +1 182695 184174 FO538757.1 0 + 182695 182695 0 5 51,85,78,162,194 0,436,798,1044,1285 +1 185216 195411 WASH9P 0 - 185216 185216 0 10 134,69,153,159,202,136,137,146,112,149 0,274,1100,1912,2159,2538,2913,3222,3574,10046 +1 187890 187958 MIR6859-2 0 - 187890 187890 0 1 68 0 +1 257863 359681 AP006222.1 0 - 257863 257863 0 9 1162,85,5190,150,105,158,902,135,337 0,3686,5151,10803,31402,39481,98821,100185,101481 +1 347981 348366 RPL23AP24 0 - 347981 347981 0 1 385 0 +1 358856 366052 AL732372.1 0 + 358856 358856 0 3 101,112,882 0,1200,6314 +1 365388 522928 AL732372.2 0 - 365388 
365388 0 22 304,133,200,180,204,582,377,193,517,58,200,342,191,259,971,65,57,491,634,65,111,70 0,630,2862,7755,14380,110975,119443,123972,128993,129603,129888,130875,131720,132658,133010,136167,136798,137076,139081,148970,151767,157470 +1 439869 440232 WBP1LP7 0 + 439869 439869 0 1 363 0 +1 450702 451697 OR4F29 0 - 450742 451678 0 1 995 0 +1 487100 489906 CICP7 0 + 487100 487100 0 2 2287,190 0,2616 +1 491224 493241 AL732372.3 0 - 491224 491224 0 2 765,474 0,1543 +1 516375 516479 RF00026 0 - 516375 516375 0 1 104 0 +1 586070 827796 AL669831.3 0 - 586070 586070 0 22 288,135,337,566,1253,180,102,124,447,263,88,74,86,33,405,105,212,183,207,196,84,127 0,750,2046,8120,11228,15327,21884,26670,41306,41890,42848,45060,46973,112856,115865,117614,125640,133953,138287,145946,146910,241599 +1 587628 594768 AC114498.1 0 + 587628 587628 0 2 101,534 0,6606 +1 629061 629433 MTND1P23 0 + 629061 629061 0 1 372 0 +1 629639 630683 MTND2P28 0 + 629639 629639 0 1 1044 0 +1 631073 632616 MTCO1P12 0 + 631073 631073 0 1 1543 0 +1 632324 632413 MIR6723 0 - 632324 632324 0 1 89 0 +1 632756 633438 MTCO2P12 0 + 632756 632756 0 1 682 0 +1 633534 633741 MTATP8P1 0 + 633534 633534 0 1 207 0 +1 633695 634376 MTATP6P1 0 + 633695 633695 0 1 681 0 +1 634375 634922 MTCO3P12 0 + 634375 634375 0 1 547 0 +1 674841 675265 WBP1LP6 0 + 674841 674841 0 1 424 0 +1 685678 686673 OR4F16 0 - 685718 686654 0 1 995 0 +1 722091 724903 CICP3 0 + 722091 722091 0 2 2269,186 0,2626 +1 725884 778626 AL669831.1 0 - 725884 725884 0 16 3920,58,191,171,197,98,124,65,157,525,59,66,216,132,110,343 0,7422,9538,17295,18310,18843,20810,30192,33082,38838,40444,42663,43612,47091,48286,52399 +1 758232 758336 RNU6-1199P 0 - 758232 758232 0 1 104 0 +1 760910 761989 AL669831.2 0 + 760910 760910 0 2 244,212 0,867
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/testMergeNotUCSC.bed Fri Nov 04 15:37:12 2022 +0000 @@ -0,0 +1,46 @@ +1 11868 14409 DDX11L1 0 + 11868 11868 0 9 359,48,49,109,85,78,1189,154,218 0,141,310,744,744,1106,1352,1352,1584 +1 14403 29570 WASH7P 0 - 14403 14403 0 11 98,34,152,159,198,136,137,147,99,154,37 0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130 +1 17368 17436 MIR6859-1 0 - 17368 17368 0 1 68 0 +1 29553 31109 MIR1302-2HG 0 + 29553 29553 0 5 486,401,104,122,134 0,713,1010,1422,1422 +1 30365 30503 MIR1302-2 0 + 30365 30365 0 1 138 0 +1 34553 36081 FAM138A 0 - 34553 34553 0 5 621,237,205,361,353 0,691,723,1167,1167 +1 52472 53312 OR4G4P 0 + 52472 52472 0 1 840 0 +1 57597 64116 OR4G11P 0 + 57597 57597 0 4 56,157,1201,939 0,1102,5318,5351 +1 65418 71585 OR4F5 0 + 65564 70005 0 4 15,54,2549,1054 0,101,3618,3636 +1 89294 133723 AL627309.1 0 - 89294 89294 0 17 2335,150,11,405,105,105,105,212,145,158,59,163,119,169,143,350,193 0,2796,2935,21658,23405,23405,23405,31426,31430,31480,31579,39760,39760,39760,39786,44079,44079 +1 89550 91105 AL627309.3 0 - 89550 89550 0 2 500,819 0,736 +1 131024 134836 CICP27 0 + 131024 131024 0 1 3812 0 +1 135140 135895 AL627309.6 0 - 135140 135140 0 1 755 0 +1 137681 137965 AL627309.7 0 - 137681 137681 0 1 284 0 +1 139789 140339 AL627309.2 0 - 139789 139789 0 2 58,265 0,285 +1 141473 173862 AL627309.5 0 - 141473 141473 0 16 1538,204,3322,124,124,190,65,529,59,54,66,66,158,216,132,110 0,1334,4912,4912,4912,5168,14293,22789,24410,24415,26626,26626,27136,27575,31083,32279 +1 157783 157887 RNU6-1100P 0 - 157783 157783 0 1 104 0 +1 160445 161525 AL627309.4 0 + 160445 160445 0 2 245,212 0,868 +1 182695 184174 FO538757.1 0 + 182695 182695 0 5 51,85,78,162,194 0,436,798,1044,1285 +1 185216 195411 WASH9P 0 - 185216 185216 0 10 134,69,153,159,202,136,137,146,112,149 0,274,1100,1912,2159,2538,2913,3222,3574,10046 +1 187890 187958 MIR6859-2 0 - 187890 187890 0 1 68 0 +1 257863 359681 AP006222.1 0 - 257863 257863 0 18 
1162,1113,882,502,457,85,5190,130,902,902,150,150,150,105,158,902,135,337 0,49,280,660,704,3686,5151,6740,9439,9439,10803,10803,10803,31402,39481,98821,100185,101481 +1 347981 348366 RPL23AP24 0 - 347981 347981 0 1 385 0 +1 358856 366052 AL732372.1 0 + 358856 358856 0 6 73,86,112,534,340,882 0,15,1200,6314,6314,6314 +1 365388 522928 AL732372.2 0 - 365388 365388 0 82 304,298,78,133,200,180,142,204,102,582,377,183,169,169,143,193,205,435,424,288,128,58,58,58,58,58,200,92,342,151,191,191,166,24,90,60,55,25,259,233,233,233,25,58,58,58,58,58,58,578,58,492,293,319,492,386,65,65,65,65,52,33,17,57,44,134,409,134,197,94,464,239,124,124,65,65,55,65,70,68,82,70 0,6,226,630,2862,7755,7793,14380,14380,110975,119443,119637,119651,119651,119677,123972,128993,129075,129086,129222,129382,129603,129603,129603,129603,129603,129888,129888,130875,131720,131720,131720,131745,131816,131821,131851,131856,131886,132658,132684,132684,132684,132892,133010,133010,133010,133010,133010,133010,133010,133010,133295,133295,133295,133295,133595,136167,136167,136167,136167,136167,136199,136215,136798,137076,137076,137076,137076,137076,137473,139081,139476,139591,139591,148970,148970,148970,148970,151767,151796,151796,157470 +1 439869 440232 WBP1LP7 0 + 439869 439869 0 1 363 0 +1 450702 451697 OR4F29 0 - 450742 451678 0 1 995 0 +1 487100 489906 CICP7 0 + 487100 487100 0 2 2287,190 0,2616 +1 491224 493241 AL732372.3 0 - 491224 491224 0 2 765,474 0,1543 +1 516375 516479 RF00026 0 - 516375 516375 0 1 104 0 +1 586070 827796 AL669831.3 0 - 586070 586070 0 59 288,81,135,135,11,337,566,559,449,449,449,299,128,1253,180,180,180,180,180,180,142,102,102,102,102,102,102,73,124,447,263,88,88,88,74,20,86,33,33,405,105,105,212,212,158,56,183,163,169,169,119,169,148,207,191,196,84,100,127 
0,207,750,750,874,2046,8120,8127,8237,8237,8237,8387,8558,11228,15327,15327,15327,15327,15327,15327,15365,21884,21884,21884,21884,21884,21884,26670,26670,41306,41890,42848,42848,42848,45060,45060,46973,112856,112856,115865,117614,117614,125640,125640,125694,125796,133953,133961,133961,133961,133961,133961,133982,138287,145946,145946,146910,241599,241599 +1 587628 594768 AC114498.1 0 + 587628 587628 0 4 73,62,534,340 0,39,6606,6606 +1 629061 629433 MTND1P23 0 + 629061 629061 0 1 372 0 +1 629639 630683 MTND2P28 0 + 629639 629639 0 1 1044 0 +1 631073 632616 MTCO1P12 0 + 631073 631073 0 1 1543 0 +1 632324 632413 MIR6723 0 - 632324 632324 0 1 89 0 +1 632756 633438 MTCO2P12 0 + 632756 632756 0 1 682 0 +1 633534 633741 MTATP8P1 0 + 633534 633534 0 1 207 0 +1 633695 634376 MTATP6P1 0 + 633695 633695 0 1 681 0 +1 634375 634922 MTCO3P12 0 + 634375 634375 0 1 547 0 +1 674841 675265 WBP1LP6 0 + 674841 674841 0 1 424 0 +1 685678 686673 OR4F16 0 - 685718 686654 0 1 995 0 +1 722091 724903 CICP3 0 + 722091 722091 0 2 2269,186 0,2626 +1 725884 778626 AL669831.1 0 - 725884 725884 0 21 3920,58,191,171,197,98,124,65,157,525,78,104,59,59,66,66,216,132,110,110,343 0,7422,9538,17295,18310,18843,20810,30192,33082,38838,38838,39259,40444,40444,42663,42663,43612,47091,48286,48286,52399 +1 758232 758336 RNU6-1199P 0 - 758232 758232 0 1 104 0 +1 760910 761989 AL669831.2 0 + 760910 760910 0 2 244,212 0,867
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/testWithGenes.bed Fri Nov 04 15:37:12 2022 +0000 @@ -0,0 +1,105 @@ +chr1 11868 14409 DDX11L1 0 + 11868 11868 0 3 359,109,1189 0,744,1352 +chr1 12009 13670 DDX11L1 0 + 12009 12009 0 6 48,49,85,78,154,218 0,169,603,965,1211,1443 +chr1 14403 29570 WASH7P 0 - 14403 14403 0 11 98,34,152,159,198,136,137,147,99,154,37 0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130 +chr1 17368 17436 MIR6859-1 0 - 17368 17368 0 1 68 0 +chr1 29553 31097 MIR1302-2HG 0 + 29553 29553 0 3 486,104,122 0,1010,1422 +chr1 30266 31109 MIR1302-2HG 0 + 30266 30266 0 2 401,134 0,709 +chr1 30365 30503 MIR1302-2 0 + 30365 30365 0 1 138 0 +chr1 34553 36081 FAM138A 0 - 34553 34553 0 3 621,205,361 0,723,1167 +chr1 35244 36073 FAM138A 0 - 35244 35244 0 2 237,353 0,476 +chr1 52472 53312 OR4G4P 0 + 52472 52472 0 1 840 0 +chr1 57597 64116 OR4G11P 0 + 57597 57597 0 3 56,157,1201 0,1102,5318 +chr1 62948 63887 OR4G11P 0 + 62948 62948 0 1 939 0 +chr1 65418 71585 OR4F5 0 + 65564 70005 0 3 15,54,2549 0,101,3618 +chr1 69054 70108 OR4F5 0 + 69090 70005 0 1 1054 0 +chr1 89294 120932 AL627309.1 0 - 89294 89294 0 4 2335,150,105,158 0,2796,23405,31480 +chr1 89550 91105 AL627309.3 0 - 89550 89550 0 2 500,819 0,736 +chr1 92229 129217 AL627309.1 0 - 92229 92229 0 4 11,105,212,163 0,20470,28491,36825 +chr1 110952 129173 AL627309.1 0 - 110952 110952 0 3 405,105,119 0,1747,18102 +chr1 120724 133723 AL627309.1 0 - 120724 120724 0 4 145,59,169,350 0,149,8330,12649 +chr1 129080 133566 AL627309.1 0 - 129080 129080 0 2 143,193 0,4293 +chr1 131024 134836 CICP27 0 + 131024 131024 0 1 3812 0 +chr1 135140 135895 AL627309.6 0 - 135140 135140 0 1 755 0 +chr1 137681 137965 AL627309.7 0 - 137681 137681 0 1 284 0 +chr1 139789 140339 AL627309.2 0 - 139789 139789 0 2 58,265 0,285 +chr1 141473 149707 AL627309.5 0 - 141473 141473 0 2 1538,3322 0,4912 +chr1 142807 146831 AL627309.5 0 - 142807 142807 0 3 204,124,190 0,3578,3834 +chr1 146385 173862 AL627309.5 0 - 146385 146385 0 8 
124,65,529,59,66,216,132,110 0,9381,17877,19498,21714,22663,26171,27367 +chr1 157783 157887 RNU6-1100P 0 - 157783 157783 0 1 104 0 +chr1 160445 161525 AL627309.4 0 + 160445 160445 0 2 245,212 0,868 +chr1 165888 168767 AL627309.5 0 - 165888 165888 0 3 54,66,158 0,2211,2721 +chr1 182695 184174 FO538757.1 0 + 182695 182695 0 5 51,85,78,162,194 0,436,798,1044,1285 +chr1 185216 195411 WASH9P 0 - 185216 185216 0 10 134,69,153,159,202,136,137,146,112,149 0,274,1100,1912,2159,2538,2913,3222,3574,10046 +chr1 187890 187958 MIR6859-2 0 - 187890 187890 0 1 68 0 +chr1 257863 264733 AP006222.1 0 - 257863 257863 0 2 1162,130 0,6740 +chr1 257912 268816 AP006222.1 0 - 257912 257912 0 4 1113,85,902,150 0,3637,9390,10754 +chr1 258143 359681 AP006222.1 0 - 258143 258143 0 4 882,902,135,337 0,98541,99905,101201 +chr1 258523 268816 AP006222.1 0 - 258523 258523 0 3 502,902,150 0,8779,10143 +chr1 258567 259024 AP006222.1 0 - 258567 258567 0 1 457 0 +chr1 263014 297502 AP006222.1 0 - 263014 263014 0 4 5190,150,105,158 0,5652,26251,34330 +chr1 347981 348366 RPL23AP24 0 - 347981 347981 0 1 385 0 +chr1 358856 365704 AL732372.1 0 + 358856 358856 0 2 73,534 0,6314 +chr1 358871 365510 AL732372.1 0 + 358871 358871 0 2 86,340 0,6299 +chr1 360056 366052 AL732372.1 0 + 360056 360056 0 2 112,882 0,5114 +chr1 365388 366151 AL732372.2 0 - 365388 365388 0 2 304,133 0,630 +chr1 365394 368450 AL732372.2 0 - 365394 365394 0 2 298,200 0,2856 +chr1 365614 379972 AL732372.2 0 - 365614 365614 0 3 78,180,204 0,7529,14154 +chr1 373181 485208 AL732372.2 0 - 373181 373181 0 3 142,102,169 0,6587,111858 +chr1 439869 440232 WBP1LP7 0 + 439869 439869 0 1 363 0 +chr1 450702 451697 OR4F29 0 - 450742 451678 0 1 995 0 +chr1 476363 497259 AL732372.2 0 - 476363 476363 0 3 582,169,151 0,8676,20745 +chr1 484831 495476 AL732372.2 0 - 484831 484831 0 3 377,58,200 0,10160,10445 +chr1 485025 485208 AL732372.2 0 - 485025 485025 0 1 183 0 +chr1 485065 489553 AL732372.2 0 - 485065 485065 0 2 143,193 0,4295 +chr1 487100 489906 CICP7 
0 + 487100 487100 0 2 2287,190 0,2616 +chr1 491224 493241 AL732372.3 0 - 491224 491224 0 2 765,474 0,1543 +chr1 494381 496605 AL732372.2 0 - 494381 494381 0 2 205,342 0,1882 +chr1 494463 502508 AL732372.2 0 - 494463 494463 0 5 435,58,191,65,44 0,528,2645,7092,8001 +chr1 494474 495368 AL732372.2 0 - 494474 494474 0 3 424,58,92 0,517,802 +chr1 494610 499175 AL732372.2 0 - 494610 494610 0 3 288,58,492 0,381,4073 +chr1 494770 498976 AL732372.2 0 - 494770 494770 0 5 128,58,191,58,293 0,221,2338,3628,3913 +chr1 497133 498456 AL732372.2 0 - 497133 497133 0 3 166,233,58 0,939,1265 +chr1 497204 502598 AL732372.2 0 - 497204 497204 0 6 24,233,58,65,57,134 0,868,1194,4351,4982,5260 +chr1 497209 502873 AL732372.2 0 - 497209 497209 0 4 90,58,65,409 0,1189,4346,5255 +chr1 497239 499002 AL732372.2 0 - 497239 497239 0 4 60,259,58,319 0,807,1159,1444 +chr1 497244 502598 AL732372.2 0 - 497244 497244 0 5 55,233,58,65,134 0,828,1154,4311,5220 +chr1 497274 498976 AL732372.2 0 - 497274 497274 0 2 25,578 0,1124 +chr1 498280 499175 AL732372.2 0 - 498280 498280 0 3 25,58,492 0,118,403 +chr1 498983 501607 AL732372.2 0 - 498983 498983 0 2 386,52 0,2572 +chr1 501587 517252 AL732372.2 0 - 501587 501587 0 5 33,94,124,65,68 0,1274,3392,12771,15597 +chr1 501603 517225 AL732372.2 0 - 501603 501603 0 5 17,197,124,65,70 0,861,3376,12755,15552 +chr1 504469 514413 AL732372.2 0 - 504469 504469 0 2 464,55 0,9889 +chr1 504864 522928 AL732372.2 0 - 504864 504864 0 4 239,65,82,70 0,9494,12320,17994 +chr1 516375 516479 RF00026 0 - 516375 516375 0 1 104 0 +chr1 586070 612813 AL669831.3 0 - 586070 586070 0 6 288,135,128,180,102,73 0,750,8558,15327,21884,26670 +chr1 586277 588453 AL669831.3 0 - 586277 586277 0 3 81,135,337 0,543,1839 +chr1 586944 720194 AL669831.3 0 - 586944 586944 0 4 11,105,212,163 0,116740,124766,133087 +chr1 587628 594768 AC114498.1 0 + 587628 587628 0 2 73,534 0,6606 +chr1 587667 594574 AC114498.1 0 + 587667 587667 0 2 62,340 0,6567 +chr1 594190 633129 AL669831.3 0 - 594190 594190 0 5 
566,180,102,88,86 0,7207,13764,34728,38853 +chr1 594197 631204 AL669831.3 0 - 594197 594197 0 6 559,180,102,124,88,74 0,7200,13757,18543,34721,36933 +chr1 594307 598551 AL669831.3 0 - 594307 594307 0 2 449,1253 0,2991 +chr1 594307 827769 AL669831.3 0 - 594307 594307 0 4 449,180,212,100 0,7090,117403,233362 +chr1 594307 827796 AL669831.3 0 - 594307 594307 0 5 449,180,102,33,127 0,7090,13647,104619,233362 +chr1 594457 733064 AL669831.3 0 - 594457 594457 0 8 299,180,102,33,158,169,191,84 0,6940,13497,104469,117307,125574,137559,138523 +chr1 601435 720200 AL669831.3 0 - 601435 601435 0 3 142,102,169 0,6519,118596 +chr1 627376 631150 AL669831.3 0 - 627376 627376 0 4 447,263,88,20 0,584,1542,3754 +chr1 629061 629433 MTND1P23 0 + 629061 629061 0 1 372 0 +chr1 629639 630683 MTND2P28 0 + 629639 629639 0 1 1044 0 +chr1 631073 632616 MTCO1P12 0 + 631073 631073 0 1 1543 0 +chr1 632324 632413 MIR6723 0 - 632324 632324 0 1 89 0 +chr1 632756 633438 MTCO2P12 0 + 632756 632756 0 1 682 0 +chr1 633534 633741 MTATP8P1 0 + 633534 633534 0 1 207 0 +chr1 633695 634376 MTATP6P1 0 + 633695 633695 0 1 681 0 +chr1 634375 634922 MTCO3P12 0 + 634375 634375 0 1 547 0 +chr1 674841 675265 WBP1LP6 0 + 674841 674841 0 1 424 0 +chr1 685678 686673 OR4F16 0 - 685718 686654 0 1 995 0 +chr1 701935 720150 AL669831.3 0 - 701935 701935 0 3 405,105,119 0,1749,18096 +chr1 711866 732212 AL669831.3 0 - 711866 711866 0 3 56,169,196 0,8165,20150 +chr1 720023 720206 AL669831.3 0 - 720023 720023 0 1 183 0 +chr1 720052 724564 AL669831.3 0 - 720052 720052 0 2 148,207 0,4305 +chr1 722091 724903 CICP3 0 + 722091 722091 0 2 2269,186 0,2626 +chr1 725884 778626 AL669831.1 0 - 725884 725884 0 16 3920,58,191,171,197,98,124,65,157,525,59,66,216,132,110,343 0,7422,9538,17295,18310,18843,20810,30192,33082,38838,40444,42663,43612,47091,48286,52399 +chr1 758232 758336 RNU6-1199P 0 - 758232 758232 0 1 104 0 +chr1 760910 761989 AL669831.2 0 + 760910 760910 0 2 244,212 0,867 +chr1 764722 774280 AL669831.1 0 - 764722 764722 0 5 
78,104,59,66,110 0,421,1606,3825,9448