changeset 1:6fd4b3b90220 draft default tip

planemo upload for repository https://github.com/lldelisle/tools-lldelisle/tree/master/tools/fromgtfTobed12 commit 15b8c2cc83708044413a152322bcbfca8a74d29a
author lldelisle
date Fri, 03 Nov 2023 14:13:51 +0000
parents 418e4d0fe0bd
children
files fromgtfTobed12.py fromgtfTobed12.xml test-data/testWithGeneIds.bed
diffstat 3 files changed, 126 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/fromgtfTobed12.py	Fri Nov 04 15:37:12 2022 +0000
+++ b/fromgtfTobed12.py	Fri Nov 03 14:13:51 2023 +0000
@@ -23,13 +23,16 @@
                         "database creation")
 
 
-def convert_gtf_to_bed(fn, fo, useGene, mergeTranscripts,
+def convert_gtf_to_bed(fn, fo, preferedName, mergeTranscripts,
                        mergeTranscriptsAndOverlappingExons, ucsc):
     db = gffutils.create_db(fn, ':memory:')
     # For each transcript:
-    prefered_name = "transcript_name"
-    if useGene or mergeTranscripts or mergeTranscriptsAndOverlappingExons:
+    if preferedName is not None:
+        prefered_name = preferedName
+    elif mergeTranscripts or mergeTranscriptsAndOverlappingExons:
         prefered_name = "gene_name"
+    else:
+        prefered_name = "transcript_name"
     if mergeTranscripts or mergeTranscriptsAndOverlappingExons:
         all_items = db.features_of_type("gene", order_by='start')
     else:
@@ -127,12 +130,11 @@
 argp.add_argument('--output', default=sys.stdout,
                   type=argparse.FileType('w'),
                   help="Output bed12 file.")
-argp.add_argument('--useGene', action="store_true",
-                  help="Use the gene name instead of the "
-                       "transcript name.")
 argp.add_argument('--ucscformat', action="store_true",
                   help="If you want that all chromosome names "
                        "begin with 'chr'.")
+argp.add_argument('--preferedName', default=None,
+                  help="Name to use for bed output.")
 group = argp.add_mutually_exclusive_group()
 group.add_argument('--mergeTranscripts', action="store_true",
                    help="Merge all transcripts into a single "
@@ -144,7 +146,7 @@
                         " overlapping exons.")
 
 args = argp.parse_args()
-convert_gtf_to_bed(args.input, args.output, args.useGene,
+convert_gtf_to_bed(args.input, args.output, args.preferedName,
                    args.mergeTranscripts,
                    args.mergeTranscriptsAndOverlappingExons,
                    args.ucscformat)
--- a/fromgtfTobed12.xml	Fri Nov 04 15:37:12 2022 +0000
+++ b/fromgtfTobed12.xml	Fri Nov 03 14:13:51 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="fromgtfTobed12" name="fromgtftobed12" version="0.11.1+galaxy0">
+<tool id="fromgtfTobed12" name="fromgtftobed12" version="0.11.1+galaxy1">
   <description> Convert a gtf to a bed12.</description>
   <requirements>
     <requirement type="package" version="0.11.1">gffutils</requirement>
@@ -14,7 +14,9 @@
   <command>
 <![CDATA[
         python3 $__tool_directory__/fromgtfTobed12.py
-        $useGene
+        #if str($preferedName) != "":
+          --preferedName $preferedName
+        #end if
         $mergeTranscripts
         $ucscformat
         --output $output
@@ -23,12 +25,12 @@
   </command>
   <inputs>
     <param name="input" multiple="false" type="data" format="gtf" label="Select the gtf to convert."/>
-    <param argument="--useGene" type="boolean" checked="False" truevalue="--useGene" falsevalue="" label="Uses the gene name instead of the transcript name."/>
     <param name="mergeTranscripts" type="select" label="Do you want to merge all transcripts of a gene in a single line?">
       <option value="" selected="true">No</option>
       <option value="--mergeTranscripts">Yes</option>
       <option value="--mergeTranscriptsAndOverlappingExons">Yes and merge overlapping exons</option>
     </param>
+    <param argument="--preferedName" type="text" value="" label="Use a specific name for the 4th column" help="By default the 4th column will be transcript_name or gene_name if you merge transcripts. If you prefer 'gene_id', for example, then set this option." />
     <param argument="--ucscformat" type="boolean" checked="True" truevalue="--ucscformat" falsevalue="" label="If you want that all chromosome names begin with 'chr'."/>
   </inputs>
   
@@ -45,20 +47,24 @@
     <test>
       <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
       <param name="ucscformat" value="--ucscformat"/>
-      <param name="useGene" value="--useGene"/>
+      <param name="preferedName" value="gene_name"/>
       <output name="output" file="testWithGenes.bed"/>
     </test>
     <test>
       <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
+      <param name="ucscformat" value="--ucscformat"/>
+      <param name="preferedName" value="gene_id"/>
+      <output name="output" file="testWithGeneIds.bed"/>
+    </test>
+    <test>
+      <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
       <param name="mergeTranscripts" value="--mergeTranscripts"/>
-      <param name="useGene" value="--useGene"/>
       <param name="ucscformat" value=""/>
       <output name="output" file="testMergeNotUCSC.bed"/>
     </test>
     <test>
       <param name="input" value="Homo_sapiens.GRCh38.95_491firstLines.gtf.gz"/>
       <param name="mergeTranscripts" value="--mergeTranscriptsAndOverlappingExons"/>
-      <param name="useGene" value="--useGene"/>
       <param name="ucscformat" value=""/>
       <output name="output" file="testMergeExons.bed"/>
     </test>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/testWithGeneIds.bed	Fri Nov 03 14:13:51 2023 +0000
@@ -0,0 +1,105 @@
+chr1	11868	14409	ENSG00000223972	0	+	11868	11868	0	3	359,109,1189	0,744,1352
+chr1	12009	13670	ENSG00000223972	0	+	12009	12009	0	6	48,49,85,78,154,218	0,169,603,965,1211,1443
+chr1	14403	29570	ENSG00000227232	0	-	14403	14403	0	11	98,34,152,159,198,136,137,147,99,154,37	0,601,1392,2203,2454,2829,3202,3511,3864,10334,15130
+chr1	17368	17436	ENSG00000278267	0	-	17368	17368	0	1	68	0
+chr1	29553	31097	ENSG00000243485	0	+	29553	29553	0	3	486,104,122	0,1010,1422
+chr1	30266	31109	ENSG00000243485	0	+	30266	30266	0	2	401,134	0,709
+chr1	30365	30503	ENSG00000284332	0	+	30365	30365	0	1	138	0
+chr1	34553	36081	ENSG00000237613	0	-	34553	34553	0	3	621,205,361	0,723,1167
+chr1	35244	36073	ENSG00000237613	0	-	35244	35244	0	2	237,353	0,476
+chr1	52472	53312	ENSG00000268020	0	+	52472	52472	0	1	840	0
+chr1	57597	64116	ENSG00000240361	0	+	57597	57597	0	3	56,157,1201	0,1102,5318
+chr1	62948	63887	ENSG00000240361	0	+	62948	62948	0	1	939	0
+chr1	65418	71585	ENSG00000186092	0	+	65564	70005	0	3	15,54,2549	0,101,3618
+chr1	69054	70108	ENSG00000186092	0	+	69090	70005	0	1	1054	0
+chr1	89294	120932	ENSG00000238009	0	-	89294	89294	0	4	2335,150,105,158	0,2796,23405,31480
+chr1	89550	91105	ENSG00000239945	0	-	89550	89550	0	2	500,819	0,736
+chr1	92229	129217	ENSG00000238009	0	-	92229	92229	0	4	11,105,212,163	0,20470,28491,36825
+chr1	110952	129173	ENSG00000238009	0	-	110952	110952	0	3	405,105,119	0,1747,18102
+chr1	120724	133723	ENSG00000238009	0	-	120724	120724	0	4	145,59,169,350	0,149,8330,12649
+chr1	129080	133566	ENSG00000238009	0	-	129080	129080	0	2	143,193	0,4293
+chr1	131024	134836	ENSG00000233750	0	+	131024	131024	0	1	3812	0
+chr1	135140	135895	ENSG00000268903	0	-	135140	135140	0	1	755	0
+chr1	137681	137965	ENSG00000269981	0	-	137681	137681	0	1	284	0
+chr1	139789	140339	ENSG00000239906	0	-	139789	139789	0	2	58,265	0,285
+chr1	141473	149707	ENSG00000241860	0	-	141473	141473	0	2	1538,3322	0,4912
+chr1	142807	146831	ENSG00000241860	0	-	142807	142807	0	3	204,124,190	0,3578,3834
+chr1	146385	173862	ENSG00000241860	0	-	146385	146385	0	8	124,65,529,59,66,216,132,110	0,9381,17877,19498,21714,22663,26171,27367
+chr1	157783	157887	ENSG00000222623	0	-	157783	157783	0	1	104	0
+chr1	160445	161525	ENSG00000241599	0	+	160445	160445	0	2	245,212	0,868
+chr1	165888	168767	ENSG00000241860	0	-	165888	165888	0	3	54,66,158	0,2211,2721
+chr1	182695	184174	ENSG00000279928	0	+	182695	182695	0	5	51,85,78,162,194	0,436,798,1044,1285
+chr1	185216	195411	ENSG00000279457	0	-	185216	185216	0	10	134,69,153,159,202,136,137,146,112,149	0,274,1100,1912,2159,2538,2913,3222,3574,10046
+chr1	187890	187958	ENSG00000273874	0	-	187890	187890	0	1	68	0
+chr1	257863	264733	ENSG00000228463	0	-	257863	257863	0	2	1162,130	0,6740
+chr1	257912	268816	ENSG00000228463	0	-	257912	257912	0	4	1113,85,902,150	0,3637,9390,10754
+chr1	258143	359681	ENSG00000228463	0	-	258143	258143	0	4	882,902,135,337	0,98541,99905,101201
+chr1	258523	268816	ENSG00000228463	0	-	258523	258523	0	3	502,902,150	0,8779,10143
+chr1	258567	259024	ENSG00000228463	0	-	258567	258567	0	1	457	0
+chr1	263014	297502	ENSG00000228463	0	-	263014	263014	0	4	5190,150,105,158	0,5652,26251,34330
+chr1	347981	348366	ENSG00000236679	0	-	347981	347981	0	1	385	0
+chr1	358856	365704	ENSG00000236601	0	+	358856	358856	0	2	73,534	0,6314
+chr1	358871	365510	ENSG00000236601	0	+	358871	358871	0	2	86,340	0,6299
+chr1	360056	366052	ENSG00000236601	0	+	360056	360056	0	2	112,882	0,5114
+chr1	365388	366151	ENSG00000237094	0	-	365388	365388	0	2	304,133	0,630
+chr1	365394	368450	ENSG00000237094	0	-	365394	365394	0	2	298,200	0,2856
+chr1	365614	379972	ENSG00000237094	0	-	365614	365614	0	3	78,180,204	0,7529,14154
+chr1	373181	485208	ENSG00000237094	0	-	373181	373181	0	3	142,102,169	0,6587,111858
+chr1	439869	440232	ENSG00000269732	0	+	439869	439869	0	1	363	0
+chr1	450702	451697	ENSG00000284733	0	-	450742	451678	0	1	995	0
+chr1	476363	497259	ENSG00000237094	0	-	476363	476363	0	3	582,169,151	0,8676,20745
+chr1	484831	495476	ENSG00000237094	0	-	484831	484831	0	3	377,58,200	0,10160,10445
+chr1	485025	485208	ENSG00000237094	0	-	485025	485025	0	1	183	0
+chr1	485065	489553	ENSG00000237094	0	-	485065	485065	0	2	143,193	0,4295
+chr1	487100	489906	ENSG00000233653	0	+	487100	487100	0	2	2287,190	0,2616
+chr1	491224	493241	ENSG00000250575	0	-	491224	491224	0	2	765,474	0,1543
+chr1	494381	496605	ENSG00000237094	0	-	494381	494381	0	2	205,342	0,1882
+chr1	494463	502508	ENSG00000237094	0	-	494463	494463	0	5	435,58,191,65,44	0,528,2645,7092,8001
+chr1	494474	495368	ENSG00000237094	0	-	494474	494474	0	3	424,58,92	0,517,802
+chr1	494610	499175	ENSG00000237094	0	-	494610	494610	0	3	288,58,492	0,381,4073
+chr1	494770	498976	ENSG00000237094	0	-	494770	494770	0	5	128,58,191,58,293	0,221,2338,3628,3913
+chr1	497133	498456	ENSG00000237094	0	-	497133	497133	0	3	166,233,58	0,939,1265
+chr1	497204	502598	ENSG00000237094	0	-	497204	497204	0	6	24,233,58,65,57,134	0,868,1194,4351,4982,5260
+chr1	497209	502873	ENSG00000237094	0	-	497209	497209	0	4	90,58,65,409	0,1189,4346,5255
+chr1	497239	499002	ENSG00000237094	0	-	497239	497239	0	4	60,259,58,319	0,807,1159,1444
+chr1	497244	502598	ENSG00000237094	0	-	497244	497244	0	5	55,233,58,65,134	0,828,1154,4311,5220
+chr1	497274	498976	ENSG00000237094	0	-	497274	497274	0	2	25,578	0,1124
+chr1	498280	499175	ENSG00000237094	0	-	498280	498280	0	3	25,58,492	0,118,403
+chr1	498983	501607	ENSG00000237094	0	-	498983	498983	0	2	386,52	0,2572
+chr1	501587	517252	ENSG00000237094	0	-	501587	501587	0	5	33,94,124,65,68	0,1274,3392,12771,15597
+chr1	501603	517225	ENSG00000237094	0	-	501603	501603	0	5	17,197,124,65,70	0,861,3376,12755,15552
+chr1	504469	514413	ENSG00000237094	0	-	504469	504469	0	2	464,55	0,9889
+chr1	504864	522928	ENSG00000237094	0	-	504864	504864	0	4	239,65,82,70	0,9494,12320,17994
+chr1	516375	516479	ENSG00000278757	0	-	516375	516375	0	1	104	0
+chr1	586070	612813	ENSG00000230021	0	-	586070	586070	0	6	288,135,128,180,102,73	0,750,8558,15327,21884,26670
+chr1	586277	588453	ENSG00000230021	0	-	586277	586277	0	3	81,135,337	0,543,1839
+chr1	586944	720194	ENSG00000230021	0	-	586944	586944	0	4	11,105,212,163	0,116740,124766,133087
+chr1	587628	594768	ENSG00000235146	0	+	587628	587628	0	2	73,534	0,6606
+chr1	587667	594574	ENSG00000235146	0	+	587667	587667	0	2	62,340	0,6567
+chr1	594190	633129	ENSG00000230021	0	-	594190	594190	0	5	566,180,102,88,86	0,7207,13764,34728,38853
+chr1	594197	631204	ENSG00000230021	0	-	594197	594197	0	6	559,180,102,124,88,74	0,7200,13757,18543,34721,36933
+chr1	594307	598551	ENSG00000230021	0	-	594307	594307	0	2	449,1253	0,2991
+chr1	594307	827769	ENSG00000230021	0	-	594307	594307	0	4	449,180,212,100	0,7090,117403,233362
+chr1	594307	827796	ENSG00000230021	0	-	594307	594307	0	5	449,180,102,33,127	0,7090,13647,104619,233362
+chr1	594457	733064	ENSG00000230021	0	-	594457	594457	0	8	299,180,102,33,158,169,191,84	0,6940,13497,104469,117307,125574,137559,138523
+chr1	601435	720200	ENSG00000230021	0	-	601435	601435	0	3	142,102,169	0,6519,118596
+chr1	627376	631150	ENSG00000230021	0	-	627376	627376	0	4	447,263,88,20	0,584,1542,3754
+chr1	629061	629433	ENSG00000225972	0	+	629061	629061	0	1	372	0
+chr1	629639	630683	ENSG00000225630	0	+	629639	629639	0	1	1044	0
+chr1	631073	632616	ENSG00000237973	0	+	631073	631073	0	1	1543	0
+chr1	632324	632413	ENSG00000278791	0	-	632324	632324	0	1	89	0
+chr1	632756	633438	ENSG00000229344	0	+	632756	632756	0	1	682	0
+chr1	633534	633741	ENSG00000240409	0	+	633534	633534	0	1	207	0
+chr1	633695	634376	ENSG00000248527	0	+	633695	633695	0	1	681	0
+chr1	634375	634922	ENSG00000198744	0	+	634375	634375	0	1	547	0
+chr1	674841	675265	ENSG00000268663	0	+	674841	674841	0	1	424	0
+chr1	685678	686673	ENSG00000284662	0	-	685718	686654	0	1	995	0
+chr1	701935	720150	ENSG00000230021	0	-	701935	701935	0	3	405,105,119	0,1749,18096
+chr1	711866	732212	ENSG00000230021	0	-	711866	711866	0	3	56,169,196	0,8165,20150
+chr1	720023	720206	ENSG00000230021	0	-	720023	720023	0	1	183	0
+chr1	720052	724564	ENSG00000230021	0	-	720052	720052	0	2	148,207	0,4305
+chr1	722091	724903	ENSG00000229376	0	+	722091	722091	0	2	2269,186	0,2626
+chr1	725884	778626	ENSG00000228327	0	-	725884	725884	0	16	3920,58,191,171,197,98,124,65,157,525,59,66,216,132,110,343	0,7422,9538,17295,18310,18843,20810,30192,33082,38838,40444,42663,43612,47091,48286,52399
+chr1	758232	758336	ENSG00000223181	0	-	758232	758232	0	1	104	0
+chr1	760910	761989	ENSG00000229905	0	+	760910	760910	0	2	244,212	0,867
+chr1	764722	774280	ENSG00000228327	0	-	764722	764722	0	5	78,104,59,66,110	0,421,1606,3825,9448