changeset 0:e60f92a8e1c8 draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:48:29 -0400
parents
children
files map_chromosomes.pl map_chromosomes.xml test-data/map_chrom_ensembl.txt test-data/map_chrom_ucsc.txt tool-data/map_chromosomes.loc.sample tool_data_table_conf.xml.sample
diffstat 6 files changed, 377 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/map_chromosomes.pl	Wed Jun 01 03:48:29 2016 -0400
@@ -0,0 +1,77 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+use Scalar::Util qw(looks_like_number);
+
+my @in = ();	# fields of the input line currently being converted
+my $column_delimiters_href_split = {	# delimiter name => regex fragment handed to split()
+	'TAB' => q{\t},
+	'COMMA' => ",",
+	'DASH' => "-",
+	'UNDERSCORE' => "_",
+	'PIPE' => q{\|},
+	'DOT' => q{\.},
+	'SPACE' => " "
+};
+
+my $column_delimiters_href_join = {	# delimiter name => literal string handed to join()
+	'TAB' => qq{\t},
+	'COMMA' => ",",
+	'DASH' => "-",
+	'UNDERSCORE' => "_",
+	'PIPE' => "|",
+	'DOT' => ".",
+	'SPACE' => " "
+};
+
+# A wrapper for converting between UCSC and ensembl chromosome representations from within galaxy.
+# Usage: map_chromosomes.pl [input] [col] [delimiter] [map] [out_file1]
+#   [col] is the 1-based column holding the chromosome name, [delimiter] one of the names listed above.
+
+die "Check arguments: $0 [input] [col] [delimiter] [map] [out_file1]\n" unless @ARGV == 5;
+
+# process input
+my ($input, $col_arg, $delim, $map_file, $output) = @ARGV;
+$col_arg =~ s/^\s+|\s+$//g;
+# Column 0 or a negative column would become a negative array index and silently address fields from the end of the line, so only whole numbers >= 1 pass.
+die "No columns specified: $ARGV[1]\n" unless looks_like_number($col_arg) && $col_arg == int($col_arg) && $col_arg >= 1;
+die "Delimeter must be one of TAB, COMMA, DASH, UNDERSCORE, PIPE, DOT, SPACE\n" unless exists $column_delimiters_href_split->{$delim};
+
+my $col = $col_arg - 1;	# 0-based array index
+my $delim_split = $column_delimiters_href_split->{$delim};
+my $delim_join = $column_delimiters_href_join->{$delim};
+
+open (my $map_fh, '<', $map_file) or die "Cannot open map file $map_file:$!\n";
+my %chr_map;	# source chromosome name => target name, or "remove" when the map line has no second column
+while(my $line = <$map_fh>) {
+	$line =~ s/\r?\n\z//;	# strip only the line ending; chop would eat a data character when the last line has no newline
+	next if $line =~ /^#/;	# comment line in the mapping file
+	my @map = split /\t/, $line;
+	next unless @map;	# blank line: nothing to map
+	$chr_map{$map[0]} = @map > 1 ? $map[1] : "remove";
+}
+close $map_fh;
+
+open (my $in_fh,  '<', $input) or die "Cannot open $input:$!\n";
+open (my $out_fh, '>', $output) or die "Cannot create $output:$!\n";
+while (my $line = <$in_fh>) {	# rewrite column $col through %chr_map; unmappable lines are reported on STDOUT and dropped
+	chomp $line;	# chop would eat a data character when the last line has no newline
+	@in = split /$delim_split/, $line, -1;	# limit -1 keeps trailing empty fields so the re-joined line keeps all its columns
+	if(defined $in[$col] && defined $chr_map{$in[$col]}) {
+		$in[$col] = $chr_map{$in[$col]};
+		if($in[$col] eq "remove") {
+			print "Removed line \"$line\" as chromosome does not have a proper mapping\n";
+		} else {
+			print {$out_fh} join($delim_join, @in), "\n";
+		}
+	} elsif($line =~ /^#/) {
+		print {$out_fh} $line, "\n";	# comment/header lines are copied through unchanged
+	} else {
+		print "Removed line \"$line\" as \"", (defined $in[$col] ? $in[$col] : ""), "\" is not a valid chromosome name\n";	# short or empty lines have no such column
+	}
+}
+close $in_fh;
+close $out_fh or die "Cannot write $output:$!\n";	# buffered write errors only surface at close
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/map_chromosomes.xml	Wed Jun 01 03:48:29 2016 -0400
@@ -0,0 +1,84 @@
+<tool id="map_chromosomes" name="Convert between UCSC and ensembl chromosome names" version="0.0.1">
+    <description>chromosome representation</description>
+    <command  interpreter="perl">map_chromosomes.pl '$input1' '$col' '$delimiter' '${map.fields.path}' '$out_file1'</command> <!-- quoted so a path containing spaces still reaches the script as one argument -->
+    <inputs>
+        <param format="tabular,txt" name="input1" type="data" label="this dataset"/>
+        <param name="col" type="integer" value="1" min="1" label="Column containing chromosome ids"/> <!-- 1-based; column 0 is not a valid choice -->
+        <param name="delimiter" type="select" label="Delimited by"> <!-- option values are the delimiter names map_chromosomes.pl accepts -->
+            <option value="TAB">Tab</option>
+            <option value="SPACE">Space</option> <!-- the script splits on a single space character -->
+            <option value="DOT">Dot</option>
+            <option value="COMMA">Comma</option>
+            <option value="DASH">Dash</option>
+            <option value="UNDERSCORE">Underscore</option>
+            <option value="PIPE">Pipe</option>
+        </param>
+        <param name="map" type="select" label="Select a mapping file" help="Ensure you select the correct mapping file">
+            <options from_data_table="map_chromosomes">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No mapping files are available"/>
+            </options>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="out_file1" format_source="input1" metadata_source="input1"/> <!-- output takes the datatype and metadata of the input dataset -->
+    </outputs>
+
+    <tests>
+        <test> <!-- UCSC names in column 1 are expected to come back as ensembl names -->
+            <param name="input1" value="map_chrom_ucsc.txt" dbkey="hg19" />
+            <param name="col" value="1"/>
+            <param name="delimiter" value="TAB"/>
+            <param name="map" value="hg19 UCSC to ensembl"/>
+            <output name="out_file1" file="map_chrom_ensembl.txt"/>
+        </test>
+        <test> <!-- the reverse direction: ensembl names in column 1 back to UCSC names -->
+            <param name="input1" value="map_chrom_ensembl.txt" dbkey="hg19" />
+            <param name="col" value="1"/>
+            <param name="delimiter" value="TAB"/>
+            <param name="map" value="hg19 ensembl to UCSC"/>
+            <output name="out_file1" file="map_chrom_ucsc.txt"/>
+        </test>
+    </tests>
+    <help>
+
+
+**What it does**
+
+Converts a column between the UCSC and ensembl representation of chromosomes.
+
+.. class:: warningmark
+
+**Warning: any lines containing chromosomes that don't map or are invalid will be removed.**
+
+-----
+
+**Example 1**
+
+Converting this dataset::
+
+  chr21 12      123
+  chrM  45      345
+
+by setting **Column** to *1*, **Delimiter** to *Tab* and **Mapping** file to *hg19 UCSC to ensembl* will produce::
+
+  21    12      123
+  MT    45      345
+
+-----
+
+**Example 2**
+
+Converting this dataset::
+
+  21    12      123
+  MT    45      345
+
+by setting **Column** to *1*, **Delimiter** to *Tab* and **Mapping** file to *hg19 ensembl to UCSC* will produce::
+
+  chr21 12      123
+  chrM  45      345
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/map_chrom_ensembl.txt	Wed Jun 01 03:48:29 2016 -0400
@@ -0,0 +1,93 @@
+1	chr1	1
+2	chr2	2
+3	chr3	3
+4	chr4	4
+5	chr5	5
+6	chr6	6
+7	chr7	7
+8	chr8	8
+9	chr9	9
+10	chr10	10
+11	chr11	11
+12	chr12	12
+13	chr13	13
+14	chr14	14
+15	chr15	15
+16	chr16	16
+17	chr17	17
+18	chr18	18
+19	chr19	19
+20	chr20	20
+21	chr21	21
+22	chr22	22
+X	chrX	X
+Y	chrY	Y
+MT	chrM	MT
+GL000191.1	chr1_gl000191_random	GL000191.1
+GL000192.1	chr1_gl000192_random	GL000192.1
+GL000193.1	chr4_gl000193_random	GL000193.1
+GL000194.1	chr4_gl000194_random	GL000194.1
+GL000195.1	chr7_gl000195_random	GL000195.1
+GL000196.1	chr8_gl000196_random	GL000196.1
+GL000197.1	chr8_gl000197_random	GL000197.1
+GL000198.1	chr9_gl000198_random	GL000198.1
+GL000199.1	chr9_gl000199_random	GL000199.1
+GL000200.1	chr9_gl000200_random	GL000200.1
+GL000201.1	chr9_gl000201_random	GL000201.1
+GL000202.1	chr11_gl000202_random	GL000202.1
+GL000203.1	chr17_gl000203_random	GL000203.1
+GL000204.1	chr17_gl000204_random	GL000204.1
+GL000205.1	chr17_gl000205_random	GL000205.1
+GL000206.1	chr17_gl000206_random	GL000206.1
+GL000207.1	chr18_gl000207_random	GL000207.1
+GL000208.1	chr19_gl000208_random	GL000208.1
+GL000209.1	chr19_gl000209_random	GL000209.1
+GL000210.1	chr21_gl000210_random	GL000210.1
+GL000211.1	chrUn_gl000211	GL000211.1
+GL000212.1	chrUn_gl000212	GL000212.1
+GL000213.1	chrUn_gl000213	GL000213.1
+GL000214.1	chrUn_gl000214	GL000214.1
+GL000215.1	chrUn_gl000215	GL000215.1
+GL000216.1	chrUn_gl000216	GL000216.1
+GL000217.1	chrUn_gl000217	GL000217.1
+GL000218.1	chrUn_gl000218	GL000218.1
+GL000219.1	chrUn_gl000219	GL000219.1
+GL000220.1	chrUn_gl000220	GL000220.1
+GL000221.1	chrUn_gl000221	GL000221.1
+GL000222.1	chrUn_gl000222	GL000222.1
+GL000223.1	chrUn_gl000223	GL000223.1
+GL000224.1	chrUn_gl000224	GL000224.1
+GL000225.1	chrUn_gl000225	GL000225.1
+GL000226.1	chrUn_gl000226	GL000226.1
+GL000227.1	chrUn_gl000227	GL000227.1
+GL000228.1	chrUn_gl000228	GL000228.1
+GL000229.1	chrUn_gl000229	GL000229.1
+GL000230.1	chrUn_gl000230	GL000230.1
+GL000231.1	chrUn_gl000231	GL000231.1
+GL000232.1	chrUn_gl000232	GL000232.1
+GL000233.1	chrUn_gl000233	GL000233.1
+GL000234.1	chrUn_gl000234	GL000234.1
+GL000235.1	chrUn_gl000235	GL000235.1
+GL000236.1	chrUn_gl000236	GL000236.1
+GL000237.1	chrUn_gl000237	GL000237.1
+GL000238.1	chrUn_gl000238	GL000238.1
+GL000239.1	chrUn_gl000239	GL000239.1
+GL000240.1	chrUn_gl000240	GL000240.1
+GL000241.1	chrUn_gl000241	GL000241.1
+GL000242.1	chrUn_gl000242	GL000242.1
+GL000243.1	chrUn_gl000243	GL000243.1
+GL000244.1	chrUn_gl000244	GL000244.1
+GL000245.1	chrUn_gl000245	GL000245.1
+GL000246.1	chrUn_gl000246	GL000246.1
+GL000247.1	chrUn_gl000247	GL000247.1
+GL000248.1	chrUn_gl000248	GL000248.1
+GL000249.1	chrUn_gl000249	GL000249.1
+HSCHR4_1	chr4_ctg9_hap1	HSCHR4_1
+HSCHR6_MHC_APD	chr6_apd_hap1	HSCHR6_MHC_APD
+HSCHR6_MHC_COX	chr6_cox_hap2	HSCHR6_MHC_COX
+HSCHR6_MHC_DBB	chr6_dbb_hap3	HSCHR6_MHC_DBB
+HSCHR6_MHC_MANN	chr6_mann_hap4	HSCHR6_MHC_MANN
+HSCHR6_MHC_MCF	chr6_mcf_hap5	HSCHR6_MHC_MCF
+HSCHR6_MHC_QBL	chr6_qbl_hap6	HSCHR6_MHC_QBL
+HSCHR6_MHC_SSTO	chr6_ssto_hap7	HSCHR6_MHC_SSTO
+HSCHR17_1	chr17_ctg5_hap1	HSCHR17_1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/map_chrom_ucsc.txt	Wed Jun 01 03:48:29 2016 -0400
@@ -0,0 +1,93 @@
+chr1	chr1	1
+chr2	chr2	2
+chr3	chr3	3
+chr4	chr4	4
+chr5	chr5	5
+chr6	chr6	6
+chr7	chr7	7
+chr8	chr8	8
+chr9	chr9	9
+chr10	chr10	10
+chr11	chr11	11
+chr12	chr12	12
+chr13	chr13	13
+chr14	chr14	14
+chr15	chr15	15
+chr16	chr16	16
+chr17	chr17	17
+chr18	chr18	18
+chr19	chr19	19
+chr20	chr20	20
+chr21	chr21	21
+chr22	chr22	22
+chrX	chrX	X
+chrY	chrY	Y
+chrM	chrM	MT
+chr1_gl000191_random	chr1_gl000191_random	GL000191.1
+chr1_gl000192_random	chr1_gl000192_random	GL000192.1
+chr4_gl000193_random	chr4_gl000193_random	GL000193.1
+chr4_gl000194_random	chr4_gl000194_random	GL000194.1
+chr7_gl000195_random	chr7_gl000195_random	GL000195.1
+chr8_gl000196_random	chr8_gl000196_random	GL000196.1
+chr8_gl000197_random	chr8_gl000197_random	GL000197.1
+chr9_gl000198_random	chr9_gl000198_random	GL000198.1
+chr9_gl000199_random	chr9_gl000199_random	GL000199.1
+chr9_gl000200_random	chr9_gl000200_random	GL000200.1
+chr9_gl000201_random	chr9_gl000201_random	GL000201.1
+chr11_gl000202_random	chr11_gl000202_random	GL000202.1
+chr17_gl000203_random	chr17_gl000203_random	GL000203.1
+chr17_gl000204_random	chr17_gl000204_random	GL000204.1
+chr17_gl000205_random	chr17_gl000205_random	GL000205.1
+chr17_gl000206_random	chr17_gl000206_random	GL000206.1
+chr18_gl000207_random	chr18_gl000207_random	GL000207.1
+chr19_gl000208_random	chr19_gl000208_random	GL000208.1
+chr19_gl000209_random	chr19_gl000209_random	GL000209.1
+chr21_gl000210_random	chr21_gl000210_random	GL000210.1
+chrUn_gl000211	chrUn_gl000211	GL000211.1
+chrUn_gl000212	chrUn_gl000212	GL000212.1
+chrUn_gl000213	chrUn_gl000213	GL000213.1
+chrUn_gl000214	chrUn_gl000214	GL000214.1
+chrUn_gl000215	chrUn_gl000215	GL000215.1
+chrUn_gl000216	chrUn_gl000216	GL000216.1
+chrUn_gl000217	chrUn_gl000217	GL000217.1
+chrUn_gl000218	chrUn_gl000218	GL000218.1
+chrUn_gl000219	chrUn_gl000219	GL000219.1
+chrUn_gl000220	chrUn_gl000220	GL000220.1
+chrUn_gl000221	chrUn_gl000221	GL000221.1
+chrUn_gl000222	chrUn_gl000222	GL000222.1
+chrUn_gl000223	chrUn_gl000223	GL000223.1
+chrUn_gl000224	chrUn_gl000224	GL000224.1
+chrUn_gl000225	chrUn_gl000225	GL000225.1
+chrUn_gl000226	chrUn_gl000226	GL000226.1
+chrUn_gl000227	chrUn_gl000227	GL000227.1
+chrUn_gl000228	chrUn_gl000228	GL000228.1
+chrUn_gl000229	chrUn_gl000229	GL000229.1
+chrUn_gl000230	chrUn_gl000230	GL000230.1
+chrUn_gl000231	chrUn_gl000231	GL000231.1
+chrUn_gl000232	chrUn_gl000232	GL000232.1
+chrUn_gl000233	chrUn_gl000233	GL000233.1
+chrUn_gl000234	chrUn_gl000234	GL000234.1
+chrUn_gl000235	chrUn_gl000235	GL000235.1
+chrUn_gl000236	chrUn_gl000236	GL000236.1
+chrUn_gl000237	chrUn_gl000237	GL000237.1
+chrUn_gl000238	chrUn_gl000238	GL000238.1
+chrUn_gl000239	chrUn_gl000239	GL000239.1
+chrUn_gl000240	chrUn_gl000240	GL000240.1
+chrUn_gl000241	chrUn_gl000241	GL000241.1
+chrUn_gl000242	chrUn_gl000242	GL000242.1
+chrUn_gl000243	chrUn_gl000243	GL000243.1
+chrUn_gl000244	chrUn_gl000244	GL000244.1
+chrUn_gl000245	chrUn_gl000245	GL000245.1
+chrUn_gl000246	chrUn_gl000246	GL000246.1
+chrUn_gl000247	chrUn_gl000247	GL000247.1
+chrUn_gl000248	chrUn_gl000248	GL000248.1
+chrUn_gl000249	chrUn_gl000249	GL000249.1
+chr4_ctg9_hap1	chr4_ctg9_hap1	HSCHR4_1
+chr6_apd_hap1	chr6_apd_hap1	HSCHR6_MHC_APD
+chr6_cox_hap2	chr6_cox_hap2	HSCHR6_MHC_COX
+chr6_dbb_hap3	chr6_dbb_hap3	HSCHR6_MHC_DBB
+chr6_mann_hap4	chr6_mann_hap4	HSCHR6_MHC_MANN
+chr6_mcf_hap5	chr6_mcf_hap5	HSCHR6_MHC_MCF
+chr6_qbl_hap6	chr6_qbl_hap6	HSCHR6_MHC_QBL
+chr6_ssto_hap7	chr6_ssto_hap7	HSCHR6_MHC_SSTO
+chr17_ctg5_hap1	chr17_ctg5_hap1	HSCHR17_1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/map_chromosomes.loc.sample	Wed Jun 01 03:48:29 2016 -0400
@@ -0,0 +1,23 @@
+#This file lists the locations and dbkeys of all the mapping files
+#
+# Mapping files can be obtained using 'git clone https://github.com/dpryan79/ChromosomeMappings.git'
+# See https://github.com/dpryan79/ChromosomeMappings
+#
+#<unique_id>	<display_name>	<file_path>
+#
+#
+#Contains one entry per file.
+#
+#
+#BDGP6_ensembl2UCSC	dm6 ensembl to UCSC	/galaxy/ChromosomeMappings/BDGP6_ensembl2UCSC.txt
+#BDGP6_UCSC2ensembl	dm6 UCSC to ensembl	/galaxy/ChromosomeMappings/BDGP6_UCSC2ensembl.txt
+#dm3_ensembl2UCSC	dm3 ensembl to UCSC	/galaxy/ChromosomeMappings/dm3_ensembl2UCSC.txt
+#dm3_UCSC2ensembl	dm3 UCSC to ensembl	/galaxy/ChromosomeMappings/dm3_UCSC2ensembl.txt
+#GRCh37_ensembl2UCSC	hg19 ensembl to UCSC	/galaxy/ChromosomeMappings/GRCh37_ensembl2UCSC.txt
+#GRCh37_UCSC2ensembl	hg19 UCSC to ensembl	/galaxy/ChromosomeMappings/GRCh37_UCSC2ensembl.txt
+#GRCh38_ensembl2UCSC	hg38 ensembl to UCSC	/galaxy/ChromosomeMappings/GRCh38_ensembl2UCSC.txt
+#GRCh38_UCSC2ensembl	hg38 UCSC to ensembl	/galaxy/ChromosomeMappings/GRCh38_UCSC2ensembl.txt
+#GRCm37_ensembl2UCSC	mm9 ensembl to UCSC	/galaxy/ChromosomeMappings/GRCm37_ensembl2UCSC.txt
+#GRCm37_UCSC2ensembl	mm9 UCSC to ensembl	/galaxy/ChromosomeMappings/GRCm37_UCSC2ensembl.txt
+#GRCm38_ensembl2UCSC	mm10 ensembl to UCSC	/galaxy/ChromosomeMappings/GRCm38_ensembl2UCSC.txt
+#GRCm38_UCSC2ensembl	mm10 UCSC to ensembl	/galaxy/ChromosomeMappings/GRCm38_UCSC2ensembl.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Jun 01 03:48:29 2016 -0400
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Chromosome mapping files; map_chromosomes.xml reads this table and passes the path column to the script as ${map.fields.path} -->
+    <table name="map_chromosomes" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/map_chromosomes.loc" />
+    </table>
+</tables>