Mercurial > repos > pmac > map_chromosomes
changeset 0:e60f92a8e1c8 draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:48:29 -0400 |
parents | |
children | |
files | map_chromosomes.pl map_chromosomes.xml test-data/map_chrom_ensembl.txt test-data/map_chrom_ucsc.txt tool-data/map_chromosomes.loc.sample tool_data_table_conf.xml.sample |
diffstat | 6 files changed, 377 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#! /usr/bin/perl -w

# map_chromosomes.pl [input] [col] [delimiter] [map] [out_file1]
#
# A wrapper for converting between UCSC and ensembl chromosome
# representations from within Galaxy.
#
#   input      delimited text file whose chromosome column is converted
#   col        1-based number of the column holding the chromosome names
#   delimiter  one of TAB, COMMA, DASH, UNDERSCORE, PIPE, DOT, SPACE
#   map        tab-separated mapping file: <old name> TAB <new name>.
#              Lines starting with '#' are ignored; an entry without a new
#              name marks a chromosome whose lines are dropped.
#   out_file1  file the converted lines are written to
#
# Lines starting with '#' whose chromosome column is not in the map are
# copied through unchanged.  Every other line that cannot be converted is
# dropped and reported on STDOUT.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Literal delimiter character for each delimiter name accepted on the
# command line.  The split pattern is derived from it with \Q...\E, so no
# separate table of regex-escaped delimiters is needed.
my %delimiter_for = (
    'TAB'        => "\t",
    'COMMA'      => ",",
    'DASH'       => "-",
    'UNDERSCORE' => "_",
    'PIPE'       => "|",
    'DOT'        => ".",
    'SPACE'      => " ",
);

die "Check arguments: $0 [input] [col] [delimiter] [map] [out_file1]\n" unless @ARGV == 5;

my ($input, $col_arg, $delim, $map_file, $output) = @ARGV;

die "No columns specified: $col_arg\n" unless looks_like_number($col_arg);
die "Delimeter must be one of TAB, COMMA, DASH, UNDERSCORE, PIPE, DOT, SPACE\n"
    unless exists $delimiter_for{$delim};

# Column numbers are 1-based on the command line.  Reject 0, negative and
# fractional values: as array indices they would silently address a column
# counted from the end of the row (or a truncated index).
(my $col_number = $col_arg) =~ s/\s+//g;
die "Column must be a positive integer: $col_arg\n"
    unless $col_number =~ /\A[0-9]+\z/ && $col_number > 0;

my $col        = $col_number - 1;
my $delim_char = $delimiter_for{$delim};

# Load the mapping.  A value of undef marks a chromosome that is known but
# has no counterpart, so its lines are removed from the output.
my %chr_map;
open(my $map_fh, '<', $map_file) or die "Cannot open map file $map_file:$!\n";
while (my $line = <$map_fh>) {
    $line =~ s/\r?\n\z//;    # strip LF or CRLF; never eats a real character
    next if $line eq '' || $line =~ /^#/;
    my ($from, $to) = split /\t/, $line;
    next unless defined $from;
    $chr_map{$from} = (defined $to && length $to) ? $to : undef;
}
close $map_fh;

open(my $in_fh,  '<', $input)  or die "Cannot open $input:$!\n";
open(my $out_fh, '>', $output) or die "Cannot create $output:$!\n";
while (my $line = <$in_fh>) {
    chomp $line;

    # LIMIT -1 keeps trailing empty fields so rows are written back with
    # the same number of columns they came in with.
    my @fields = split /\Q$delim_char\E/, $line, -1;
    my $chrom  = $fields[$col];

    if (defined $chrom && exists $chr_map{$chrom}) {
        my $mapped = $chr_map{$chrom};
        if (defined $mapped) {
            $fields[$col] = $mapped;
            print {$out_fh} join($delim_char, @fields), "\n";
        }
        else {
            print "Removed line \"$line\" as chromosome does not have a proper mapping\n";
        }
    }
    elsif ($line =~ /^#/) {
        # Comment/header lines are passed through untouched.
        print {$out_fh} join($delim_char, @fields), "\n";
    }
    else {
        # The column may be missing entirely on short or blank lines.
        my $shown = defined $chrom ? $chrom : '';
        print "Removed line \"$line\" as \"$shown\" is not a valid chromosome name\n";
    }
}
close $in_fh;
# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Cannot write $output:$!\n";
<tool id="map_chromosomes" name="Convert between UCSC and ensembl chromosome names" version="0.0.1">
    <description>chromosome representation</description>
    <!-- Arguments are quoted so that paths containing spaces or shell
         metacharacters reach the script as single arguments. -->
    <command interpreter="perl">map_chromosomes.pl '$input1' '$col' '$delimiter' '${map.fields.path}' '$out_file1'</command>
    <inputs>
        <param format="tabular,txt" name="input1" type="data" label="this dataset"/>
        <param name="col" type="integer" value="1" label="Column containing chromosome ids"/>
        <!-- The option values must match the delimiter names accepted by map_chromosomes.pl. -->
        <param name="delimiter" type="select" label="Delimited by">
            <option value="TAB">Tab</option>
            <!-- The script splits on a single space character, not on runs of whitespace. -->
            <option value="SPACE">Space</option>
            <option value="DOT">Dot</option>
            <option value="COMMA">Comma</option>
            <option value="DASH">Dash</option>
            <option value="UNDERSCORE">Underscore</option>
            <option value="PIPE">Pipe</option>
        </param>
        <!-- Mapping files come from the "map_chromosomes" data table; the selected entry's path is passed to the script. -->
        <param name="map" type="select" label="Select a mapping file" help="Ensure you select the correct mapping file">
            <options from_data_table="map_chromosomes">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No mapping files are available"/>
            </options>
        </param>
    </inputs>
    <outputs>
        <data name="out_file1" format="input" metadata_source="input1"/>
    </outputs>

    <tests>
        <test>
            <param name="col" value="1"/>
            <param name="delimiter" value="TAB"/>
            <param name="map" value="hg19 UCSC to ensembl"/>
            <param name="input1" value="map_chrom_ucsc.txt" dbkey="hg19" />
            <output name="out_file1" file="map_chrom_ensembl.txt"/>
        </test>
        <test>
            <param name="col" value="1"/>
            <param name="delimiter" value="TAB"/>
            <param name="map" value="hg19 ensembl to UCSC"/>
            <param name="input1" value="map_chrom_ensembl.txt" dbkey="hg19" />
            <output name="out_file1" file="map_chrom_ucsc.txt"/>
        </test>
    </tests>
    <help>

**What it does**

Converts a column between the UCSC and ensembl representation of chromosomes.

.. class:: warningmark

**Warning: any lines containing chromosomes that don't map or are invalid will be removed**

-----

**Example 1**

Converting this dataset::

    chr21   12   123
    chrM    45   345

by setting **Column** to *1*, **Delimiter** to *Tab* and **Mapping** file to *hg19 UCSC to ensembl* will produce::

    21   12   123
    MT   45   345

-----

**Example 2**

Converting this dataset::

    21   12   123
    MT   45   345

by setting **Column** to *1*, **Delimiter** to *Tab* and **Mapping** file to *hg19 ensembl to UCSC* will produce::

    chr21   12   123
    chrM    45   345

    </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/map_chrom_ensembl.txt Wed Jun 01 03:48:29 2016 -0400 @@ -0,0 +1,93 @@ +1 chr1 1 +2 chr2 2 +3 chr3 3 +4 chr4 4 +5 chr5 5 +6 chr6 6 +7 chr7 7 +8 chr8 8 +9 chr9 9 +10 chr10 10 +11 chr11 11 +12 chr12 12 +13 chr13 13 +14 chr14 14 +15 chr15 15 +16 chr16 16 +17 chr17 17 +18 chr18 18 +19 chr19 19 +20 chr20 20 +21 chr21 21 +22 chr22 22 +X chrX X +Y chrY Y +MT chrM MT +GL000191.1 chr1_gl000191_random GL000191.1 +GL000192.1 chr1_gl000192_random GL000192.1 +GL000193.1 chr4_gl000193_random GL000193.1 +GL000194.1 chr4_gl000194_random GL000194.1 +GL000195.1 chr7_gl000195_random GL000195.1 +GL000196.1 chr8_gl000196_random GL000196.1 +GL000197.1 chr8_gl000197_random GL000197.1 +GL000198.1 chr9_gl000198_random GL000198.1 +GL000199.1 chr9_gl000199_random GL000199.1 +GL000200.1 chr9_gl000200_random GL000200.1 +GL000201.1 chr9_gl000201_random GL000201.1 +GL000202.1 chr11_gl000202_random GL000202.1 +GL000203.1 chr17_gl000203_random GL000203.1 +GL000204.1 chr17_gl000204_random GL000204.1 +GL000205.1 chr17_gl000205_random GL000205.1 +GL000206.1 chr17_gl000206_random GL000206.1 +GL000207.1 chr18_gl000207_random GL000207.1 +GL000208.1 chr19_gl000208_random GL000208.1 +GL000209.1 chr19_gl000209_random GL000209.1 +GL000210.1 chr21_gl000210_random GL000210.1 +GL000211.1 chrUn_gl000211 GL000211.1 +GL000212.1 chrUn_gl000212 GL000212.1 +GL000213.1 chrUn_gl000213 GL000213.1 +GL000214.1 chrUn_gl000214 GL000214.1 +GL000215.1 chrUn_gl000215 GL000215.1 +GL000216.1 chrUn_gl000216 GL000216.1 +GL000217.1 chrUn_gl000217 GL000217.1 +GL000218.1 chrUn_gl000218 GL000218.1 +GL000219.1 chrUn_gl000219 GL000219.1 +GL000220.1 chrUn_gl000220 GL000220.1 +GL000221.1 chrUn_gl000221 GL000221.1 +GL000222.1 chrUn_gl000222 GL000222.1 +GL000223.1 chrUn_gl000223 GL000223.1 +GL000224.1 chrUn_gl000224 GL000224.1 +GL000225.1 chrUn_gl000225 GL000225.1 +GL000226.1 chrUn_gl000226 GL000226.1 +GL000227.1 chrUn_gl000227 GL000227.1 +GL000228.1 chrUn_gl000228 GL000228.1 
+GL000229.1 chrUn_gl000229 GL000229.1 +GL000230.1 chrUn_gl000230 GL000230.1 +GL000231.1 chrUn_gl000231 GL000231.1 +GL000232.1 chrUn_gl000232 GL000232.1 +GL000233.1 chrUn_gl000233 GL000233.1 +GL000234.1 chrUn_gl000234 GL000234.1 +GL000235.1 chrUn_gl000235 GL000235.1 +GL000236.1 chrUn_gl000236 GL000236.1 +GL000237.1 chrUn_gl000237 GL000237.1 +GL000238.1 chrUn_gl000238 GL000238.1 +GL000239.1 chrUn_gl000239 GL000239.1 +GL000240.1 chrUn_gl000240 GL000240.1 +GL000241.1 chrUn_gl000241 GL000241.1 +GL000242.1 chrUn_gl000242 GL000242.1 +GL000243.1 chrUn_gl000243 GL000243.1 +GL000244.1 chrUn_gl000244 GL000244.1 +GL000245.1 chrUn_gl000245 GL000245.1 +GL000246.1 chrUn_gl000246 GL000246.1 +GL000247.1 chrUn_gl000247 GL000247.1 +GL000248.1 chrUn_gl000248 GL000248.1 +GL000249.1 chrUn_gl000249 GL000249.1 +HSCHR4_1 chr4_ctg9_hap1 HSCHR4_1 +HSCHR6_MHC_APD chr6_apd_hap1 HSCHR6_MHC_APD +HSCHR6_MHC_COX chr6_cox_hap2 HSCHR6_MHC_COX +HSCHR6_MHC_DBB chr6_dbb_hap3 HSCHR6_MHC_DBB +HSCHR6_MHC_MANN chr6_mann_hap4 HSCHR6_MHC_MANN +HSCHR6_MHC_MCF chr6_mcf_hap5 HSCHR6_MHC_MCF +HSCHR6_MHC_QBL chr6_qbl_hap6 HSCHR6_MHC_QBL +HSCHR6_MHC_SSTO chr6_ssto_hap7 HSCHR6_MHC_SSTO +HSCHR17_1 chr17_ctg5_hap1 HSCHR17_1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/map_chrom_ucsc.txt Wed Jun 01 03:48:29 2016 -0400 @@ -0,0 +1,93 @@ +chr1 chr1 1 +chr2 chr2 2 +chr3 chr3 3 +chr4 chr4 4 +chr5 chr5 5 +chr6 chr6 6 +chr7 chr7 7 +chr8 chr8 8 +chr9 chr9 9 +chr10 chr10 10 +chr11 chr11 11 +chr12 chr12 12 +chr13 chr13 13 +chr14 chr14 14 +chr15 chr15 15 +chr16 chr16 16 +chr17 chr17 17 +chr18 chr18 18 +chr19 chr19 19 +chr20 chr20 20 +chr21 chr21 21 +chr22 chr22 22 +chrX chrX X +chrY chrY Y +chrM chrM MT +chr1_gl000191_random chr1_gl000191_random GL000191.1 +chr1_gl000192_random chr1_gl000192_random GL000192.1 +chr4_gl000193_random chr4_gl000193_random GL000193.1 +chr4_gl000194_random chr4_gl000194_random GL000194.1 +chr7_gl000195_random chr7_gl000195_random GL000195.1 +chr8_gl000196_random chr8_gl000196_random GL000196.1 +chr8_gl000197_random chr8_gl000197_random GL000197.1 +chr9_gl000198_random chr9_gl000198_random GL000198.1 +chr9_gl000199_random chr9_gl000199_random GL000199.1 +chr9_gl000200_random chr9_gl000200_random GL000200.1 +chr9_gl000201_random chr9_gl000201_random GL000201.1 +chr11_gl000202_random chr11_gl000202_random GL000202.1 +chr17_gl000203_random chr17_gl000203_random GL000203.1 +chr17_gl000204_random chr17_gl000204_random GL000204.1 +chr17_gl000205_random chr17_gl000205_random GL000205.1 +chr17_gl000206_random chr17_gl000206_random GL000206.1 +chr18_gl000207_random chr18_gl000207_random GL000207.1 +chr19_gl000208_random chr19_gl000208_random GL000208.1 +chr19_gl000209_random chr19_gl000209_random GL000209.1 +chr21_gl000210_random chr21_gl000210_random GL000210.1 +chrUn_gl000211 chrUn_gl000211 GL000211.1 +chrUn_gl000212 chrUn_gl000212 GL000212.1 +chrUn_gl000213 chrUn_gl000213 GL000213.1 +chrUn_gl000214 chrUn_gl000214 GL000214.1 +chrUn_gl000215 chrUn_gl000215 GL000215.1 +chrUn_gl000216 chrUn_gl000216 GL000216.1 +chrUn_gl000217 chrUn_gl000217 GL000217.1 +chrUn_gl000218 chrUn_gl000218 GL000218.1 +chrUn_gl000219 chrUn_gl000219 GL000219.1 +chrUn_gl000220 chrUn_gl000220 
GL000220.1 +chrUn_gl000221 chrUn_gl000221 GL000221.1 +chrUn_gl000222 chrUn_gl000222 GL000222.1 +chrUn_gl000223 chrUn_gl000223 GL000223.1 +chrUn_gl000224 chrUn_gl000224 GL000224.1 +chrUn_gl000225 chrUn_gl000225 GL000225.1 +chrUn_gl000226 chrUn_gl000226 GL000226.1 +chrUn_gl000227 chrUn_gl000227 GL000227.1 +chrUn_gl000228 chrUn_gl000228 GL000228.1 +chrUn_gl000229 chrUn_gl000229 GL000229.1 +chrUn_gl000230 chrUn_gl000230 GL000230.1 +chrUn_gl000231 chrUn_gl000231 GL000231.1 +chrUn_gl000232 chrUn_gl000232 GL000232.1 +chrUn_gl000233 chrUn_gl000233 GL000233.1 +chrUn_gl000234 chrUn_gl000234 GL000234.1 +chrUn_gl000235 chrUn_gl000235 GL000235.1 +chrUn_gl000236 chrUn_gl000236 GL000236.1 +chrUn_gl000237 chrUn_gl000237 GL000237.1 +chrUn_gl000238 chrUn_gl000238 GL000238.1 +chrUn_gl000239 chrUn_gl000239 GL000239.1 +chrUn_gl000240 chrUn_gl000240 GL000240.1 +chrUn_gl000241 chrUn_gl000241 GL000241.1 +chrUn_gl000242 chrUn_gl000242 GL000242.1 +chrUn_gl000243 chrUn_gl000243 GL000243.1 +chrUn_gl000244 chrUn_gl000244 GL000244.1 +chrUn_gl000245 chrUn_gl000245 GL000245.1 +chrUn_gl000246 chrUn_gl000246 GL000246.1 +chrUn_gl000247 chrUn_gl000247 GL000247.1 +chrUn_gl000248 chrUn_gl000248 GL000248.1 +chrUn_gl000249 chrUn_gl000249 GL000249.1 +chr4_ctg9_hap1 chr4_ctg9_hap1 HSCHR4_1 +chr6_apd_hap1 chr6_apd_hap1 HSCHR6_MHC_APD +chr6_cox_hap2 chr6_cox_hap2 HSCHR6_MHC_COX +chr6_dbb_hap3 chr6_dbb_hap3 HSCHR6_MHC_DBB +chr6_mann_hap4 chr6_mann_hap4 HSCHR6_MHC_MANN +chr6_mcf_hap5 chr6_mcf_hap5 HSCHR6_MHC_MCF +chr6_qbl_hap6 chr6_qbl_hap6 HSCHR6_MHC_QBL +chr6_ssto_hap7 chr6_ssto_hap7 HSCHR6_MHC_SSTO +chr17_ctg5_hap1 chr17_ctg5_hap1 HSCHR17_1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/map_chromosomes.loc.sample Wed Jun 01 03:48:29 2016 -0400 @@ -0,0 +1,23 @@ +#This file lists the locations and dbkeys of all the mapping files +# +# Mapping files can be obtained using 'git clone https://github.com/dpryan79/ChromosomeMappings.git' +# See https://github.com/dpryan79/ChromosomeMappings +# +#<unique_build_id> <dbkey> <file_path> +# +# +#Contains one entry per file. +# +# +#BDGP6_ensembl2UCSC dm6 ensembl to UCSC /galaxy/ChromosomeMappings/BDGP6_ensembl2UCSC.txt +#BDGP6_UCSC2ensembl dm6 UCSC to ensembl /galaxy/ChromosomeMappings/BDGP6_UCSC2ensembl.txt +#dm3_ensembl2UCSC dm3 ensembl to UCSC /galaxy/ChromosomeMappings/dm3_ensembl2UCSC.txt +#dm3_UCSC2ensembl dm3 UCSC to ensembl /galaxy/ChromosomeMappings/dm3_UCSC2ensembl.txt +#GRCh37_ensembl2UCSC hg19 ensembl to UCSC /galaxy/ChromosomeMappings/GRCh37_ensembl2UCSC.txt +#GRCh37_UCSC2ensembl hg19 UCSC to ensembl /galaxy/ChromosomeMappings/GRCh37_UCSC2ensembl.txt +#GRCh38_ensembl2UCSC hg38 ensembl to UCSC /galaxy/ChromosomeMappings/GRCh38_ensembl2UCSC.txt +#GRCh38_UCSC2ensembl hg38 UCSC to ensembl /galaxy/ChromosomeMappings/GRCh38_UCSC2ensembl.txt +#GRCm37_ensembl2UCSC mm9 ensembl to UCSC /galaxy/ChromosomeMappings/GRCm37_ensembl2UCSC.txt +#GRCm37_UCSC2ensembl mm9 UCSC to ensembl /galaxy/ChromosomeMappings/GRCm37_UCSC2ensembl.txt +#GRCm38_ensembl2UCSC mm10 ensembl to UCSC /galaxy/ChromosomeMappings/GRCm38_ensembl2UCSC.txt +#GRCm38_UCSC2ensembl mm10 UCSC to ensembl /galaxy/ChromosomeMappings/GRCm38_UCSC2ensembl.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Jun 01 03:48:29 2016 -0400 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of the chromosome mapping files offered by the map_chromosomes tool's "Select a mapping file" parameter --> + <table name="map_chromosomes" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/map_chromosomes.loc" /> + </table> +</tables>