Mercurial > repos > bgruening > replace_column_by_key_value_file

<tool id="replace_column_with_key_value_file" name="Replace column" version="0.2">
    <description>by values which are defined in a convert file</description>
    <!-- Run the script rendered from the "replaceColumnScript" configfile with the python found on PATH.
         The script writes its result to a file named "output_file" in the job working directory. -->
    <command><![CDATA[python '$replaceColumnScript']]></command>
    <configfiles>
        <configfile name="replaceColumnScript">
            <![CDATA[
## Replace the values of one column of a delimited text file using a
## two-column key/value conversion file. The result is written to a file
## named "output_file" in the current working directory.
import sys

## Tool parameters, substituted by the template engine before the script runs.
replace_file = '$replace_information'
original_file = '$original_file'
## Convert the selected column number to a 0-based list index.
column = int("$column_replace") - 1
ignore_start_lines = int("$skip_lines")
delimiter_local = "\t" if str("$delimiter") == "tab" else str("$delimiter")
comment_str = str("$pass_comments")
unk_strat = str("$unknowns_strategy")

## Build the key -> replacement index from the conversion file.
## Lines that do not consist of exactly two whitespace-separated tokens are ignored.
conversion = {}

with open(replace_file, 'r') as conversion_file:
    for line in conversion_file:
        conv_key_value = line.strip().split()
        if len(conv_key_value) == 2:
            conversion[conv_key_value[0]] = conv_key_value[1]

## Process the input line by line and replace the entry of the selected column.
with open("output_file", 'w') as output:
    with open(original_file) as original:
        for i, line in enumerate(original):
            ## Leading lines to skip and comment lines are copied verbatim.
            if i < ignore_start_lines or (comment_str and line.startswith(comment_str)):
                output.write(line)
                continue

            ## Strip only the line terminator. A bare rstrip() would also remove
            ## trailing delimiters and thereby drop empty trailing columns.
            line_content = line.rstrip('\r\n').split(delimiter_local)

            ## Lines without the selected column are written back unchanged.
            if 0 <= column < len(line_content):
                value = line_content[column]
                if value in conversion:
                    line_content[column] = conversion[value]
                elif unk_strat == "error":
                    raise Exception('ERROR: Encountered a value [%s] in the file that is not in the replacements file and is not commented with [%s]' % (value, comment_str))
                elif unk_strat != "print":
                    ## "skip" strategy: the whole line is left out of the output.
                    continue

            output.write('%s\n' % delimiter_local.join(line_content))

]]>
        </configfile>
    </configfiles>
    <inputs>
        <param name="original_file" type="data" format="tabular"
               label="File in which you want to replace some values"
               help="The entries of a specific column are replaced by the information given by the next input file." />
        <param name="replace_information" type="data" format="tabular"
               label="Replace information file"
               help="This file contains in the first column the entries that should be replaced by the values of the second column." />
        <param name="column_replace" type="data_column" data_ref="original_file" multiple="false"
               label="Which column should be replaced?" />
        <!-- min="0": a negative count would be silently treated like 0 by the script, so reject it in the form. -->
        <param name="skip_lines" type="integer" value="0" min="0"
               label="Skip this many starting lines"
               help="These lines are copied to the output unchanged." />
        <!-- The value "tab" is translated to a tab character by the script; every other value is used literally. -->
        <param name="delimiter" type="select" label="Delimited by">
            <option value="tab" selected="true">Tab</option>
            <option value=" ">Space</option>
            <option value=".">Dot</option>
            <option value=",">Comma</option>
            <option value="-">Dash</option>
            <option value="_">Underscore</option>
            <option value="|">Pipe</option>
        </param>
        <param name="unknowns_strategy" type="select" label="When an unknown value is encountered"
               help="'Skip' leaves the whole line out of the output.">
            <option value="skip" selected="true">Skip / Do not print</option>
            <option value="print">Print without modification</option>
            <option value="error">Exit with an error</option>
        </param>
        <param name="pass_comments" type="text" value="#"
               label="Do not perform replacement on lines starting with"
               help="Matching lines are copied to the output unchanged.">
            <!-- Allow '#' in addition to the default set of valid characters. -->
            <sanitizer>
                <valid>
                    <add value="#" />
                </valid>
            </sanitizer>
        </param>
    </inputs>
    <outputs>
        <!-- The output is a rewritten copy of the input table, so it inherits the input's datatype
             instead of being downgraded to plain text. The script writes it to "output_file". -->
        <data name="outfile_replace" format_source="original_file" from_work_dir="output_file"/>
    </outputs>
    <tests>
        <!-- Replace column 1 with one leading line skipped; lines with unknown values are left out. -->
        <test>
            <param name="original_file" value="original_file" ftype="tabular"/>
            <param name="replace_information" value="GRCh38_ensembl2UCSC.txt" ftype="tabular"/>
            <param name="column_replace" value="1"/>
            <param name="skip_lines" value="1"/>
            <param name="delimiter" value="tab"/>
            <param name="unknowns_strategy" value="skip"/>
            <param name="pass_comments" value="#"/>
            <output name="outfile_replace" file="result_file"/>
        </test>
        <!-- Same settings as above, run against the "empty_mapping" input. -->
        <test>
            <param name="original_file" value="empty_mapping" ftype="tabular"/>
            <param name="replace_information" value="GRCh38_ensembl2UCSC.txt" ftype="tabular"/>
            <param name="column_replace" value="1"/>
            <param name="skip_lines" value="1"/>
            <param name="delimiter" value="tab"/>
            <param name="unknowns_strategy" value="skip"/>
            <param name="pass_comments" value="#"/>
            <output name="outfile_replace" file="result_file_empty_mapping"/>
        </test>
        <!-- With the "error" strategy the job is expected to fail. -->
        <test expect_failure="true">
            <param name="original_file" value="neg_test_commented.txt" ftype="tabular"/>
            <param name="replace_information" value="neg_test_map.txt" ftype="tabular"/>
            <param name="column_replace" value="1"/>
            <param name="skip_lines" value="0"/>
            <param name="delimiter" value="tab"/>
            <param name="unknowns_strategy" value="error"/>
            <param name="pass_comments" value="#"/>
        </test>
        <!-- Same inputs with the "print" strategy; the expected output is the input file itself. -->
        <test>
            <param name="original_file" value="neg_test_commented.txt" ftype="tabular"/>
            <param name="replace_information" value="neg_test_map.txt" ftype="tabular"/>
            <param name="column_replace" value="1"/>
            <param name="skip_lines" value="0"/>
            <param name="delimiter" value="tab"/>
            <param name="unknowns_strategy" value="print"/>
            <param name="pass_comments" value="#"/>
            <output name="outfile_replace" file="neg_test_commented.txt"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**

This tool replaces the entries of a defined column with entries given by a replacement file.
For example, the replacement file may hold the Ensembl names of chromosomes in the first column and the corresponding UCSC names in the second column.
A file that contains chromosome names in Ensembl notation in column x can then be converted to a file that holds the same information in UCSC notation.

A useful repository for ensembl and UCSC chromosomes mapping is: https://github.com/dpryan79/ChromosomeMappings
        ]]>
    </help>
</tool>
author	bgruening
date	Sun, 23 Sep 2018 04:03:34 -0400
parents	cc18bac5afdb
children