nml / biohansel_bionumeric_converter
changeset 0:b000a3130db8 draft
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
| author | nml |
| --- | --- |
| date | Mon, 18 Mar 2019 13:15:57 -0400 |
| parents | |
| children | 07dfb8fd47f4 |
| files | bionumeric_convert.xml bionumeric_converter.py test-data/Output.csv test-data/results.tab |
| diffstat | 4 files changed, 99 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bionumeric_convert.xml	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,40 @@
+<tool id="bionumeric_convert" name="biohansel2bionumerics" version="0.1.0">
+    <description>compliant results</description>
+    <requirements>
+        <requirement type="package" version="0.24.1">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        $__tool_directory__/bionumeric_converter.py -f '$Input' -o '$output'
+    ]]></command>
+    <inputs>
+        <param type="data" name="Input" format="tabular"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="csv" from_work_dir="output" label="Output.csv"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="Input" value="results.tab"/>
+            <output name="output" value="Output.csv"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **What it does**
+
+        This tool is a supplementary script that takes *only* BioHansel output data and converts it into a format compatible with BioNumerics.
+
+        **How to run it**
+
+        1. Input any of your BioHansel output files (tech_results.tab, match_results.tab, and results.tab)
+        2. Click Execute
+
+        **Specific modifications done on the data**
+
+        1. Converts all commas in the output to "/"
+        2. Shortens BioHansel qc_messages if they are over 150 characters
+        3. Converts the .tab file to a .csv file
+
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
\ No newline at end of file
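Outside Galaxy, the `<command>` template above reduces to a plain call to `bionumeric_converter.py` with an input TSV and an output name. A minimal sketch of an equivalent stand-alone invocation, assuming the bundled test data is used as input (the file paths and output name here are illustrative, not part of the changeset):

```python
# Illustrative stand-alone run of the converter, mirroring the Galaxy
# <command> template ($__tool_directory__/bionumeric_converter.py -f ... -o ...).
# Paths and the output name are assumptions made for this example.
import subprocess

subprocess.run(
    [
        "python", "bionumeric_converter.py",
        "-f", "test-data/results.tab",  # any BioHansel results.tab-style output
        "-o", "Output.csv",             # BioNumerics-compatible CSV
    ],
    check=True,
)
```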
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bionumeric_converter.py	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+# Import dependencies needed
+import argparse
+
+import pandas as pd
+
+# Define the main function:
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-f',
+        '--filename',
+        required=True,
+        help='Specify your tsv input')
+    parser.add_argument(
+        '-o',
+        '--output',
+        default='output.csv',
+        help='Specify output name')
+    args = parser.parse_args()
+    tsv_file = args.filename
+    out_name = args.output
+
+    no_comma_tsv = comma_remover(tsv_file)
+    df = qc_shortener(no_comma_tsv)
+    df.to_csv(out_name, index=False)
+
+# Remove comma function:
+
+
+def comma_remover(tsv_file):
+    # Create a table from the tsv file as an input into the dataframe.
+    df = pd.read_csv(tsv_file, sep='\t')
+    # Change all commas to / in the QC message
+    no_comma_tsv = df.replace(',', '/', regex=True)
+    return no_comma_tsv
+
+# Shorten QC results:
+
+
+def qc_shortener(df):
+    for count in df.index:
+        message = str(df.at[count, 'qc_message'])
+        if len(message) > 150:
+            results = message.find('|')
+            new_message = "Truncated after first '|' : " + message[0:results]
+            df['qc_message'] = df['qc_message'].replace(message, new_message)
+    return df
+
+
+if __name__ == '__main__':
+    main()
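The script applies two transformations before writing the table back out as CSV: `comma_remover` substitutes "/" for every comma anywhere in the table, and `qc_shortener` truncates any `qc_message` longer than 150 characters at its first "|". A small sketch of those two steps on a toy DataFrame (the sample values below are invented for illustration):

```python
# Toy illustration of the two transformations performed by the script;
# the data here is made up and not part of the changeset.
import pandas as pd

df = pd.DataFrame({
    "sample": ["2019C-111"],
    "qc_message": ["Commas, like these, become slashes. | " + "x" * 150],
})

# Step 1 (comma_remover): replace every comma in every cell with "/".
df = df.replace(",", "/", regex=True)

# Step 2 (qc_shortener): truncate qc_message values longer than
# 150 characters at the first "|" and prefix them with a note.
for idx in df.index:
    message = str(df.at[idx, "qc_message"])
    if len(message) > 150:
        cut = message.find("|")
        new_message = "Truncated after first '|' : " + message[:cut]
        df["qc_message"] = df["qc_message"].replace(message, new_message)

print(df.to_csv(index=False))
```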
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Output.csv	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,2 @@
+sample,scheme,scheme_version,subtype,all_subtypes,tiles_matching_subtype,are_subtypes_consistent,inconsistent_subtypes,n_tiles_matching_all,n_tiles_matching_all_expected,n_tiles_matching_positive,n_tiles_matching_positive_expected,n_tiles_matching_subtype,n_tiles_matching_subtype_expected,file_path,avg_tile_coverage,qc_status,qc_message
+2019C-111,heidelberg,0.5.0,2.2.3.1.2,2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2,2.2.3.1.2,True,,202,202,14,14,3,3,['2019C-111_1.fastq'/ '2019C-111_2.fastq'],30.07,PASS,Truncated after first '|' : This is a trial to the cut /off/ system as this data all passed the checks.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results.tab	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,2 @@
+sample	scheme	scheme_version	subtype	all_subtypes	tiles_matching_subtype	are_subtypes_consistent	inconsistent_subtypes	n_tiles_matching_all	n_tiles_matching_all_expected	n_tiles_matching_positive	n_tiles_matching_positive_expected	n_tiles_matching_subtype	n_tiles_matching_subtype_expected	file_path	avg_tile_coverage	qc_status	qc_message
+2019C-111	heidelberg	0.5.0	2.2.3.1.2	2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2	2.2.3.1.2	True		202	202	14	14	3	3	['2019C-111_1.fastq', '2019C-111_2.fastq']	30.070	PASS	This is a trial to the cut ,off, system as this data all passed the checks. | I will attemp to get 150 characters into here in a way that is not awful and sounds decent. We can try counting the letters and as of now, it should be ok!