nml / biohansel_bionumeric_converter
changeset 0:b000a3130db8 draft
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
| author | nml |
| --- | --- |
| date | Mon, 18 Mar 2019 13:15:57 -0400 |
| parents | |
| children | 07dfb8fd47f4 |
| files | bionumeric_convert.xml bionumeric_converter.py test-data/Output.csv test-data/results.tab |
| diffstat | 4 files changed, 99 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bionumeric_convert.xml	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,40 @@
+<tool id="bionumeric_convert" name="biohansel2bionumerics" version="0.1.0">
+    <description>compliant results</description>
+    <requirements>
+        <requirement type="package" version="0.24.1">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        $__tool_directory__/bionumeric_converter.py -f '$Input' -o '$output'
+    ]]></command>
+    <inputs>
+        <param type="data" name="Input" format="tabular"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="csv" from_work_dir="output" label="Output.csv"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="Input" value="results.tab"/>
+            <output name="output" value="Output.csv"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **What it does**
+
+        This tool is a supplementary script that takes *only* BioHansel output data and converts it into a format compatible with BioNumerics.
+
+        **How to run it**
+
+        1. Input any of your BioHansel output files (tech_results.tab, match_results.tab, and results.tab)
+        2. Click Execute
+
+        **Specific modifications done on the data**
+
+        1. Converts all commas in the output to "/"
+        2. Shortens BioHansel qc_messages if they are over 150 characters
+        3. Converts the .tab file to a .csv file
+
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
\ No newline at end of file
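Outside Galaxy, the `<command>` template above reduces to a plain call to `bionumeric_converter.py` with an input TSV and an output name. A minimal sketch of an equivalent stand-alone invocation, assuming the bundled test data is used as input (the file paths and output name here are illustrative, not part of the changeset):

```python
# Illustrative stand-alone run of the converter, mirroring the Galaxy
# <command> template ($__tool_directory__/bionumeric_converter.py -f ... -o ...).
# Paths and the output name are assumptions made for this example.
import subprocess

subprocess.run(
    [
        "python", "bionumeric_converter.py",
        "-f", "test-data/results.tab",  # any BioHansel results.tab-style output
        "-o", "Output.csv",             # BioNumerics-compatible CSV
    ],
    check=True,
)
```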
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bionumeric_converter.py	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+# Import dependencies needed
+import argparse
+
+import pandas as pd
+
+# Define the main function:
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-f',
+        '--filename',
+        required=True,
+        help='Specify your tsv input')
+    parser.add_argument(
+        '-o',
+        '--output',
+        default='output.csv',
+        help='Specify output name')
+    args = parser.parse_args()
+    tsv_file = args.filename
+    out_name = args.output
+
+    no_comma_tsv = comma_remover(tsv_file)
+    df = qc_shortener(no_comma_tsv)
+    df.to_csv(out_name, index=False)
+
+# Remove comma function:
+
+
+def comma_remover(tsv_file):
+    # Create a table from the tsv file as an input into the dataframe.
+    df = pd.read_csv(tsv_file, sep='\t')
+    # Change all commas to / in the QC message
+    no_comma_tsv = df.replace(',', '/', regex=True)
+    return no_comma_tsv
+
+# Shorten QC results:
+
+
+def qc_shortener(df):
+    for count in df.index:
+        message = str(df.at[count, 'qc_message'])
+        if len(message) > 150:
+            results = message.find('|')
+            new_message = "Truncated after first '|' : " + message[0:results]
+            df['qc_message'] = df['qc_message'].replace(message, new_message)
+    return df
+
+
+if __name__ == '__main__':
+    main()
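The script applies two transformations before writing the table back out as CSV: `comma_remover` substitutes "/" for every comma anywhere in the table, and `qc_shortener` truncates any `qc_message` longer than 150 characters at its first "|". A small sketch of those two steps on a toy DataFrame (the sample values below are invented for illustration):

```python
# Toy illustration of the two transformations performed by the script;
# the data here is made up and not part of the changeset.
import pandas as pd

df = pd.DataFrame({
    "sample": ["2019C-111"],
    "qc_message": ["Commas, like these, become slashes. | " + "x" * 150],
})

# Step 1 (comma_remover): replace every comma in every cell with "/".
df = df.replace(",", "/", regex=True)

# Step 2 (qc_shortener): truncate qc_message values longer than
# 150 characters at the first "|" and prefix them with a note.
for idx in df.index:
    message = str(df.at[idx, "qc_message"])
    if len(message) > 150:
        cut = message.find("|")
        new_message = "Truncated after first '|' : " + message[:cut]
        df["qc_message"] = df["qc_message"].replace(message, new_message)

print(df.to_csv(index=False))
```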
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Output.csv	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,2 @@
+sample,scheme,scheme_version,subtype,all_subtypes,tiles_matching_subtype,are_subtypes_consistent,inconsistent_subtypes,n_tiles_matching_all,n_tiles_matching_all_expected,n_tiles_matching_positive,n_tiles_matching_positive_expected,n_tiles_matching_subtype,n_tiles_matching_subtype_expected,file_path,avg_tile_coverage,qc_status,qc_message
+2019C-111,heidelberg,0.5.0,2.2.3.1.2,2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2,2.2.3.1.2,True,,202,202,14,14,3,3,['2019C-111_1.fastq'/ '2019C-111_2.fastq'],30.07,PASS,Truncated after first '|' : This is a trial to the cut /off/ system as this data all passed the checks.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results.tab	Mon Mar 18 13:15:57 2019 -0400
@@ -0,0 +1,2 @@
+sample	scheme	scheme_version	subtype	all_subtypes	tiles_matching_subtype	are_subtypes_consistent	inconsistent_subtypes	n_tiles_matching_all	n_tiles_matching_all_expected	n_tiles_matching_positive	n_tiles_matching_positive_expected	n_tiles_matching_subtype	n_tiles_matching_subtype_expected	file_path	avg_tile_coverage	qc_status	qc_message
+2019C-111	heidelberg	0.5.0	2.2.3.1.2	2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2	2.2.3.1.2	True		202	202	14	14	3	3	['2019C-111_1.fastq', '2019C-111_2.fastq']	30.070	PASS	This is a trial to the cut ,off, system as this data all passed the checks. | I will attemp to get 150 characters into here in a way that is not awful and sounds decent. We can try counting the letters and as of now, it should be ok!