Mercurial > repos > nml > biohansel_bionumeric_converter

--- a/bionumeric_convert.xml	Mon Mar 18 13:15:57 2019 -0400
+++ b/bionumeric_convert.xml	Mon May 13 12:59:15 2019 -0400
@@ -1,38 +1,41 @@
-<tool id="bionumeric_convert" name="biohansel2bionumerics" version="0.1.0">
+<tool id="bionumeric_convert" name="biohansel2bionumerics" version="0.2.0">
     <description>compliant results</description>
     <requirements>
         <requirement type="package" version="0.24.1">pandas</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        $__tool_directory__/bionumeric_converter.py -f '$Input' -o '$output'
+        $__tool_directory__/bionumeric_converter.py -f '$Input' -o '$Output'
     ]]></command>
     <inputs>
         <param type="data" name="Input" format="tabular"/>
     </inputs>
     <outputs>
-        <data name="output" format="csv" from_work_dir="output" label="Output.csv"/>
+        <data name="Output" format="csv" from_work_dir="output" label="Biohansel_Bionumerics"/>
     </outputs>
     <tests>
         <test>
             <param name="Input" value="results.tab"/>
-            <output name="output" value="Output.csv"/>
+            <output name="Output" value="Biohansel_Bionumerics.csv"/>
         </test>
     </tests>
     <help><![CDATA[
         **What it does**

-        This tool is a supplementary script that takes *only* BioHansel output data and converts it into a format compatible with bionumerics.
+        This tool is a supplementary script that takes Biohansel output data and converts it into a format compatible with Bionumerics.

-        **How to run it**
+        **Inputs:**
+
+        - *Individual* output or *Collection* of outputs for any of the three Biohansel results files (tech_results.tab, match_results.tab, or results.tab)

-        1. Input any of your BioHansel output files (tech_results.tab, match_results.tab, and results.tab)
-        2. Click Execute
+        **Outputs:**

-        **Specific modifications done on the data**
+        - A .CSV file (labelled "*Biohansel_Bionumerics*" in the history) or a collection of .CSV files, which can be renamed and downloaded as required.
+
+        **Specific modifications done to the data**

         1. Converts all commas in the output to "/"
-        2. Shortens BioHansel qc_messages if they are over 150 characters
-        3. Converts the .tab file to a .csv file
+        2. Splits the Biohansel qc_message column into additional columns (qc_message_1, qc_message_2, ...) of at most 150 characters each when a message is longer than 150 characters
+        3. Converts the .tab or .tsv file to a .csv file

     ]]></help>
     <citations>
--- a/bionumeric_converter.py	Mon Mar 18 13:15:57 2019 -0400
+++ b/bionumeric_converter.py	Mon May 13 12:59:15 2019 -0400
@@ -14,7 +14,7 @@
         '-f',
         '--filename',
         required=True,
-        help='Specify your tsv input')
+        help='Specify your biohansel tsv or other tabular separated input')
     parser.add_argument(
         '-o',
         '--output',
@@ -24,30 +24,27 @@
     tsv_file = args.filename
     out_name = args.output

-    no_comma_tsv = comma_remover(tsv_file)
-    df = qc_shortener(no_comma_tsv)
-    df.to_csv(out_name, index=False)
-
-# Remove comma function:
-
+    df_input = pd.read_csv(tsv_file, sep='\t')

-def comma_remover(tsv_file):
-    # Create a table from the tsv file as an input into the dataframe.
-    df = pd.read_csv(tsv_file, sep='\t')
-    # Change all commas to / in the QC message
-    no_comma_tsv = df.replace(',', '/', regex=True)
-    return no_comma_tsv
+    df_no_comma = df_input.replace(',', '/', regex=True)
+    df = qc_shortener(df_no_comma)
+    df.to_csv(out_name, index=False)

 # Shorten QC results:


+def splittingstrings(string, length):
+    return (string[0+i:length+i] for i in range(0, len(string), length))
+
+
 def qc_shortener(df):
-    for count in df.index:
-        message = str(df.at[count, 'qc_message'])
+    for i, row in df.iterrows():
+        message = str(row['qc_message'])
         if len(message) > 150:
-            results = message.find('|')
-            new_message = "Truncated after first '|' : " + message[0:results]
-            df['qc_message'] = df['qc_message'].replace(message, new_message)
+            message_list = list(splittingstrings(message, 150))
+            df.at[i, 'qc_message'] = message_list[0]
+            for val in range(1, len(message_list)):
+                df.at[i, 'qc_message_{}'.format(val)] = message_list[val]
     return df
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Biohansel_Bionumerics.csv	Mon May 13 12:59:15 2019 -0400
@@ -0,0 +1,4 @@
+sample,subtype,avg_tile_coverage,qc_status,qc_message,qc_message_1
+SRR1645238,1.3,43.345,PASS,,
+SRR1753252,1.1,32.33,PASS,FAIL: This is a test of the cut off system. The data is good and as such I have to manually type this message in to get it to cut off. I am adding in ,5 comas /////
+SRR1928313,1.1.1,555.11,PASS,,
--- a/test-data/Output.csv	Mon Mar 18 13:15:57 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-sample,scheme,scheme_version,subtype,all_subtypes,tiles_matching_subtype,are_subtypes_consistent,inconsistent_subtypes,n_tiles_matching_all,n_tiles_matching_all_expected,n_tiles_matching_positive,n_tiles_matching_positive_expected,n_tiles_matching_subtype,n_tiles_matching_subtype_expected,file_path,avg_tile_coverage,qc_status,qc_message
-2019C-111,heidelberg,0.5.0,2.2.3.1.2,2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2,2.2.3.1.2,True,,202,202,14,14,3,3,['2019C-111_1.fastq'/ '2019C-111_2.fastq'],30.07,PASS,Truncated after first '|' : This is a trial to the cut /off/ system as this data all passed the checks.
--- a/test-data/results.tab	Mon Mar 18 13:15:57 2019 -0400
+++ b/test-data/results.tab	Mon May 13 12:59:15 2019 -0400
@@ -1,2 +1,4 @@
-sample	scheme	scheme_version	subtype	all_subtypes	tiles_matching_subtype	are_subtypes_consistent	inconsistent_subtypes	n_tiles_matching_all	n_tiles_matching_all_expected	n_tiles_matching_positive	n_tiles_matching_positive_expected	n_tiles_matching_subtype	n_tiles_matching_subtype_expected	file_path	avg_tile_coverage	qc_status	qc_message
-2019C-111	heidelberg	0.5.0	2.2.3.1.2	2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2	2.2.3.1.2	True		202	202	14	14	3	3	['2019C-111_1.fastq', '2019C-111_2.fastq']	30.070	PASS	This is a trial to the cut ,off, system as this data all passed the checks. | I will attemp to get 150 characters into here in a way that is not awful and sounds decent. We can try counting the letters and as of now, it should be ok!
+sample	subtype	avg_tile_coverage	qc_status	qc_message
+SRR1645238	1.3	43.345	PASS
+SRR1753252	1.1	32.33	PASS	"FAIL: This is a test of the cut off system. The data is good and as such I have to manually type this message in to get it to cut off. I am adding in 5 comas ,,,,,"
+SRR1928313	1.1.1	555.11	PASS