changeset 6:dd451e45681c draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_primer_scheme_bedfiles commit 4880dcfcdddd9ed8415ccde01b2f8e2c28dab5c3"
author iuc
date Tue, 16 Nov 2021 08:22:08 +0000
parents be70da9dc013
children
files data_manager/install_primer_scheme_bedfiles.py data_manager/install_primer_scheme_bedfiles.xml test-data/data_manager_primer_scheme_bedfiles.json.template test-data/primer_scheme_bedfiles.loc
diffstat 4 files changed, 102 insertions(+), 44 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/install_primer_scheme_bedfiles.py	Sat Apr 24 20:56:25 2021 +0000
+++ b/data_manager/install_primer_scheme_bedfiles.py	Tue Nov 16 08:22:08 2021 +0000
@@ -18,33 +18,50 @@
 DATA_TABLE_NAME = "primer_scheme_bedfiles"
 
 
-def write_artic_style_bed(input_file, bed_output_filename):
+def convert_and_write_bed(input_file, bed_output_filename, scheme_name, force_string=True):
     with open(bed_output_filename, "w") as bed_output_file:
         for line in input_file:
-            fields = line.split("\t")
-            if len(fields) < 6:
-                # too short to encode the strand format
-                exit("invalid format in BED file: {}".format(line.rstrip()))
+            fields = line.strip().split("\t")
+            if "Midnight" in scheme_name:
+                # Midnight primers are distributed in a tabular file, not a BED file
+                if line.startswith("Primer Name"):
+                    continue
+                if len(fields) != 8:
+                    exit("Unexpected format in Midnight primer file: {}".format(line.rstrip()))
+                (primer_name, _, pool, _, _, _, start, end) = fields
+                strand = '+' if primer_name.endswith('LEFT') else '-'
+                if strand == '-':
+                    start, end = end, start
+                fields = ["MN908947.3", start, end, primer_name, pool, strand]
+            else:
+                if len(fields) < 5:
+                    # too short to encode the "ARTIC style BED" format
+                    exit("invalid format in BED file: {}".format(line.rstrip()))
+            # the 'BED' format used by the ARTIC pipeline has these columns:
+            # chrom  start  end  primer_name  pool_name
+            # see this: https://github.com/artic-network/fieldbioinformatics/blob/master/artic/vcftagprimersites.py#L76
+            # for ARTIC minion and
+            # this: https://github.com/andersen-lab/ivar/blob/master/src/primer_bed.cpp#L125
+            # for ivar trim (ivar trim treats the file as BED following the standard but also allows the ARTIC format)
             try:
-                # try and parse field 5 as a number
-                score = float(fields[4])
+                float(fields[4])
             except ValueError:
-                # Alright, this is an ARTIC-style bed,
-                # which is actually against the specs, but required by the
-                # ARTIC pipeline.
+                # this is a string, we can leave it as is
                 pass
             else:
-                # This is a regular bed with numbers in the score column.
-                # We need to "fix" it for the ARTIC pipeline.
-                fields[4] = '_{0}'.format(score)
-            bed_output_file.write("\t".join(fields))
+                # ensure that it is forced to be a string
+                fields[4] = '_{0}'.format(fields[4])
+            print('\t'.join(fields), file=bed_output_file)
 
 
-def fetch_artic_primers(output_directory, primers):
+def fetch_primers(output_directory, primers):
     primer_sets = {
         "SARS-CoV-2-ARTICv1": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V1/nCoV-2019.bed",
         "SARS-CoV-2-ARTICv2": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V2/nCoV-2019.bed",
         "SARS-CoV-2-ARTICv3": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed",
+        "SARS-CoV-2-ARTICv4": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V4/SARS-CoV-2.scheme.bed",
+        "VarSkip-V1a": "https://raw.githubusercontent.com/nebiolabs/VarSkip/main/schemes/NEB_VarSkip/V1a/NEB_VarSkip.scheme.bed",
+        "Midnight-v1": "https://zenodo.org/record/3897530/files/SARS-CoV-2_primer_sets_RBK004_nanopore_sequencing.tab?download=1"
     }
 
     data = []
@@ -62,19 +79,23 @@
             )
             exit(response.status_code)
         bed_output_filename = os.path.join(output_directory, name + ".bed")
-        write_artic_style_bed(StringIO(response.text), bed_output_filename)
-        description = name[:-2] + " " + name[-2:] + " primer set"
+        convert_and_write_bed(StringIO(response.text), bed_output_filename, name)
+        if 'ARTIC' in name:
+            # split the vX from the rest of the name in ARTIC primer set description
+            description = name[:-2] + " " + name[-2:] + " primer set"
+        else:
+            description = name + " primer set"
         data.append(dict(value=name, path=bed_output_filename, description=description))
     return data
 
 
 def install_primer_file(
-    output_directory, input_filename, primer_name, primer_description
+    output_directory, input_filename, scheme_name, primer_description
 ):
-    name = re.sub(r"\W", "", str(primer_name).replace(" ", "_"))
+    name = re.sub(r"[^\w-]", "", str(scheme_name).replace(" ", "_"))
     output_filename = os.path.join(output_directory, name + ".bed")
     with open(input_filename) as input_file:
-        write_artic_style_bed(input_file, output_filename)
+        convert_and_write_bed(input_file, output_filename, scheme_name)
     data = [dict(value=name, description=primer_description, path=output_filename)]
     return data
 
@@ -86,7 +107,7 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Fetch ARTIC SARS-CoV-2 primer files for Galaxy/IRIDA use"
+        description="Fetch ARTIC, VarSkip and Midnight SARS-CoV-2 primer files for Galaxy/IRIDA use"
     )
     parser.add_argument(
         "--output_directory", default="tmp", help="Directory to write output to"
@@ -146,12 +167,10 @@
 
     data_manager_dict = {}
     data_manager_dict["data_tables"] = config.get("data_tables", {})
-    data_manager_dict["data_tables"][DATA_TABLE_NAME] = data_manager_dict[
-        "data_tables"
-    ].get(DATA_TABLE_NAME, [])
+    data_manager_dict["data_tables"][DATA_TABLE_NAME] = []
 
     if args.artic_primers:
-        data = fetch_artic_primers(output_directory, args.artic_primers)
+        data = fetch_primers(output_directory, args.artic_primers)
     else:
         data = install_primer_file(
             output_directory,
--- a/data_manager/install_primer_scheme_bedfiles.xml	Sat Apr 24 20:56:25 2021 +0000
+++ b/data_manager/install_primer_scheme_bedfiles.xml	Tue Nov 16 08:22:08 2021 +0000
@@ -1,13 +1,33 @@
-<tool id="data_manager_primer_scheme_bedfiles" name="BED-format primer scheme data manager" version="0.0.12" tool_type="manage_data" profile="19.05">
+<tool id="data_manager_primer_scheme_bedfiles" name="BED-format primer scheme data manager" version="0.0.13" tool_type="manage_data" profile="20.09">
     <requirements>
-        <requirement type="package" version="2.24.0">requests</requirement>
+        <requirement type="package" version="2.26.0">requests</requirement>
     </requirements>
     <!-- fetch all the primers in one go -->
     <command detect_errors="exit_code"><![CDATA[
     python '$__tool_directory__/install_primer_scheme_bedfiles.py'
         '${output_file}'
-        #if $input.input_type == "ARTIC"
-            --artic_primers '${input.primers}'
+        #if $input.input_type == "network"
+            ## this code looks up the existing table and uses it to build a list of known primers which
+            ## is then used to filter the $input.primers variable. some notes:
+            ##
+            ## $__app__.tool_data_tables is a dictionary where the keys are data table names and the values 
+            ## are TabularToolDataTable objects (from lib/galaxy/tools/data/__init__.py)
+            ##
+            ## the get_fields() method on the TabularToolDataTable returns a list of lists, with one list
+            ## per line of the tool data table, so row[0] is the first field (i.e. the value column)
+            ##
+            ## $input.primers is a string ('val1,val2') when interpreted outside of #set but a list inside of #set
+            ##
+            ## known_primers is the $known_primers variable but because it is in a list comprehension the $ should
+            ## not be used
+            #set $data_table = $__app__.tool_data_tables.get("primer_scheme_bedfiles")
+            #if $data_table is not None:
+                #set $known_primers = [ row[0] for row in $data_table.get_fields() ]
+                #set $primer_list = ','.join([ primer_name for primer_name in $input.primers if primer_name not in known_primers ])
+            #else
+                #set $primer_list = $input.primers
+            #end if
+            --artic_primers '$primer_list'
         #else 
             --primer_file '${input.primer_input}'
             --primer_name '${input.primer_name}'
@@ -17,20 +37,23 @@
     <inputs>
         <conditional name="input">
             <param name="input_type" label="Choose the source for primer schemes" type="select">
-                <option value="ARTIC" selected="true">ARTIC SARS-CoV-2 Github page</option>
+                <option value="network" selected="true">ARTIC SARS-CoV-2, VarSkip and Midnight web pages</option>
                 <option value="history">History</option>
             </param>
-            <when value="ARTIC">                    
+            <when value="network">
                 <param name="primers" type="select" multiple="true" label="SARS-CoV-2 Primers to fetch">
                     <option value="SARS-CoV-2-ARTICv1" selected="true">SARS-CoV-2 ARTIC v1</option>
                     <option value="SARS-CoV-2-ARTICv2" selected="true">SARS-CoV-2 ARTIC v2</option>
                     <option value="SARS-CoV-2-ARTICv3" selected="true">SARS-CoV-2 ARTIC v3</option>
+                    <option value="SARS-CoV-2-ARTICv4" selected="true">SARS-CoV-2 ARTIC v4</option>
+                    <option value="VarSkip-V1a" selected="true">NEB VarSkip-V1a</option>
+                    <option value="Midnight-v1" selected="true">RAPID/Midnight v1</option>
                 </param>
             </when>
             <when value="history">
                 <param name="primer_input" label="Select history item" type="data" format="bed" />
                 <param name="primer_name" label="Name for this primer scheme" type="text" 
-                       help="Non-word characters will be removed from primer scheme name and spaces replaced with underscore (_)" />
+                       help="Characters that are not word characters or - will be removed from primer scheme name and spaces replaced with underscore (_)" />
                 <param name="primer_description" label="Description for primer scheme" type="text" />
             </when>
         </conditional>
@@ -40,13 +63,9 @@
     </outputs>
     <tests>
         <test>
-            <param name="input_type" value="ARTIC" />
-            <param name="primers" value="SARS-CoV-2-ARTICv1,SARS-CoV-2-ARTICv2,SARS-CoV-2-ARTICv3"/>
-            <output name="output_file">
-                <assert_contents>
-                    <has_text text="ARTIC"/>
-                </assert_contents>
-            </output>        
+            <param name="input_type" value="network" />
+            <param name="primers" value="SARS-CoV-2-ARTICv1,SARS-CoV-2-ARTICv2,SARS-CoV-2-ARTICv3,SARS-CoV-2-ARTICv4,VarSkip-V1a,Midnight-v1"/>
+            <output name="output_file" value="data_manager_primer_scheme_bedfiles.json.template" compare="re_match" />
         </test>
         <test>
             <param name="input_type" value="history" />
@@ -59,15 +78,38 @@
                 </assert_contents>
             </output>        
         </test>
+        <test>
+            <param name="input_type" value="history" />
+            <param name="primer_input" ftype="bed" value="sample1.bed" />
+            <param name="primer_name" value="sample^primer" />
+            <param name="primer_description" value="sample primer scheme" />
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="sampleprimer"/>
+                </assert_contents>
+            </output>        
+        </test>
     </tests>
     <help><![CDATA[
         Amplicon sequencing for viral pathogens using the ARTIC_ pipeline or `PrimalSeq and iVar`_ relies on
         identifying primer locations in a reference sequence using BED format files. This 
         data manager populates a Galaxy tool data table, either from files provided via
-        a history or via the ARTIC_ network Github repository.
+        a history or via the ARTIC_, the New England Biolabs VarSkip_ and the Massey 
+        University Midnight_ web pages.
+
+        If the data manager fails with an error message about downloading some of the primer data files,
+        try visiting the corresponding website to check whether it is functioning correctly. As with anything
+        network-dependent, problems can sometimes happen. A simple solution is to wait some time and
+        run the data manager again.
+
+        Finally, the data manager is designed to not duplicate primer files, so primer schemes that have
+        already been downloaded will be skipped when the network download mode is used.
 
         .. _PrimalSeq and iVar: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1618-7
         .. _ARTIC: https://artic.network/
+        .. _VarSkip: https://github.com/nebiolabs/VarSkip/
+        .. _Midnight: https://zenodo.org/record/3897530#.XwM3ApMzZ0v
+        .. _here: https://gist.github.com/pvanheus/8403813b77e44cbd18e3fb1b3ca1b624
     ]]></help>
     <citations>
       <citation type="doi">10.1186/s13059-018-1618-7</citation>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/data_manager_primer_scheme_bedfiles.json.template	Tue Nov 16 08:22:08 2021 +0000
@@ -0,0 +1,1 @@
+\{"data_tables": \{"primer_scheme_bedfiles": \[\{"description": "SARS-CoV-2-ARTIC v1 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv1.bed", "value": "SARS-CoV-2-ARTICv1"\}, \{"description": "SARS-CoV-2-ARTIC v2 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv2.bed", "value": "SARS-CoV-2-ARTICv2"\}, \{"description": "SARS-CoV-2-ARTIC v3 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv3.bed", "value": "SARS-CoV-2-ARTICv3"\}, \{"description": "SARS-CoV-2-ARTIC v4 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv4.bed", "value": "SARS-CoV-2-ARTICv4"\}, \{"description": "VarSkip-V1a primer set", "path": "[^"]*/VarSkip-V1a.bed", "value": "VarSkip-V1a"\}\, \{"description": "Midnight-v1 primer set", "path": "[^"]*/Midnight-v1.bed", "value": "Midnight-v1"\}\]\}\}
\ No newline at end of file
--- a/test-data/primer_scheme_bedfiles.loc	Sat Apr 24 20:56:25 2021 +0000
+++ b/test-data/primer_scheme_bedfiles.loc	Tue Nov 16 08:22:08 2021 +0000
@@ -6,7 +6,3 @@
 #
 # for example
 # SARS-CoV-2-ARTICv1	SARS-CoV-2 ARTIC v1 primers	/data/galaxy/tool_data/artic_primers/SARS-CoV-2-ARTICv1.bed
-SARS-CoV-2-ARTICv3	SARS-CoV-2-ARTIC v3 primer set	/tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/SARS-CoV-2-ARTICv3.bed
-SARS-CoV-2-ARTICv2	SARS-CoV-2-ARTIC v2 primer set	/tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/SARS-CoV-2-ARTICv2.bed
-SARS-CoV-2-ARTICv1	SARS-CoV-2-ARTIC v1 primer set	/tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/SARS-CoV-2-ARTICv1.bed
-sample_primer	sample primer scheme	/tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/sample_primer.bed