Mercurial > repos > iuc > data_manager_primer_scheme_bedfiles
changeset 6:dd451e45681c draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_primer_scheme_bedfiles commit 4880dcfcdddd9ed8415ccde01b2f8e2c28dab5c3"
author | iuc |
---|---|
date | Tue, 16 Nov 2021 08:22:08 +0000 |
parents | be70da9dc013 |
children | |
files | data_manager/install_primer_scheme_bedfiles.py data_manager/install_primer_scheme_bedfiles.xml test-data/data_manager_primer_scheme_bedfiles.json.template test-data/primer_scheme_bedfiles.loc |
diffstat | 4 files changed, 102 insertions(+), 44 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/install_primer_scheme_bedfiles.py Sat Apr 24 20:56:25 2021 +0000 +++ b/data_manager/install_primer_scheme_bedfiles.py Tue Nov 16 08:22:08 2021 +0000 @@ -18,33 +18,50 @@ DATA_TABLE_NAME = "primer_scheme_bedfiles" -def write_artic_style_bed(input_file, bed_output_filename): +def convert_and_write_bed(input_file, bed_output_filename, scheme_name, force_string=True): with open(bed_output_filename, "w") as bed_output_file: for line in input_file: - fields = line.split("\t") - if len(fields) < 6: - # too short to encode the strand format - exit("invalid format in BED file: {}".format(line.rstrip())) + fields = line.strip().split("\t") + if "Midnight" in scheme_name: + # Midnight primers are distributed in a tabular file, not a BED file + if line.startswith("Primer Name"): + continue + if len(fields) != 8: + exit("Unexpected format in Midnight primer file: {}".format(line.rstrip())) + (primer_name, _, pool, _, _, _, start, end) = fields + strand = '+' if primer_name.endswith('LEFT') else '-' + if strand == '-': + start, end = end, start + fields = ["MN908947.3", start, end, primer_name, pool, strand] + else: + if len(fields) < 5: + # too short to encode the "ARTIC style BED" format + exit("invalid format in BED file: {}".format(line.rstrip())) + # 'BED' format used by ARTIC pipeline uses + # chrom start end primer_name pool_name + # see this: https://github.com/artic-network/fieldbioinformatics/blob/master/artic/vcftagprimersites.py#L76 + # for ARTIC minion and + # this: https://github.com/andersen-lab/ivar/blob/master/src/primer_bed.cpp#L125 + # for ivar trim (ivar trim treats the file as BED following the standard but also allows the ARTIC format) try: - # try and parse field 5 as a number - score = float(fields[4]) + float(fields[4]) except ValueError: - # Alright, this is an ARTIC-style bed, - # which is actually against the specs, but required by the - # ARTIC pipeline. 
+ # this is a string, we can leave it as is pass else: - # This is a regular bed with numbers in the score column. - # We need to "fix" it for the ARTIC pipeline. - fields[4] = '_{0}'.format(score) - bed_output_file.write("\t".join(fields)) + # ensure that it is forced to be a string + fields[4] = '_{0}'.format(fields[4]) + print('\t'.join(fields), file=bed_output_file) -def fetch_artic_primers(output_directory, primers): +def fetch_primers(output_directory, primers): primer_sets = { "SARS-CoV-2-ARTICv1": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V1/nCoV-2019.bed", "SARS-CoV-2-ARTICv2": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V2/nCoV-2019.bed", "SARS-CoV-2-ARTICv3": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed", + "SARS-CoV-2-ARTICv4": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V4/SARS-CoV-2.scheme.bed", + "VarSkip-V1a": "https://raw.githubusercontent.com/nebiolabs/VarSkip/main/schemes/NEB_VarSkip/V1a/NEB_VarSkip.scheme.bed", + "Midnight-v1": "https://zenodo.org/record/3897530/files/SARS-CoV-2_primer_sets_RBK004_nanopore_sequencing.tab?download=1" } data = [] @@ -62,19 +79,23 @@ ) exit(response.status_code) bed_output_filename = os.path.join(output_directory, name + ".bed") - write_artic_style_bed(StringIO(response.text), bed_output_filename) - description = name[:-2] + " " + name[-2:] + " primer set" + convert_and_write_bed(StringIO(response.text), bed_output_filename, name) + if 'ARTIC' in name: + # split the vX from the rest of the name in ARTIC primer set description + description = name[:-2] + " " + name[-2:] + " primer set" + else: + description = name + " primer set" data.append(dict(value=name, path=bed_output_filename, description=description)) return data def install_primer_file( - output_directory, input_filename, primer_name, 
primer_description + output_directory, input_filename, scheme_name, primer_description ): - name = re.sub(r"\W", "", str(primer_name).replace(" ", "_")) + name = re.sub(r"[^\w-]", "", str(scheme_name).replace(" ", "_")) output_filename = os.path.join(output_directory, name + ".bed") with open(input_filename) as input_file: - write_artic_style_bed(input_file, output_filename) + convert_and_write_bed(input_file, output_filename, scheme_name) data = [dict(value=name, description=primer_description, path=output_filename)] return data @@ -86,7 +107,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Fetch ARTIC SARS-CoV-2 primer files for Galaxy/IRIDA use" + description="Fetch ARTIC, VarSkip and Midnight SARS-CoV-2 primer files for Galaxy/IRIDA use" ) parser.add_argument( "--output_directory", default="tmp", help="Directory to write output to" @@ -146,12 +167,10 @@ data_manager_dict = {} data_manager_dict["data_tables"] = config.get("data_tables", {}) - data_manager_dict["data_tables"][DATA_TABLE_NAME] = data_manager_dict[ - "data_tables" - ].get(DATA_TABLE_NAME, []) + data_manager_dict["data_tables"][DATA_TABLE_NAME] = [] if args.artic_primers: - data = fetch_artic_primers(output_directory, args.artic_primers) + data = fetch_primers(output_directory, args.artic_primers) else: data = install_primer_file( output_directory,
--- a/data_manager/install_primer_scheme_bedfiles.xml Sat Apr 24 20:56:25 2021 +0000 +++ b/data_manager/install_primer_scheme_bedfiles.xml Tue Nov 16 08:22:08 2021 +0000 @@ -1,13 +1,33 @@ -<tool id="data_manager_primer_scheme_bedfiles" name="BED-format primer scheme data manager" version="0.0.12" tool_type="manage_data" profile="19.05"> +<tool id="data_manager_primer_scheme_bedfiles" name="BED-format primer scheme data manager" version="0.0.13" tool_type="manage_data" profile="20.09"> <requirements> - <requirement type="package" version="2.24.0">requests</requirement> + <requirement type="package" version="2.26.0">requests</requirement> </requirements> <!-- fetch all the primers in one go --> <command detect_errors="exit_code"><![CDATA[ python '$__tool_directory__/install_primer_scheme_bedfiles.py' '${output_file}' - #if $input.input_type == "ARTIC" - --artic_primers '${input.primers}' + #if $input.input_type == "network" + ## this code looks up the existing table and uses it to build a list of known primers which + ## is then used to filter the $input.primers variable. some notes: + ## + ## $__app__.tool_data_tables is a dictionary where the keys are data table names and the values + ## are TabularToolDataTable objects (from lib/galaxy/tools/data/__init__.py) + ## + ## the get_fields() method on the TabularToolDataTable returns a list of lists, with one list + ## per line of the tool data table, so row[0] is the first field (i.e. 
the value column) + ## + ## $input.primers is a string ('val1,val2') when interpreted outside of #set but a list inside of #set + ## + ## known_primers is the $known_primers variable but because it is in a list comprehension the $ should + ## not be used + #set $data_table = $__app__.tool_data_tables.get("primer_scheme_bedfiles") + #if $data_table is not None: + #set $known_primers = [ row[0] for row in $data_table.get_fields() ] + #set $primer_list = ','.join([ primer_name for primer_name in $input.primers if primer_name not in known_primers ]) + #else + #set $primer_list = $input.primers + #end if + --artic_primers '$primer_list' #else --primer_file '${input.primer_input}' --primer_name '${input.primer_name}' @@ -17,20 +37,23 @@ <inputs> <conditional name="input"> <param name="input_type" label="Choose the source for primer schemes" type="select"> - <option value="ARTIC" selected="true">ARTIC SARS-CoV-2 Github page</option> + <option value="network" selected="true">ARTIC SARS-CoV-2, VarSkip and Midnight web pages</option> <option value="history">History</option> </param> - <when value="ARTIC"> + <when value="network"> <param name="primers" type="select" multiple="true" label="SARS-CoV-2 Primers to fetch"> <option value="SARS-CoV-2-ARTICv1" selected="true">SARS-CoV-2 ARTIC v1</option> <option value="SARS-CoV-2-ARTICv2" selected="true">SARS-CoV-2 ARTIC v2</option> <option value="SARS-CoV-2-ARTICv3" selected="true">SARS-CoV-2 ARTIC v3</option> + <option value="SARS-CoV-2-ARTICv4" selected="true">SARS-CoV-2 ARTIC v4</option> + <option value="VarSkip-V1a" selected="true">NEB VarSkip-V1a</option> + <option value="Midnight-v1" selected="true">RAPID/Midnight v1</option> </param> </when> <when value="history"> <param name="primer_input" label="Select history item" type="data" format="bed" /> <param name="primer_name" label="Name for this primer scheme" type="text" - help="Non-word characters will be removed from primer scheme name and spaces replaced with underscore (_)" 
/> + help="Characters that are not word characters or - will be removed from primer scheme name and spaces replaced with underscore (_)" /> <param name="primer_description" label="Description for primer scheme" type="text" /> </when> </conditional> @@ -40,13 +63,9 @@ </outputs> <tests> <test> - <param name="input_type" value="ARTIC" /> - <param name="primers" value="SARS-CoV-2-ARTICv1,SARS-CoV-2-ARTICv2,SARS-CoV-2-ARTICv3"/> - <output name="output_file"> - <assert_contents> - <has_text text="ARTIC"/> - </assert_contents> - </output> + <param name="input_type" value="network" /> + <param name="primers" value="SARS-CoV-2-ARTICv1,SARS-CoV-2-ARTICv2,SARS-CoV-2-ARTICv3,SARS-CoV-2-ARTICv4,VarSkip-V1a,Midnight-v1"/> + <output name="output_file" value="data_manager_primer_scheme_bedfiles.json.template" compare="re_match" /> </test> <test> <param name="input_type" value="history" /> @@ -59,15 +78,38 @@ </assert_contents> </output> </test> + <test> + <param name="input_type" value="history" /> + <param name="primer_input" ftype="bed" value="sample1.bed" /> + <param name="primer_name" value="sample^primer" /> + <param name="primer_description" value="sample primer scheme" /> + <output name="output_file"> + <assert_contents> + <has_text text="sampleprimer"/> + </assert_contents> + </output> + </test> </tests> <help><![CDATA[ Amplicon sequencing for viral pathogens using the ARTIC_ pipeline or `PrimalSeq and iVar`_ relies on identifying primer locations in a reference sequence using BED format files. This data manager populates a Galaxy tool data table, either from files provided via - a history or via the ARTIC_ network Github repository. + a history or via the ARTIC_, the New England Biolabs VarSkip_ and the Massey + University Midnight_ web pages. + + If the data manager fails with an error message about downloading some of the primer data files, + try and visit the corresponding website to see if it is functioning correctly. 
As with anything + network-dependent, problems can sometimes happen. A simple solution is to wait some time and + run the data manager again. + + Finally, the data manager is designed to not duplicate primer files, so primer schemes that have + already been downloaded will be skipped when the install from network download mode is used. .. _PrimalSeq and iVar: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1618-7 .. _ARTIC: https://artic.network/ + .. _VarSkip: https://github.com/nebiolabs/VarSkip/ + .. _Midnight: https://zenodo.org/record/3897530#.XwM3ApMzZ0v + .. _here: https://gist.github.com/pvanheus/8403813b77e44cbd18e3fb1b3ca1b624 ]]></help> <citations> <citation type="doi">10.1186/s13059-018-1618-7</citation>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/data_manager_primer_scheme_bedfiles.json.template Tue Nov 16 08:22:08 2021 +0000 @@ -0,0 +1,1 @@ +\{"data_tables": \{"primer_scheme_bedfiles": \[\{"description": "SARS-CoV-2-ARTIC v1 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv1.bed", "value": "SARS-CoV-2-ARTICv1"\}, \{"description": "SARS-CoV-2-ARTIC v2 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv2.bed", "value": "SARS-CoV-2-ARTICv2"\}, \{"description": "SARS-CoV-2-ARTIC v3 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv3.bed", "value": "SARS-CoV-2-ARTICv3"\}, \{"description": "SARS-CoV-2-ARTIC v4 primer set", "path": "[^"]*/SARS-CoV-2-ARTICv4.bed", "value": "SARS-CoV-2-ARTICv4"\}, \{"description": "VarSkip-V1a primer set", "path": "[^"]*/VarSkip-V1a.bed", "value": "VarSkip-V1a"\}\, \{"description": "Midnight-v1 primer set", "path": "[^"]*/Midnight-v1.bed", "value": "Midnight-v1"\}\]\}\} \ No newline at end of file
--- a/test-data/primer_scheme_bedfiles.loc Sat Apr 24 20:56:25 2021 +0000 +++ b/test-data/primer_scheme_bedfiles.loc Tue Nov 16 08:22:08 2021 +0000 @@ -6,7 +6,3 @@ # # for example # SARS-CoV-2-ARTICv1 SARS-CoV-2 ARTIC v1 primers /data/galaxy/tool_data/artic_primers/SARS-CoV-2-ARTICv1.bed -SARS-CoV-2-ARTICv3 SARS-CoV-2-ARTIC v3 primer set /tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/SARS-CoV-2-ARTICv3.bed -SARS-CoV-2-ARTICv2 SARS-CoV-2-ARTIC v2 primer set /tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/SARS-CoV-2-ARTICv2.bed -SARS-CoV-2-ARTICv1 SARS-CoV-2-ARTIC v1 primer set /tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/SARS-CoV-2-ARTICv1.bed -sample_primer sample primer scheme /tmp/tmpYMFYgd/tmpdHhY2S/tmppwFSVU/database/data_manager_tool-dataI2hi9i/primer_scheme_bedfiles/sample_primer.bed