Mercurial > repos > iuc > ivar_removereads

--- a/ivar_removereads.xml	Thu Aug 05 12:44:59 2021 +0000
+++ b/ivar_removereads.xml	Fri Aug 20 20:34:11 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="ivar_removereads" name="ivar removereads" version="@VERSION@+galaxy1">
+<tool id="ivar_removereads" name="ivar removereads" version="@VERSION@+galaxy2">
     <description>Remove reads from trimmed BAM file</description>
     <macros>
         <import>macros.xml</import>
@@ -38,7 +38,7 @@
         <param name="input_bed" argument="-b" type="data" format="bed" label="Primer binding sites information"
         help="The same six-column BED dataset that served as input to ivar trim"/>
         <conditional name="amplicons">
-            <param name="computed" type="select" label="Compute amplicon info from BED file" help="Compute the amplicon info file from the primer BED file">
+            <param name="computed" type="select" label="Compute amplicon info from BED file" help="For suitable primer binding site datasets amplicon info can be computed directly (see tool help below). For others you will need to provide an extra amplicon info dataset.">
                 <option value="yes" selected="true">Yes</option>
                 <option value="no">No</option>
             </param>
@@ -86,6 +86,23 @@
 From this input it will remove reads that come from amplicons that have been
 generated with one or more primers that may have been affected in their binding
 by variants listed in the variants input file.
+To do its job, the needs to know which primers work together to form an
+amplicon. The tool can try to deduce this info from the names of the primers
+found in the primer info dataset. This will require a primer naming scheme
+following the regex pattern::
+
+  .*_(?P<amplicon_number>\d+).*_(?P<primer_orientation>L(?:EFT)?|R(?:IGHT)?)
+
+*i.e.*, the following schemes will work (and get parsed as):
+
+- ``nCoV-2019_1_LEFT`` (forward primer of amplicon 1)
+- ``400_2_out_R`` (reverse primer of amplicon 2)
+- ``QIAseq_163-2_LEFT`` (forward primer of amplicon 163)
+
+Alternatively, you can specify the amplicon information explicitly through a
+dataset that lists the names of primers that together form any given amplicon.
+In it, primer names (exactly matching those in the primer info dataset) need to
+be TAB-separated with one line per amplicon.

 .. class:: Warning mark
--- a/write_amplicon_info_file.py	Thu Aug 05 12:44:59 2021 +0000
+++ b/write_amplicon_info_file.py	Fri Aug 20 20:34:11 2021 +0000
@@ -3,48 +3,49 @@
 import argparse
 import re

-AMPLICON_NAME_RE = r'.*_(?P<num>\d+)_[^0-9]*(?P<name>L(?:EFT)?|R(?:IGHT)?)'

-
-def primer_info_to_position(name):
-    position = 0
-    re_match = re.match(AMPLICON_NAME_RE, name)
-    if re_match is None:
-        raise ValueError("{} does not match expected amplicon name format".format(name))
-    side = re_match.group('name')
-    num = re_match.group('num')
-    if side == 'RIGHT' or side == 'R':
-        position += 1000
-    if num is not None:
-        position += int(num)
-    return position
+AMPLICON_PAT = re.compile(r'.*_(?P<num>\d+).*_(?P<name>L(?:EFT)?|R(?:IGHT)?)')


 def write_amplicon_info_file(bed_file, amplicon_info_file):
     amplicon_sets = {}
-    amplicon_ids = set()
     for line in bed_file:
         fields = line.strip().split('\t')
+        start = int(fields[1])
         name = fields[3]
-        re_match = re.match(AMPLICON_NAME_RE, name)
+        re_match = AMPLICON_PAT.match(name)
         if re_match is None:
-            raise ValueError("{} does not match expected amplicon name format".format(name))
+            raise ValueError(
+                '{} does not match expected amplicon name format'.format(name)
+            )
         amplicon_id = int(re_match.group('num'))
         amplicon_set = amplicon_sets.get(amplicon_id, [])
-        amplicon_set.append(name)
+        amplicon_set.append((name, start))
         amplicon_sets[amplicon_id] = amplicon_set
-        amplicon_ids.add(amplicon_id)

-    for id in sorted(list(amplicon_ids)):
-        amplicon_info = '\t'.join([name for name in sorted(amplicon_sets[id], key=primer_info_to_position)]) + '\n'
+    # write amplicons sorted by number with primers sorted by start position
+    for id in sorted(amplicon_sets):
+        amplicon_info = '\t'.join(
+            [name for name, start in sorted(
+                amplicon_sets[id], key=lambda x: x[1]
+            )]
+        ) + '\n'
         amplicon_info_file.write(amplicon_info)
     amplicon_info_file.close()


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Write an amplicon info file for iVar from a BED file describing primer positions')
-    parser.add_argument('bed_file', type=argparse.FileType(), help='Primer BED file')
-    parser.add_argument('amplicon_info_file', type=argparse.FileType('w'), help='Output file: amplicon info file in TSV format')
+    parser = argparse.ArgumentParser(
+        description='Write an amplicon info file for iVar '
+                    'from a BED file describing primer positions'
+    )
+    parser.add_argument(
+        'bed_file', type=argparse.FileType(), help='Primer BED file'
+    )
+    parser.add_argument(
+        'amplicon_info_file', type=argparse.FileType('w'),
+        help='Output file: amplicon info file in TSV format'
+    )
     args = parser.parse_args()

     write_amplicon_info_file(args.bed_file, args.amplicon_info_file)