Mercurial > repos > iuc > samtools_view
diff samtools_view.xml @ 16:2dce91e11ca7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/samtools/samtools_view commit e3de8bc1123bf4ce56818f2b7ad4b53080cb3bd8
author | iuc |
---|---|
date | Fri, 30 Aug 2024 10:24:46 +0000 |
parents | 6be888be75f9 |
children | 32dc5f781059 |
line wrap: on
line diff
--- a/samtools_view.xml Mon Nov 20 22:17:43 2023 +0000 +++ b/samtools_view.xml Fri Aug 30 10:24:46 2024 +0000 @@ -1,4 +1,4 @@ -<tool id="samtools_view" name="Samtools view" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> +<tool id="samtools_view" name="Samtools view" version="@TOOL_VERSION@+galaxy3" profile="@PROFILE@"> <description>- reformat, filter, or subsample SAM, BAM or CRAM</description> <macros> <import>macros.xml</import> @@ -136,6 +136,9 @@ #if $mode.filter_config.qname_file: #set std_filters = $std_filters + " --qname-file '%s'" % $mode.filter_config.qname_file #end if + #if str($cond_expr.select_expr) == "yes": + #set std_filters = $std_filters + " -e '%s'" % $cond_expr.expression + #end if #end if #if $with_subsampling: @@ -170,7 +173,6 @@ ## filter options (except regions filter, which is the last parameter) $std_filters - #if $with_subsampling: --subsample-seed $seed #if str($mode.subsample_config.subsampling_mode.select_subsample) == "target": @@ -300,6 +302,24 @@ <param name="rgfile" type="data" format="tabular" argument="-R" label="Filter by read groups in file" help="Output alignments in read groups listed in FILE." 
/> </when> </conditional> + <conditional name="cond_expr"> + <param name="select_expr" type="select" label="Filter by expression"> + <option value="no" selected="True">No</option> + <option value="yes">Filter using an expression (see manual)</option> + </param> + <when value="no"/> + <when value="yes"> + <param name="expression" type="text" argument="-e" label="Filter by expression - for example sclen>0 will filter all soft clipped reads" help="See Samtools manual for Filter expression syntax"> + <sanitizer invalid_char=""> + <valid initial="string.printable"> + <remove value=" "/> + <remove value="'"/> + <remove value='"'/> + </valid> + </sanitizer> + </param> + </when> + </conditional> <param name="quality" type="integer" argument="-q" optional="true" min="0" label="Filter by quality" help="Skip alignments with MAPQ smaller than INT." /> <param name="library" type="text" argument="-l" optional="true" label="Filter by library" help="Only output alignments in library STR" /> <param name="cigarcons" type="integer" argument="-m" optional="true" min="0" label="Filter by number of CIGAR bases consuming query sequence" help="Only output alignments with number of CIGAR bases consuming query sequence greater than or equal INT." 
/> @@ -576,7 +596,7 @@ <param name="addref_select" value="history" /> <param name="ref" value="test.fa" /> </conditional> - <output name="outputsam" file="test_15.cram" ftype="cram" compare="sim_size" delta="250" /> + <output name="outputsam" file="test_15.cram" ftype="cram" compare="sim_size" delta="500" /> </test> <!-- 16) --> <test expect_num_outputs="1"> @@ -908,6 +928,50 @@ </assert_command> <output name="outputsam" file="test_31.bam" ftype="bam" lines_diff="2" /> </test> + <!-- 32) testing expression filters --> + <test expect_num_outputs="1"> + <param name="input" value="in_test_30.bam" ftype="bam"/> + <conditional name="mode"> + <param name="outtype" value="selected_reads" /> + <section name="filter_config"> + <conditional name="cond_expr"> + <param name="select_expr" value="yes"/> + <param name="expression" value="sclen>0"/> + </conditional> + </section> + <conditional name="output_options"> + <conditional name="output_format"> + <param name="oformat" value="bam" /> + </conditional> + </conditional> + </conditional> + <assert_command> + <has_text text="-e 'sclen>0'"/> + </assert_command> + <output name="outputsam" file="test_32.bam" ftype="bam" lines_diff="2" /> + </test> + <!-- 33) testing expression filters --> + <test expect_num_outputs="1"> + <param name="input" value="in_test_30.bam" ftype="bam"/> + <conditional name="mode"> + <param name="outtype" value="selected_reads" /> + <section name="filter_config"> + <conditional name="cond_expr"> + <param name="select_expr" value="yes"/> + <param name="expression" value='rname!="chr13"'/> + </conditional> + </section> + <conditional name="output_options"> + <conditional name="output_format"> + <param name="oformat" value="bam" /> + </conditional> + </conditional> + </conditional> + <assert_command> + <has_text text="-e 'rname!="/> + </assert_command> + <output name="outputsam" file="test_33.bam" ftype="bam" lines_diff="2" /> + </test> </tests> <help> **What it does** @@ -991,12 +1055,143 @@ This filters based 
on the MAPQ column of the SAM format which gives an estimate about the correct placement of the alignment. Note that aligners do not follow a consistent definition. -## Filtering by Tag ** +**Filtering by Tag** This filter allows to select reads based on tool or user specific tags, e.g., XS:i:-18 the alignment score tag of bowtie. Thus to filter for a specific value of the tag you need the format STR1:STR2, e.g., XS:-18 to filter reads with an alignment score of -18. You can also just write STR1 without the value STR2 hence the filter selects all reads with the tag STR1, e.g., XS. +**Filtering by Expression** + + +Filter expressions are used as an on-the-fly checking of incoming SAM, BAM or CRAM records, discarding records that do not match the specified expression. + +The language used is primarily C style, but with a few differences in the precedence rules for bit operators and the inclusion of regular expression +matching. + +The operator precedence, from strongest binding to weakest, is + +:: + + Grouping (, ) E.g. "(1+2)*3" + Values: literals, vars Numbers, strings and variables + Unary ops: +, -, !, ~ E.g. -10 +10, !10 (not), ~5 (bit not) + Math ops: *, /, % Multiply, division and (integer) modulo + Math ops: +, - Addition / subtraction + Bit-wise: & Integer AND + Bit-wise ^ Integer XOR + Bit-wise | Integer OR + Conditionals: >, >=, <, <= + Equality: ==, !=, =~, !~ =~ and !~ match regular expressions + Boolean: &&, || Logical AND / OR + + +Expressions are computed using floating point mathematics, so "10 / 4" evaluates to 2.5 rather than 2. They may be written as integers in decimal or +"0x" plus hexadecimal, and floating point with or without exponents. However, operations that require integers first do an implicit type conversion, so +"7.9 % 5" is 2 and "7.9 & 4.1" is equivalent to "7 & 4", which is 4. Strings are always specified using double quotes. To get a double quote in a +string, use backslash. 
Similarly a double backslash is used to get a literal backslash. For example ab\"c\\d is the string ab"c\d. + +Comparison operators are evaluated as a match being 1 and a mismatch being 0, thus "(2 > 1) + (3 < 5)" evaluates as 2. All comparisons involving undefined (null) values are deemed to be false. + +The variables are where the file format specifics are accessed from the expression. The variables correspond to SAM fields, for example to find paired +alignments with high mapping quality and a very large insert size, we may use the expression "mapq >= 30 && (tlen >= 100000 || tlen <= -100000)". Valid +variable names and their data types are: + +:: + + endpos int Alignment end position (1-based) + flag int Combined FLAG field + flag.paired int Single bit, 0 or 1 + flag.proper_pair int Single bit, 0 or 2 + flag.unmap int Single bit, 0 or 4 + flag.munmap int Single bit, 0 or 8 + flag.reverse int Single bit, 0 or 16 + flag.mreverse int Single bit, 0 or 32 + flag.read1 int Single bit, 0 or 64 + flag.read2 int Single bit, 0 or 128 + flag.secondary int Single bit, 0 or 256 + flag.qcfail int Single bit, 0 or 512 + flag.dup int Single bit, 0 or 1024 + flag.supplementary int Single bit, 0 or 2048 + hclen int Number of hard-clipped bases + library string Library (LB header via RG) + mapq int Mapping quality + mpos int Synonym for pnext + mrefid int Mate reference number (0 based) + mrname string Synonym for rnext + ncigar int Number of cigar operations + pnext int Mate's alignment position (1-based) + pos int Alignment position (1-based) + qlen int Alignment length: no. query bases + qname string Query name + qual string Quality values (raw, 0 based) + refid int Integer reference number (0 based) + rlen int Alignment length: no. 
reference bases + rname string Reference name + rnext string Mate's reference name + sclen int Number of soft-clipped bases + seq string Sequence + tlen int Template length (insert size) + [XX] int / string XX tag value + + +Flags are returned either as the whole flag value or by checking for a single bit. Hence the filter expression flag.dup is equivalent to flag & 1024. + +"qlen" and "rlen" are measured using the CIGAR string to count the number of query (sequence) and reference bases consumed. Note "qlen" may not exactly +match the length of the "seq" field if the sequence is "*". + +"sclen" and "hclen" are the number of soft and hard-clipped bases respectively. The formula "qlen-sclen" gives the number of sequence bases used in the +alignment, distinguishing between global alignment and local alignment length. + +"endpos" is the (1-based inclusive) position of the rightmost mapped base of the read, as measured using the CIGAR string, and for mapped reads is +equivalent to "pos+rlen-1". For unmapped reads, it is the same as "pos". + +Reference names may be matched either by their string forms ("rname" and "mrname") or as the Nth @SQ line (counting from zero) as stored in BAM using +"refid" and "mrefid" respectively. + +Auxiliary tags are described in square brackets and these expand to either integer or string as defined by the tag itself (XX:Z:string or XX:i:int). +For example [NM]>=10 can be used to look for alignments with many mismatches and [RG]=~"grp[ABC]-" will match the read-group string. + +If no comparison is used with an auxiliary tag it is taken simply to be a test for the existence of that tag. So [NM] will return any record containing +an NM tag, even if that tag is zero (NM:i:0). In htslib <= 1.15 negating this with ![NM] gave misleading results as it was true if the tag did not exist +or did exist but was zero. Now this is strictly does-not-exist. An explicit exists([NM]) and !exists([NM]) function has also been added to make +this intention clear. 
+ +Similarly in htslib <= 1.15 using [NM]!=0 was true both when the tag existed and was not zero as well as when the tag did not exist. From 1.16 onwards +all comparison operators are only true for tags that exist, so [NM]!=0 works as expected. + +Some simple functions are available to operate on strings. These treat the strings as arrays of bytes, permitting their length, minimum, maximum and +average values to be computed. These are useful for processing Quality Scores. + +:: + + length(x) Length of the string (excluding nul char) + min(x) Minimum byte value in the string + max(x) Maximum byte value in the string + avg(x) Average byte value in the string + + +Note that "avg" is a floating point value and it may be NAN for empty strings. This means that "avg(qual)" does not produce an error for records that +have both seq and qual of "*". NAN values will fail any conditional checks, so e.g. "avg(qual) > 20" works and will not report these records. NAN also +fails all equality, < and > comparisons, and returns zero when given as an argument to the exists function. It can be negated with !x in which case it +becomes true. + +Functions that operate on both strings and numerics: + +:: + + exists(x) True if the value exists (or is explicitly true). + default(x,d) Value x if it exists or d if not. + +Functions that apply only to numeric values: + +:: + + sqrt(x) Square root of x + log(x) Natural logarithm of x + pow(x, y) Power function, x to the power of y + exp(x) Base-e exponential, equivalent to pow(e,x) + </help> <expand macro="citations"/> </tool>