repository: petr-novak/various_galaxy_tools
changeset:  0:696e702ebf74 (draft)
summary:    "planemo upload commit 0f6eca49bafc3c946189d793161a7f81d595e1a1-dirty"
author:     petr-novak
date:       Mon, 09 May 2022 08:26:30 +0000
parents:    (none)
children:   639c0edb7e64
files:      README.md copy_files.sh extract_GFF_Features.py extract_GFF_Features.xml gff2bed.xml gff2tabular.R gff2tabular.xml gff_filter_by_attribute.py gff_filter_by_attribute.xml gff_to_bed_converter.py
diffstat:   10 files changed, 919 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,9 @@
+# Galaxy tools
+Tools which were copied and adapted from the https://github.com/galaxyproject/galaxy repository. The tools are available in the Galaxy ToolShed in my private repository.
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/copy_files.sh	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,8 @@
+#!/bin/bash
+wget https://github.com/galaxyproject/galaxy/raw/dev/tools/filters/gff/gff_filter_by_attribute.py
+wget https://raw.githubusercontent.com/galaxyproject/galaxy/dev/tools/filters/gff/gff_filter_by_attribute.xml
+wget https://github.com/galaxyproject/galaxy/raw/dev/tools/filters/gff/extract_GFF_Features.py
+wget https://github.com/galaxyproject/galaxy/raw/dev/tools/filters/gff/extract_GFF_Features.xml
+wget https://github.com/galaxyproject/galaxy/raw/dev/tools/filters/gff2bed.xml
+wget https://github.com/galaxyproject/galaxy/raw/dev/tools/filters/gff_to_bed_converter.py
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_GFF_Features.py	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# Guruprasad Ananda
+"""
+Extract features from GFF file.
+
+usage: %prog input1 out_file1 column features
+"""
+from __future__ import print_function
+
+import sys
+
+from bx.cookbook import doc_optparse
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def main():
+    # Parsing Command Line here
+    options, args = doc_optparse.parse(__doc__)
+
+    try:
+        inp_file, out_file, column, features = args
+    except ValueError:
+        stop_err("One or more arguments are missing or invalid.\nUsage: prog input output column features")
+    try:
+        column = int(column)
+    except ValueError:
+        stop_err("Column %s is an invalid column." % column)
+
+    if features is None:
+        stop_err("Column %d has no features to display, select another column." % (column + 1))
+
+    fo = open(out_file, "w")
+    for line in open(inp_file):
+        line = line.rstrip("\r\n")
+        if line and line.startswith("#"):
+            # Keep valid comment lines in the output
+            fo.write("%s\n" % line)
+        else:
+            try:
+                if line.split("\t")[column] in features.split(","):
+                    fo.write("%s\n" % line)
+            except Exception:
+                pass
+    fo.close()

+    print("Column %d features: %s" % (column + 1, features))
+
+
+if __name__ == "__main__":
+    main()
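
The script above reduces to a single column-membership test. A minimal standalone sketch of the same logic, assuming hypothetical file names and feature values (the real tool takes them from the command line)::

    # Keep GFF lines whose chosen column matches one of the selected values;
    # comment lines pass through, short/invalid lines are dropped.
    features = {"promoter", "exon"}  # hypothetical selection for column 3 (index 2)
    with open("input.gff") as src, open("kept.gff", "w") as dst:
        for line in src:
            line = line.rstrip("\r\n")
            fields = line.split("\t")
            if line.startswith("#") or (len(fields) > 2 and fields[2] in features):
                dst.write(line + "\n")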
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_GFF_Features.xml	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,114 @@
+<tool id="Extract_features1" name="Extract features" version="1.0.0">
+  <description>from GFF data</description>
+  <command interpreter="python">extract_GFF_Features.py $input1 $out_file1 ${column_choice.col} ${column_choice.feature}</command>
+  <inputs>
+    <param format="gff" name="input1" type="data" label="Select GFF data"/>
+    <conditional name="column_choice">
+      <param name="col" type="select" label="From">
+        <option value="0" selected="true">Column 1 / Sequence name</option>
+        <option value="1">Column 2 / Source</option>
+        <option value="2">Column 3 / Feature</option>
+        <option value="6">Column 7 / Strand</option>
+        <option value="7">Column 8 / Frame</option>
+      </param>
+      <when value="0">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple values">
+          <options from_dataset="input1">
+            <column name="name" index="0"/>
+            <column name="value" index="0"/>
+            <filter type="unique_value" name="unique" column="0"/>
+          </options>
+        </param>
+      </when>
+      <when value="1">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple values">
+          <options from_dataset="input1">
+            <column name="name" index="1"/>
+            <column name="value" index="1"/>
+            <filter type="unique_value" name="unique" column="1"/>
+          </options>
+        </param>
+      </when>
+      <when value="2">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple values">
+          <options from_dataset="input1">
+            <column name="name" index="2"/>
+            <column name="value" index="2"/>
+            <filter type="unique_value" name="unique" column="2"/>
+          </options>
+        </param>
+      </when>
+      <when value="6">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple values">
+          <options from_dataset="input1">
+            <column name="name" index="6"/>
+            <column name="value" index="6"/>
+            <filter type="unique_value" name="unique" column="6"/>
+          </options>
+        </param>
+      </when>
+      <when value="7">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple values">
+          <options from_dataset="input1">
+            <column name="name" index="7"/>
+            <column name="value" index="7"/>
+            <filter type="unique_value" name="unique" column="7"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="5.gff"/>
+      <param name="col" value="0" />
+      <param name="feature" value="chr5,chr6,chr7,chr8" />
+      <output name="out_file1" file="Extract_features1_out.gff"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool extracts selected features from GFF data.
+
+-----
+
+**Example**
+
+Selecting **promoter** from the following GFF data::
+
+    chr22  GeneA  enhancer  10000000  10001000  500  +  .  TGA
+    chr22  GeneA  promoter  10010000  10010100  900  +  .  TGA
+    chr22  GeneB  promoter  10020000  10025000  400  -  .  TGB
+    chr22  GeneB  CCDS2220  10030000  10065000  800  -  .  TGB
+
+will produce the following output::
+
+    chr22  GeneA  promoter  10010000  10010100  900  +  .  TGA
+    chr22  GeneB  promoter  10020000  10025000  400  -  .  TGB
+
+----
+
+.. class:: infomark
+
+**About formats**
+
+**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - All lines with the same group are linked together into a single item.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff2bed.xml	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,90 @@
+<tool id="gff2bed1" name="GFF-to-BED" version="1.0.1">
+  <description>converter</description>
+  <edam_operations>
+    <edam_operation>operation_3434</edam_operation>
+  </edam_operations>
+  <command interpreter="python">gff_to_bed_converter.py $input $out_file1</command>
+  <inputs>
+    <param format="gff" name="input" type="data" label="Convert this dataset"/>
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="5.gff" ftype="gff"/>
+      <output name="out_file1" file="gff2bed_out.bed"/>
+    </test>
+    <test>
+      <param name="input" value="gff2bed_in2.gff" ftype="gff"/>
+      <output name="out_file1" file="gff2bed_out2.bed"/>
+    </test>
+    <test>
+      <!-- Test conversion of gff3 file. -->
+      <param name="input" value="5.gff3" ftype="gff"/>
+      <output name="out_file1" file="gff2bed_out3.bed"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts data from GFF format to BED format (scroll down for format description).
+
+--------
+
+**Example**
+
+The following data in GFF format::
+
+    chr22  GeneA  enhancer  10000000  10001000  500  +  .  TGA
+    chr22  GeneA  promoter  10010000  10010100  900  +  .  TGA
+
+will be converted to BED (**note** that 1 is subtracted from the start coordinate)::
+
+    chr22  9999999   10001000  enhancer  0  +
+    chr22  10009999  10010100  promoter  0  +
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+    10. blockCount - The number of blocks (exons) in the BED line.
+    11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+    12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+    13. expCount - The number of experiments.
+    14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount.
+    15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount.
+
+**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - All lines with the same group are linked together into a single item.
+
+</help>
+</tool>
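
The only coordinate arithmetic in the conversion is the shift from GFF's 1-based, end-inclusive intervals to BED's 0-based, half-open ones; a one-line check in Python, using the values from the example above::

    def gff_to_bed_coords(start, end):
        # GFF: 1-based, end-inclusive  ->  BED: 0-based, half-open;
        # the start moves down by one, the end is unchanged.
        return start - 1, end

    assert gff_to_bed_coords(10000000, 10001000) == (9999999, 10001000)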
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff2tabular.R	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,6 @@
+#!/usr/bin/env Rscript
+library(rtracklayer)
+args <- commandArgs(trailingOnly = TRUE)
+gff <- import(args[1], format = "GFF")
+write.table(as.data.frame(gff), file = args[2], quote = FALSE, sep = "\t", row.names = FALSE)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff2tabular.xml	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,19 @@
+<tool id="gff2tabular" name="GFF to Tabular" version="0.1.0">
+  <description>converter</description>
+  <requirements>
+    <requirement type="package">R</requirement>
+    <requirement type="package">bioconductor-rtracklayer</requirement>
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+    Rscript ${__tool_directory__}/gff2tabular.R '$inputgff' '$output'
+  ]]></command>
+  <inputs>
+    <param type="data" name="inputgff" format="gff" />
+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <help><![CDATA[
+    Convert GFF2/GFF3 to a tab-delimited file.
+  ]]></help>
+</tool>
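
Note that rtracklayer also expands GFF attributes into their own columns; a much simpler Python sketch that keeps just the nine standard fields (hypothetical file names, no attribute expansion) would be::

    import csv

    GFF_COLUMNS = ["seqname", "source", "feature", "start", "end",
                   "score", "strand", "frame", "attributes"]

    with open("input.gff") as src, open("output.tsv", "w", newline="") as dst:
        writer = csv.writer(dst, delimiter="\t")
        writer.writerow(GFF_COLUMNS)
        for line in src:
            # Skip blank and comment lines; pass data rows through unchanged.
            if line.strip() and not line.startswith("#"):
                writer.writerow(line.rstrip("\r\n").split("\t"))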
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_filter_by_attribute.py	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,402 @@
+#!/usr/bin/env python
+# This tool takes a gff file as input and creates filters on attributes based on certain properties.
+# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+# TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be
+# abstracted and leveraged in each filtering tool.
+from __future__ import (
+    division,
+    print_function,
+)
+
+import sys
+from ast import (
+    Module,
+    parse,
+    walk,
+)
+from json import loads
+
+AST_NODE_TYPE_WHITELIST = [
+    "Expr",
+    "Load",
+    "Str",
+    "Num",
+    "BoolOp",
+    "Compare",
+    "And",
+    "Eq",
+    "NotEq",
+    "Or",
+    "GtE",
+    "LtE",
+    "Lt",
+    "Gt",
+    "BinOp",
+    "Add",
+    "Div",
+    "Sub",
+    "Mult",
+    "Mod",
+    "Pow",
+    "LShift",
+    "RShift",
+    "BitAnd",
+    "BitOr",
+    "BitXor",
+    "UnaryOp",
+    "Invert",
+    "Not",
+    "NotIn",
+    "In",
+    "Is",
+    "IsNot",
+    "List",
+    "Index",
+    "Subscript",
+    "Name",
+]
+
+
+BUILTIN_AND_MATH_FUNCTIONS = "abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor".split(
+    "|"
+)
+STRING_AND_LIST_METHODS = [name for name in dir("") + dir([]) if not name.startswith("_")]
+VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
+# Name blacklist isn't strictly needed - but provides extra peace of mind.
+NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"]
+
+
+def __check_name(ast_node):
+    name = ast_node.id
+    return name not in NAME_BLACKLIST
+
+
+def check_simple_name(text):
+    """
+
+    >>> check_simple_name("col_name")
+    True
+    >>> check_simple_name("c1=='chr1' and c3-c2>=2000 and c6=='+'")
+    False
+    >>> check_simple_name("eval('1+1')")
+    False
+    >>> check_simple_name("import sys")
+    False
+    >>> check_simple_name("[].__str__")
+    False
+    >>> check_simple_name("__builtins__")
+    False
+    >>> check_simple_name("'x' in globals")
+    False
+    >>> check_simple_name("'x' in [1,2,3]")
+    False
+    >>> check_simple_name("c3=='chr1' and c5>5")
+    False
+    >>> check_simple_name("c3=='chr1' and d5>5")
+    False
+    >>> check_simple_name("c3=='chr1' and c5>5 or exec")
+    False
+    >>> check_simple_name("type(c1) != type(1)")
+    False
+    >>> check_simple_name("c1.split(',')[1] == '1'")
+    False
+    >>> check_simple_name("exec 1")
+    False
+    >>> check_simple_name("str(c2) in [\\\"a\\\",\\\"b\\\"]")
+    False
+    >>> check_simple_name("__import__('os').system('touch /tmp/OOPS')")
+    False
+    """
+    try:
+        module = parse(text)
+    except SyntaxError:
+        return False
+
+    if not isinstance(module, Module):
+        return False
+    statements = module.body
+    if not len(statements) == 1:
+        return False
+    expression = statements[0]
+    if expression.__class__.__name__ != "Expr":
+        return False
+
+    for ast_node in walk(expression):
+        ast_node_class = ast_node.__class__.__name__
+        if ast_node_class not in ["Expr", "Name", "Load"]:
+            return False
+
+        if ast_node_class == "Name" and not __check_name(ast_node):
+            return False
+
+    return True
+
+
+def check_expression(text):
+    """
+
+    >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
+    True
+    >>> check_expression("eval('1+1')")
+    False
+    >>> check_expression("import sys")
+    False
+    >>> check_expression("[].__str__")
+    False
+    >>> check_expression("__builtins__")
+    False
+    >>> check_expression("'x' in globals")
+    False
+    >>> check_expression("'x' in [1,2,3]")
+    True
+    >>> check_expression("c3=='chr1' and c5>5")
+    True
+    >>> check_expression("c3=='chr1' and d5>5")
+    True
+    >>> check_expression("c3=='chr1' and c5>5 or exec")
+    False
+    >>> check_expression("type(c1) != type(1)")
+    False
+    >>> check_expression("c1.split(',')[1] == '1'")
+    False
+    >>> check_expression("exec 1")
+    False
+    >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]")
+    False
+    >>> check_expression("__import__('os').system('touch /tmp/OOPS')")
+    False
+    """
+    try:
+        module = parse(text)
+    except SyntaxError:
+        return False
+
+    if not isinstance(module, Module):
+        return False
+    statements = module.body
+    if not len(statements) == 1:
+        return False
+    expression = statements[0]
+    if expression.__class__.__name__ != "Expr":
+        return False
+
+    for ast_node in walk(expression):
+        ast_node_class = ast_node.__class__.__name__
+
+        # Toss out everything that is not a "simple" expression,
+        # imports, error handling, etc...
+        if ast_node_class not in AST_NODE_TYPE_WHITELIST:
+            return False
+
+        if ast_node_class == "Name" and not __check_name(ast_node):
+            return False
+
+    return True
+
+
+#
+# Helper functions.
+#
+def get_operands(filter_condition):
+    # Note that the order of all_operators is important
+    items_to_strip = [
+        "+",
+        "-",
+        "**",
+        "*",
+        "//",
+        "/",
+        "%",
+        "<<",
+        ">>",
+        "&",
+        "|",
+        "^",
+        "~",
+        "<=",
+        "<",
+        ">=",
+        ">",
+        "==",
+        "!=",
+        "<>",
+        " and ",
+        " or ",
+        " not ",
+        " is ",
+        " is not ",
+        " in ",
+        " not in ",
+    ]
+    for item in items_to_strip:
+        if filter_condition.find(item) >= 0:
+            filter_condition = filter_condition.replace(item, " ")
+    operands = set(filter_condition.split(" "))
+    return operands
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def check_for_executable(text, description=""):
+    # Attempt to determine if the condition includes executable stuff and, if so, exit.
+    secured = dir()
+    operands = get_operands(text)
+    for operand in operands:
+        try:
+            int(operand)
+        except ValueError:
+            if operand in secured:
+                stop_err("Illegal value '%s' in %s '%s'" % (operand, description, text))
+
+
+#
+# Process inputs.
+#
+in_fname = sys.argv[1]
+out_fname = sys.argv[2]
+cond_text = sys.argv[3]
+attribute_types = loads(sys.argv[4])
+
+# Convert types from str to type objects.
+for name, a_type in attribute_types.items():
+    check_for_executable(a_type)
+    if not check_simple_name(a_type):
+        stop_err("Problem with attribute type [%s]" % a_type)
+    attribute_types[name] = eval(a_type)
+
+# It is possible (e.g. in workflows) that the user's filter contains standard
+# GFF attributes which are not present in the actual file -
+# and thus not in the metadata passed in by Galaxy.
+# To avoid a nasty error here, add the official terms from
+# the GFF3 specification, treated as strings.
+# (These all start with a capital letter, which is important):
+for name in [
+    "ID",
+    "Name",
+    "Alias",
+    "Parent",
+    "Target",
+    "Gap",
+    "Derives_from",
+    "Note",
+    "Dbxref",
+    "Ontology_term",
+    "Is_circular",
+]:
+    attribute_types[name] = str
+
+# Unescape if input has been escaped
+mapped_str = {
+    "__lt__": "<",
+    "__le__": "<=",
+    "__eq__": "==",
+    "__ne__": "!=",
+    "__gt__": ">",
+    "__ge__": ">=",
+    "__sq__": "'",
+    "__dq__": '"',
+}
+for key, value in mapped_str.items():
+    cond_text = cond_text.replace(key, value)
+
+# Attempt to determine if the condition includes executable stuff and, if so, exit.
+check_for_executable(cond_text, "condition")
+
+if not check_expression(cond_text):
+    stop_err("Illegal/invalid condition '%s'" % (cond_text))
+
+# Prepare the column variable names and wrappers for column data types. Only
+# prepare columns up to largest column in condition.
+attrs, type_casts = [], []
+for name in attribute_types.keys():
+    attrs.append(name)
+    type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ({"name": name})
+    type_casts.append(type_cast)
+
+attr_str = ", ".join(attrs)  # 'c1, c2, c3, c4'
+type_cast_str = ", ".join(type_casts)  # 'str(c1), int(c2), int(c3), str(c4)'
+wrap = "%s = %s" % (attr_str, type_cast_str)
+
+# Stats
+skipped_lines = 0
+first_invalid_line = 0
+invalid_line = None
+lines_kept = 0
+total_lines = 0
+out = open(out_fname, "wt")
+
+
+# Helper function to safely get and type cast a value in a dict.
+def get_value(name, a_type, values_dict):
+    if name in values_dict:
+        return (a_type)(values_dict[name])
+    else:
+        return None
+
+
+# Read and filter input file, skipping invalid lines
+code = """
+for i, line in enumerate( open( in_fname ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+    if not line or line.startswith( '#' ):
+        # Ignore blank lines or comments
+        continue
+    try:
+        # Place attribute values into variables with attribute
+        # name; type casting is done as well.
+        elems = line.split( '\t' )
+        attribute_values = {}
+        for name_value_pair in elems[8].split(";"):
+            # Split on first equals (GFF3) or space (legacy)
+            name_value_pair = name_value_pair.strip()
+            sep_pos = name_value_pair.replace(" ", "=").find("=")
+            if sep_pos == -1:
+                continue
+            name = name_value_pair[:sep_pos].strip()
+            if name == '':
+                continue
+            # Need to strip double quote from value and typecast.
+            attribute_values[name] = name_value_pair[sep_pos+1:].strip(" \\"")
+        %s
+        if %s:
+            lines_kept += 1
+            print( line, file=out )
+    except Exception as e:
+        print( e )
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+""" % (
+    wrap,
+    cond_text,
+)
+
+valid_filter = True
+try:
+    exec(code)
+except Exception as e:
+    out.close()
+    if str(e).startswith("invalid syntax"):
+        valid_filter = False
+        stop_err('Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text)
+    else:
+        stop_err(str(e))
+
+if valid_filter:
+    out.close()
+    valid_lines = total_lines - skipped_lines
+    print("Filtering with %s, " % (cond_text))
+    if valid_lines > 0:
+        print("kept %4.2f%% of %d lines." % (100.0 * lines_kept / valid_lines, total_lines))
+    else:
+        print(
+            'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.'
+            % cond_text
+        )
+    if skipped_lines > 0:
+        print('Skipped %d invalid lines starting at line #%d: "%s"' % (skipped_lines, first_invalid_line, invalid_line))
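
Outside Galaxy the script can be driven directly; a hedged sketch with hypothetical file names (Galaxy normally fills the fourth argument, a JSON map of attribute names to Python type names, from dataset metadata). One caveat: on Python 3.8+ literal numbers and strings parse as ast.Constant, a node name missing from AST_NODE_TYPE_WHITELIST above, so conditions containing literals are rejected unless "Constant" is added to that list::

    import json
    import subprocess

    # Keep lines whose conf_lo attribute exceeds 0 (the condition used
    # in the tool tests); the JSON argument declares attribute types.
    subprocess.run(
        [
            "python", "gff_filter_by_attribute.py",
            "input.gff", "filtered.gff",
            "conf_lo>0",
            json.dumps({"conf_lo": "float", "conf_hi": "float"}),
        ],
        check=True,
    )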
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_filter_by_attribute.xml	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,57 @@
+<tool id="gff_filter_by_attribute" name="Filter GFF data by attribute" version="0.2">
+  <description>using simple expressions</description>
+  <command>
+    python '$__tool_directory__/gff_filter_by_attribute.py' '$input' '$out_file1' '$cond' '${input.metadata.attribute_types}'
+  </command>
+  <inputs>
+    <param name="input" type="data" format="gff" label="Filter" help="Dataset missing? See TIP below." />
+    <param name="cond" type="text" value="gene_id=='uc002loc.1'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="input" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="gff_filter_attr_in1.gff"/>
+      <param name="cond" value="conf_lo>0"/>
+      <output name="out_file1" file="gff_filter_by_attribute_out1.gff"/>
+    </test>
+    <test>
+      <param name="input" value="gff_filter_attr_in1.gff"/>
+      <param name="cond" value="conf_lo==0 or conf_hi>125"/>
+      <output name="out_file1" file="gff_filter_by_attribute_out2.gff"/>
+    </test>
+    <test>
+      <param name="input" value="5.gff3" ftype="gff3"/>
+      <param name="cond" value="Note=='Ig-like'"/>
+      <output name="out_file1" file="5_Ig-like.gff"/>
+    </test>
+  </tests>
+
+  <help>
+.. class:: warningmark
+
+Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**)
+
+.. class:: infomark
+
+**TIP:** Applying a filtering condition may throw exceptions if the data type (e.g., string, integer) of the attribute being filtered is not appropriate for the condition on every line (e.g., attempting certain numerical calculations on strings). If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition. The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue".
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset using simple conditional statements.
+
+- Make sure that multi-character operators contain no white space (e.g., **<=** is valid while **< =** is not)
+- When using the 'equal-to' operator, a **double equal sign '==' must be used** (e.g., **attribute_name=='chr1'**)
+- Non-numerical values must be enclosed in single or double quotes (e.g., **attribute_name=='XX22'**)
+- You can combine multiple conditional statements using **and** or **or** (e.g., **attribute_name=='XX22' or attribute_name=='XX21'**)
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_bed_converter.py	Mon May 09 08:26:30 2022 +0000
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import sys
+
+from galaxy.datatypes.util.gff_util import parse_gff_attributes
+
+
+def get_bed_line(chrom, name, strand, blocks):
+    """Returns a BED line for given data."""
+
+    if len(blocks) == 1:
+        # Use simple BED format if there is only a single block:
+        #   chrom, chromStart, chromEnd, name, score, strand
+        #
+        start, end = blocks[0]
+        return "%s\t%i\t%i\t%s\t0\t%s\n" % (chrom, start, end, name, strand)
+
+    #
+    # Build lists for transcript blocks' starts, sizes.
+    #
+
+    # Get transcript start, end.
+    t_start = sys.maxsize
+    t_end = -1
+    for block_start, block_end in blocks:
+        if block_start < t_start:
+            t_start = block_start
+        if block_end > t_end:
+            t_end = block_end
+
+    # Get block starts, sizes.
+    block_starts = []
+    block_sizes = []
+    for block_start, block_end in blocks:
+        block_starts.append(str(block_start - t_start))
+        block_sizes.append(str(block_end - block_start))
+
+    #
+    # Create BED entry.
+    # Bed format: chrom, chromStart, chromEnd, name, score, strand, \
+    #   thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
+    #
+    # Render complete feature with thick blocks. There's no clear way to do this unless
+    # we analyze the block names, but making everything thick makes more sense than
+    # making everything thin.
+    #
+    return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % (
+        chrom,
+        t_start,
+        t_end,
+        name,
+        strand,
+        t_start,
+        t_end,
+        len(block_starts),
+        ",".join(block_sizes),
+        ",".join(block_starts),
+    )
+
+
+def __main__():
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    skipped_lines = 0
+    first_skipped_line = 0
+    i = 0
+    cur_transcript_chrome = None
+    cur_transcript_id = None
+    cur_transcript_strand = None
+    cur_transcripts_blocks = []  # (start, end) for each block.
+    with open(output_name, "w") as out, open(input_name) as in_fh:
+        for i, line in enumerate(in_fh):
+            line = line.rstrip("\r\n")
+            if line and not line.startswith("#"):
+                try:
+                    # GFF format: chrom, source, name, chromStart, chromEnd, score, strand, attributes
+                    elems = line.split("\t")
+                    start = str(int(elems[3]) - 1)
+                    coords = [int(start), int(elems[4])]
+                    strand = elems[6]
+                    if strand not in ["+", "-"]:
+                        strand = "+"
+                    attributes = parse_gff_attributes(elems[8])
+                    t_id = attributes.get("transcript_id", None)
+
+                    if not t_id:
+                        #
+                        # No transcript ID, so write last transcript and write current line as its own line.
+                        #
+
+                        # Write previous transcript.
+                        if cur_transcript_id:
+                            # Write BED entry.
+                            out.write(
+                                get_bed_line(
+                                    cur_transcript_chrome,
+                                    cur_transcript_id,
+                                    cur_transcript_strand,
+                                    cur_transcripts_blocks,
+                                )
+                            )
+
+                        # Replace any spaces in the name with underscores so UCSC will not complain.
+                        name = elems[2].replace(" ", "_")
+                        out.write(get_bed_line(elems[0], name, strand, [coords]))
+                        continue
+
+                    # There is a transcript ID, so process line at transcript level.
+                    if t_id == cur_transcript_id:
+                        # Line is element of transcript and will be a block in the BED entry.
+                        cur_transcripts_blocks.append(coords)
+                        continue
+
+                    #
+                    # Line is part of new transcript; write previous transcript and start
+                    # new transcript.
+                    #
+
+                    # Write previous transcript.
+                    if cur_transcript_id:
+                        # Write BED entry.
+                        out.write(
+                            get_bed_line(
+                                cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks
+                            )
+                        )
+
+                    # Start new transcript.
+                    cur_transcript_chrome = elems[0]
+                    cur_transcript_id = t_id
+                    cur_transcript_strand = strand
+                    cur_transcripts_blocks = []
+                    cur_transcripts_blocks.append(coords)
+                except Exception:
+                    skipped_lines += 1
+                    if not first_skipped_line:
+                        first_skipped_line = i + 1
+            else:
+                skipped_lines += 1
+                if not first_skipped_line:
+                    first_skipped_line = i + 1
+
+        # Write last transcript.
+        if cur_transcript_id:
+            # Write BED entry.
+            out.write(
+                get_bed_line(cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks)
+            )
+    info_msg = "%i lines converted to BED. " % (i + 1 - skipped_lines)
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (
+            skipped_lines,
+            first_skipped_line,
+        )
+    print(info_msg)
+
+
+if __name__ == "__main__":
+    __main__()
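
For multi-block features, get_bed_line emits a full BED12 line. A small illustration (runnable only where the galaxy package imported above is installed, since importing the module pulls in parse_gff_attributes)::

    from gff_to_bed_converter import get_bed_line

    # Two exons of one transcript; coordinates are already 0-based,
    # half-open, as the converter prepares them (GFF start - 1).
    blocks = [(100, 200), (300, 450)]
    print(get_bed_line("chr1", "tx1", "+", blocks), end="")
    # -> chr1  100  450  tx1  0  +  100  450  0  2  100,150  0,200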