Mercurial > repos > dave > interval2maf
diff interval2maf.py @ 0:d38f317fa9d4 draft default tip
Uploaded
author | dave |
---|---|
date | Fri, 10 Jul 2020 16:05:40 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interval2maf.py Fri Jul 10 16:05:40 2020 -0400 @@ -0,0 +1,145 @@ +#!/usr/bin/env python +""" +Reads a list of intervals and a maf. Produces a new maf containing the +blocks or parts of blocks in the original that overlapped the intervals. + +If a MAF file, not UID, is provided the MAF file is indexed before being processed. + +NOTE: If two intervals overlap the same block it will be written twice. + +usage: %prog maf_file [options] + -d, --dbkey=d: Database key, ie hg17 + -c, --chromCol=c: Column of Chr + -s, --startCol=s: Column of Start + -e, --endCol=e: Column of End + -S, --strandCol=S: Column of Strand + -t, --mafType=t: Type of MAF source to use + -m, --mafFile=m: Path of source MAF file, if not using cached version + -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version + -i, --interval_file=i: Input interval file + -o, --output_file=o: Output MAF file + -p, --species=p: Species to include in output + -P, --split_blocks_by_species=P: Split blocks by species + -r, --remove_all_gap_columns=r: Remove all Gap columns + -l, --indexLocation=l: Override default maf_index.loc file + -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc ) +""" +# Dan Blankenberg +from __future__ import print_function + +import bx.align.maf +import bx.intervals.io +from bx.cookbook import doc_optparse + +from galaxy.tools.util import maf_utilities + + +def __main__(): + index = index_filename = None + + # Parse Command Line + options, args = doc_optparse.parse(__doc__) + + if options.dbkey: + dbkey = options.dbkey + else: + dbkey = None + if dbkey in [None, "?"]: + maf_utilities.tool_fail("You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file.") + + species = maf_utilities.parse_species_option(options.species) + + if options.chromCol: + chromCol = int(options.chromCol) - 1 + else: + maf_utilities.tool_fail("Chromosome column not set, click the pencil icon in the history item to set the metadata attributes.") + + if options.startCol: + startCol = int(options.startCol) - 1 + else: + maf_utilities.tool_fail("Start column not set, click the pencil icon in the history item to set the metadata attributes.") + + if options.endCol: + endCol = int(options.endCol) - 1 + else: + maf_utilities.tool_fail("End column not set, click the pencil icon in the history item to set the metadata attributes.") + + if options.strandCol: + strandCol = int(options.strandCol) - 1 + else: + strandCol = -1 + + if options.interval_file: + interval_file = options.interval_file + else: + maf_utilities.tool_fail("Input interval file has not been specified.") + + if options.output_file: + output_file = options.output_file + else: + maf_utilities.tool_fail("Output file has not been specified.") + + split_blocks_by_species = remove_all_gap_columns = False + if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species': + split_blocks_by_species = True + if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns': + remove_all_gap_columns = True + else: + remove_all_gap_columns = True + # Finish parsing command line + + # Open indexed access to MAFs + if options.mafType: + if options.indexLocation: + index = maf_utilities.maf_index_by_uid(options.mafType, options.indexLocation) + else: + index = maf_utilities.maf_index_by_uid(options.mafType, options.mafIndexFile) + if index is None: + maf_utilities.tool_fail("The MAF source specified (%s) appears to be invalid." % (options.mafType)) + elif options.mafFile: + index, index_filename = maf_utilities.open_or_build_maf_index(options.mafFile, options.mafIndex, species=[dbkey]) + if index is None: + maf_utilities.tool_fail("Your MAF file appears to be malformed.") + else: + maf_utilities.tool_fail("Desired source MAF type has not been specified.") + + # Create MAF writter + out = bx.align.maf.Writer(open(output_file, "w")) + + # Iterate over input regions + num_blocks = 0 + num_regions = None + for num_regions, region in enumerate(bx.intervals.io.NiceReaderWrapper(open(interval_file, 'r'), chrom_col=chromCol, start_col=startCol, end_col=endCol, strand_col=strandCol, fix_strand=True, return_header=False, return_comments=False)): + src = maf_utilities.src_merge(dbkey, region.chrom) + for block in index.get_as_iterator(src, region.start, region.end): + if split_blocks_by_species: + blocks = [new_block for new_block in maf_utilities.iter_blocks_split_by_species(block) if maf_utilities.component_overlaps_region(new_block.get_component_by_src_start(dbkey), region)] + else: + blocks = [block] + for block in blocks: + block = maf_utilities.chop_block_by_region(block, src, region) + if block is not None: + if species is not None: + block = block.limit_to_species(species) + block = maf_utilities.orient_block_by_region(block, src, region) + if remove_all_gap_columns: + block.remove_all_gap_columns() + out.write(block) + num_blocks += 1 + + # Close output MAF + out.close() + + # remove index file if created during run + maf_utilities.remove_temp_index_file(index_filename) + + if num_blocks: + print("%i MAF blocks extracted for %i regions." % (num_blocks, (num_regions + 1))) + elif num_regions is not None: + print("No MAF blocks could be extracted for %i regions." % (num_regions + 1)) + else: + print("No valid regions have been provided.") + + +if __name__ == "__main__": + __main__()