Mercurial > repos > devteam > flanking_features
comparison flanking_features.py @ 1:8307665c4b6c draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
| author | devteam |
|---|---|
| date | Wed, 11 Nov 2015 12:48:18 -0500 |
| parents | 90100b587723 |
| children | a09d13b108fd |
comparison
equal
deleted
inserted
replaced
| 0:90100b587723 | 1:8307665c4b6c |
|---|---|
| 8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file | 8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file |
| 9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval | 9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval |
| 10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval | 10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval |
| 11 """ | 11 """ |
| 12 | 12 |
| 13 import sys, traceback, fileinput | 13 import fileinput |
| 14 from warnings import warn | 14 import sys |
| 15 from bx.cookbook import doc_optparse | 15 from bx.cookbook import doc_optparse |
| 16 from galaxy.tools.util.galaxyops import * | 16 from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper |
| 17 from bx.intervals.io import * | |
| 18 from bx.intervals.operations import quicksect | 17 from bx.intervals.operations import quicksect |
| 19 from utils.gff_util import * | 18 from bx.tabular.io import ParseError |
| 19 from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped | |
| 20 from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper | |
| 20 | 21 |
| 21 assert sys.version_info[:2] >= ( 2, 4 ) | 22 assert sys.version_info[:2] >= ( 2, 4 ) |
| 22 | 23 |
| 23 def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): | 24 |
| 25 def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down): | |
| 24 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases | 26 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases |
| 25 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand | 27 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand |
| 26 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand | 28 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand |
| 27 if direction == 1: | 29 if direction == 1: |
| 28 if node.maxend <= threshold_up: | 30 if node.maxend <= threshold_up: |
| 29 if node.end == node.maxend: | 31 if node.end == node.maxend: |
| 30 report_func_up(node) | 32 report_func_up(node) |
| 31 elif node.right and node.left: | 33 elif node.right and node.left: |
| 32 if node.right.maxend == node.maxend: | 34 if node.right.maxend == node.maxend: |
| 58 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) | 60 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
| 59 else: | 61 else: |
| 60 if node.right: | 62 if node.right: |
| 61 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) | 63 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
| 62 | 64 |
| 65 | |
| 63 def proximal_region_finder(readers, region, comments=True): | 66 def proximal_region_finder(readers, region, comments=True): |
| 64 """ | 67 """ |
| 65 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. | 68 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. |
| 66 Intervals are GenomicInterval objects. | 69 Intervals are GenomicInterval objects. |
| 67 """ | 70 """ |
| 68 primary = readers[0] | 71 primary = readers[0] |
| 69 features = readers[1] | 72 features = readers[1] |
| 70 either = False | 73 either = False |
| 71 if region == 'Upstream': | 74 if region == 'Upstream': |
| 74 up, down = False, True | 77 up, down = False, True |
| 75 else: | 78 else: |
| 76 up, down = True, True | 79 up, down = True, True |
| 77 if region == 'Either': | 80 if region == 'Either': |
| 78 either = True | 81 either = True |
| 79 | 82 |
| 80 # Read features into memory: | 83 # Read features into memory: |
| 81 rightTree = quicksect.IntervalTree() | 84 rightTree = quicksect.IntervalTree() |
| 82 for item in features: | 85 for item in features: |
| 83 if type( item ) is GenomicInterval: | 86 if type( item ) is GenomicInterval: |
| 84 rightTree.insert( item, features.linenum, item ) | 87 rightTree.insert( item, features.linenum, item ) |
| 85 | 88 |
| 86 for interval in primary: | 89 for interval in primary: |
| 87 if type( interval ) is Header: | 90 if type( interval ) is Header: |
| 88 yield interval | 91 yield interval |
| 89 if type( interval ) is Comment and comments: | 92 if type( interval ) is Comment and comments: |
| 90 yield interval | 93 yield interval |
| 94 end = int(interval.end) | 97 end = int(interval.end) |
| 95 strand = interval.strand | 98 strand = interval.strand |
| 96 if chrom not in rightTree.chroms: | 99 if chrom not in rightTree.chroms: |
| 97 continue | 100 continue |
| 98 else: | 101 else: |
| 99 root = rightTree.chroms[chrom] #root node for the chrom tree | 102 root = rightTree.chroms[chrom] # root node for the chrom tree |
| 100 result_up = [] | 103 result_up = [] |
| 101 result_down = [] | 104 result_down = [] |
| 102 if (strand == '+' and up) or (strand == '-' and down): | 105 if (strand == '+' and up) or (strand == '-' and down): |
| 103 #upstream +ve strand and downstream -ve strand cases | 106 #upstream +ve strand and downstream -ve strand cases |
| 104 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) | 107 get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None) |
| 105 | 108 |
| 106 if (strand == '+' and down) or (strand == '-' and up): | 109 if (strand == '+' and down) or (strand == '-' and up): |
| 107 #downstream +ve strand and upstream -ve strand case | 110 #downstream +ve strand and upstream -ve strand case |
| 108 get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) | 111 get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node )) |
| 109 | 112 |
| 110 if result_up: | 113 if result_up: |
| 111 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. | 114 if len(result_up) > 1: # The results_up list has a list of intervals upstream to the given interval. |
| 112 ends = [] | 115 ends = [] |
| 113 for n in result_up: | 116 for n in result_up: |
| 114 ends.append(n.end) | 117 ends.append(n.end) |
| 115 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list | 118 res_ind = ends.index(max(ends)) # fetch the index of the closest interval i.e. the interval with the max end from the results_up list |
| 116 else: | 119 else: |
| 117 res_ind = 0 | 120 res_ind = 0 |
| 118 if not(either): | 121 if not(either): |
| 119 yield [ interval, result_up[res_ind].other ] | 122 yield [ interval, result_up[res_ind].other ] |
| 120 | 123 |
| 121 if result_down: | 124 if result_down: |
| 122 if not(either): | 125 if not(either): |
| 123 #The last element of result_down will be the closest element to the given interval | 126 #The last element of result_down will be the closest element to the given interval |
| 124 yield [ interval, result_down[-1].other ] | 127 yield [ interval, result_down[-1].other ] |
| 125 | 128 |
| 126 if either and (result_up or result_down): | 129 if either and (result_up or result_down): |
| 127 iter_val = [] | 130 iter_val = [] |
| 128 if result_up and result_down: | 131 if result_up and result_down: |
| 129 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): | 132 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): |
| 130 iter_val = [ interval, result_up[res_ind].other ] | 133 iter_val = [ interval, result_up[res_ind].other ] |
| 135 iter_val = [ interval, result_up[res_ind].other ] | 138 iter_val = [ interval, result_up[res_ind].other ] |
| 136 elif result_down: | 139 elif result_down: |
| 137 #The last element of result_down will be the closest element to the given interval | 140 #The last element of result_down will be the closest element to the given interval |
| 138 iter_val = [ interval, result_down[-1].other ] | 141 iter_val = [ interval, result_down[-1].other ] |
| 139 yield iter_val | 142 yield iter_val |
| 140 | 143 |
| 144 | |
| 141 def main(): | 145 def main(): |
| 142 options, args = doc_optparse.parse( __doc__ ) | 146 options, args = doc_optparse.parse( __doc__ ) |
| 143 try: | 147 try: |
| 144 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) | 148 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
| 145 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) | 149 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) |
| 146 in1_gff_format = bool( options.gff1 ) | 150 in1_gff_format = bool( options.gff1 ) |
| 147 in2_gff_format = bool( options.gff2 ) | 151 in2_gff_format = bool( options.gff2 ) |
| 148 in_fname, in2_fname, out_fname, direction = args | 152 in_fname, in2_fname, out_fname, direction = args |
| 149 except: | 153 except: |
| 150 doc_optparse.exception() | 154 doc_optparse.exception() |
| 151 | 155 |
| 152 # Set readers to handle either GFF or default format. | 156 # Set readers to handle either GFF or default format. |
| 153 if in1_gff_format: | 157 if in1_gff_format: |
| 154 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper | 158 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper |
| 155 else: | 159 else: |
| 156 in1_reader_wrapper = NiceReaderWrapper | 160 in1_reader_wrapper = NiceReaderWrapper |
| 158 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper | 162 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper |
| 159 else: | 163 else: |
| 160 in2_reader_wrapper = NiceReaderWrapper | 164 in2_reader_wrapper = NiceReaderWrapper |
| 161 | 165 |
| 162 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), | 166 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), |
| 163 chrom_col=chr_col_1, | 167 chrom_col=chr_col_1, |
| 164 start_col=start_col_1, | 168 start_col=start_col_1, |
| 165 end_col=end_col_1, | 169 end_col=end_col_1, |
| 166 strand_col=strand_col_1, | 170 strand_col=strand_col_1, |
| 167 fix_strand=True ) | 171 fix_strand=True ) |
| 168 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), | 172 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), |
| 169 chrom_col=chr_col_2, | 173 chrom_col=chr_col_2, |
| 170 start_col=start_col_2, | 174 start_col=start_col_2, |
| 171 end_col=end_col_2, | 175 end_col=end_col_2, |
| 172 strand_col=strand_col_2, | 176 strand_col=strand_col_2, |
| 173 fix_strand=True ) | 177 fix_strand=True ) |
| 174 | 178 |
| 175 # Find flanking features. | 179 # Find flanking features. |
| 176 out_file = open( out_fname, "w" ) | 180 out_file = open( out_fname, "w" ) |
| 177 try: | 181 try: |
| 178 for result in proximal_region_finder([g1,g2], direction): | 182 for result in proximal_region_finder([g1, g2], direction): |
| 179 if type( result ) is list: | 183 if type( result ) is list: |
| 180 line, closest_feature = result | 184 line, closest_feature = result |
| 181 # Need to join outputs differently depending on file types. | 185 # Need to join outputs differently depending on file types. |
| 182 if in1_gff_format: | 186 if in1_gff_format: |
| 183 # Output is GFF with added attribute 'closest feature.' | 187 # Output is GFF with added attribute 'closest feature.' |
| 184 | 188 |
| 185 # Invervals are in BED coordinates; need to convert to GFF. | 189 # Invervals are in BED coordinates; need to convert to GFF. |
| 186 line = convert_bed_coords_to_gff( line ) | 190 line = convert_bed_coords_to_gff( line ) |
| 187 closest_feature = convert_bed_coords_to_gff( closest_feature ) | 191 closest_feature = convert_bed_coords_to_gff( closest_feature ) |
| 188 | 192 |
| 189 # Replace double quotes with single quotes in closest feature's attributes. | 193 # Replace double quotes with single quotes in closest feature's attributes. |
| 190 out_file.write( "%s closest_feature \"%s\" \n" % | 194 out_file.write( "%s closest_feature \"%s\" \n" % |
| 191 ( "\t".join( line.fields ), \ | 195 ( "\t".join( line.fields ), |
| 192 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) | 196 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) |
| 193 ) ) | 197 ) ) |
| 194 else: | 198 else: |
| 195 # Output is BED + closest feature fields. | 199 # Output is BED + closest feature fields. |
| 196 output_line_fields = [] | 200 output_line_fields = [] |
| 197 output_line_fields.extend( line.fields ) | 201 output_line_fields.extend( line.fields ) |
| 198 output_line_fields.extend( closest_feature.fields ) | 202 output_line_fields.extend( closest_feature.fields ) |
| 200 else: | 204 else: |
| 201 out_file.write( "%s\n" % result ) | 205 out_file.write( "%s\n" % result ) |
| 202 except ParseError, exc: | 206 except ParseError, exc: |
| 203 fail( "Invalid file format: %s" % str( exc ) ) | 207 fail( "Invalid file format: %s" % str( exc ) ) |
| 204 | 208 |
| 205 print "Direction: %s" %(direction) | 209 print "Direction: %s" % (direction) |
| 206 if g1.skipped > 0: | 210 if g1.skipped > 0: |
| 207 print skipped( g1, filedesc=" of 1st dataset" ) | 211 print skipped( g1, filedesc=" of 1st dataset" ) |
| 208 if g2.skipped > 0: | 212 if g2.skipped > 0: |
| 209 print skipped( g2, filedesc=" of 2nd dataset" ) | 213 print skipped( g2, filedesc=" of 2nd dataset" ) |
| 210 | 214 |
