Mercurial > repos > iuc > genetrack
diff genetrack_util.py @ 0:25cd59a002d9 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genetrack commit e96df94dba60050fa28aaf55b5bb095717a5f260
author | iuc |
---|---|
date | Tue, 22 Dec 2015 17:03:27 -0500 |
parents | |
children | df7ac50ade5d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genetrack_util.py Tue Dec 22 17:03:27 2015 -0500 @@ -0,0 +1,390 @@ +import bisect +import math +import numpy +import re +import subprocess +import sys +import tempfile + +GFF_EXT = 'gff' +SCIDX_EXT = 'scidx' + + +def noop(data): + return data + + +def zeropad_to_numeric(data): + return re.sub(r'chr0(\d)', r'chr\1', data) + + +def numeric_to_zeropad(data): + return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data) + + +FORMATS = ['zeropad', 'numeric'] +IN_CONVERT = {'zeropad': zeropad_to_numeric, 'numeric': noop} +OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'numeric': noop} + + +def conversion_functions(in_fmt, out_fmt): + """ + Returns the proper list of functions to apply to perform a conversion + """ + return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]] + + +def convert_data(data, in_fmt, out_fmt): + for fn in conversion_functions(in_fmt, out_fmt): + data = fn(data) + return data + + +class ChromosomeManager(object): + """ + Manages a CSV reader of an index file to only load one chrom at a time + """ + + def __init__(self, reader): + self.done = False + self.reader = reader + self.processed_chromosomes = [] + self.current_index = 0 + self.next_valid() + + def next(self): + self.line = self.reader.next() + + def is_valid(self, line): + if len(line) not in [4, 5, 9]: + return False + try: + [int(i) for i in line[1:]] + self.format = SCIDX_EXT + return True + except ValueError: + try: + if len(line) < 6: + return False + [int(line[4]), int(line[5])] + self.format = GFF_EXT + return True + except ValueError: + return False + + def next_valid(self): + """ + Advance to the next valid line in the reader + """ + self.line = self.reader.next() + s = 0 + while not self.is_valid(self.line): + self.line = self.reader.next() + s += 1 + if s > 0: + # Skip initial line(s) of file + pass + + def parse_line(self, line): + if self.format == SCIDX_EXT: + return [int(line[1]), int(line[2]), int(line[3])] + else: + return [int(line[3]), line[6], line[5]] + + def chromosome_name(self): + """ + Return the name of the chromosome about to be loaded + """ + return self.line[0] + + def load_chromosome(self, collect_data=True): + """ + Load the current chromosome into an array and return it + """ + cname = self.chromosome_name() + if cname in self.processed_chromosomes: + stop_err('File is not grouped by chromosome') + self.data = [] + while self.line[0] == cname: + if collect_data: + read = self.parse_line(self.line) + if read[0] < self.current_index: + msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index) + stop_err(msg) + self.current_index = read[0] + self.add_read(read) + try: + self.next() + except StopIteration: + self.done = True + break + self.processed_chromosomes.append(cname) + self.current_index = 0 + data = self.data + # Don't retain reference anymore to save memory + del self.data + return data + + def add_read(self, read): + if self.format == SCIDX_EXT: + self.data.append(read) + else: + index, strand, value = read + if value == '' or value == '.': + value = 1 + else: + value = int(value) + if not self.data: + self.data.append([index, 0, 0]) + current_read = self.data[-1] + if self.data[-1][0] == index: + current_read = self.data[-1] + elif self.data[-1][0] < index: + self.data.append([index, 0, 0]) + current_read = self.data[-1] + else: + msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index) + stop_err(msg) + if strand == '+': + current_read[1] += value + elif strand == '-': + current_read[2] += value + else: + msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index) + stop_err(msg) + + def skip_chromosome(self): + """ + Skip the current chromosome, discarding data + """ + self.load_chromosome(collect_data=False) + + +class Peak(object): + def __init__(self, index, pos_width, neg_width): + self.index = index + self.start = index - neg_width + self.end = index + pos_width + self.value = 0 + self.deleted = False + self.safe = False + + def __repr__(self): + return '[%d] %d' % (self.index, self.value) + + +def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): + return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) + + +def gff_attrs(d): + if not d: + return '.' + return ';'.join('%s=%s' % item for item in d.items()) + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit(1) + + +def is_int(i): + try: + int(i) + return True + except ValueError: + return False + + +def make_keys(data): + return [read[0] for read in data] + + +def make_peak_keys(peaks): + return [peak.index for peak in peaks] + + +def get_window(data, start, end, keys): + """ + Returns all reads from the data set with index between the two indexes + """ + start_index = bisect.bisect_left(keys, start) + end_index = bisect.bisect_right(keys, end) + return data[start_index:end_index] + + +def get_index(value, keys): + """ + Returns the index of the value in the keys using bisect + """ + return bisect.bisect_left(keys, value) + + +def get_range(data): + lo = min([item[0] for item in data]) + hi = max([item[0] for item in data]) + return lo, hi + + +def get_chunks(lo, hi, size, overlap=500): + """ + Divides a range into chunks of maximum size size. Returns a list of + 2-tuples (slice_range, process_range), each a 2-tuple (start, end). + process_range has zero overlap and should be given to process_chromosome + as-is, and slice_range is overlapped and should be used to slice the + data (using get_window) to be given to process_chromosome. + """ + chunks = [] + for start_index in range(lo, hi, size): + process_start = start_index + # Don't go over upper bound + process_end = min(start_index + size, hi) + # Don't go under lower bound + slice_start = max(process_start - overlap, lo) + # Don't go over upper bound + slice_end = min(process_end + overlap, hi) + chunks.append(((slice_start, slice_end), (process_start, process_end))) + return chunks + + +def allocate_array(data, width): + """ + Allocates a new array with the dimensions required to fit all reads in + the argument. The new array is totally empty. Returns the array and the + shift (number to add to a read index to get the position in the array it + should be at). + """ + lo, hi = get_range(data) + rng = hi - lo + shift = width - lo + return numpy.zeros(rng+width*2, numpy.float), shift + + +def normal_array(width, sigma, normalize=True): + """ + Returns an array of the normal distribution of the specified width + """ + sigma2 = float(sigma)**2 + + def normal_func(x): + return math.exp(-x * x / (2 * sigma2)) + + # width is the half of the distribution + values = map(normal_func, range(-width, width)) + values = numpy.array(values, numpy.float) + # normalization + if normalize: + values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values + return values + + +def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion): + peaks = [] + + def find_peaks(): + # Go through the array and call each peak + results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1)) + indexes = numpy.where(results) + for index in indexes[0]: + pos = down_width or exclusion // 2 + neg = up_width or exclusion // 2 + # Reverse strand + if direction == 2: + # Swap positive and negative widths + pos, neg = neg, pos + peaks.append(Peak(int(index)-shift, pos, neg)) + find_peaks() + + def calculate_reads(): + # Calculate the number of reads in each peak + for peak in peaks: + reads = get_window(data, peak.start, peak.end, keys) + peak.value = sum([read[direction] for read in reads]) + # Flat list of indexes with frequency + indexes = [r for read in reads for r in [read[0]] * read[direction]] + peak.stddev = numpy.std(indexes) + calculate_reads() + + def perform_exclusion(): + # Process the exclusion zone + peak_keys = make_peak_keys(peaks) + peaks_by_value = peaks[:] + peaks_by_value.sort(key=lambda peak: -peak.value) + for peak in peaks_by_value: + peak.safe = True + window = get_window(peaks, + peak.index-exclusion//2, + peak.index+exclusion//2, + peak_keys) + for excluded in window: + if excluded.safe: + continue + i = get_index(excluded.index, peak_keys) + del peak_keys[i] + del peaks[i] + perform_exclusion() + return peaks + + +def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter): + """ + Process a chromosome. Takes the chromosome name, list of reads, a CSV + writer to write processes results to, the bounds (2-tuple) to write + results in, and options. + """ + if not data: + return + keys = make_keys(data) + # Create the arrays that hold the sum of the normals + forward_array, forward_shift = allocate_array(data, width) + reverse_array, reverse_shift = allocate_array(data, width) + normal = normal_array(width, sigma) + + def populate_array(): + # Add each read's normal to the array + for read in data: + index, forward, reverse = read + # Add the normals to the appropriate regions + if forward: + forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward + if reverse: + reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse + populate_array() + forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion) + reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion) + # Convert chromosome name in preparation for writing output + cname = convert_data(cname, 'zeropad', 'numeric') + + def write(cname, strand, peak): + start = max(peak.start, 1) + end = peak.end + value = peak.value + stddev = peak.stddev + if value > filter: + # This version of genetrack outputs only gff files. + writer.writerow(gff_row(cname=cname, + source='genetrack', + start=start, + end=end, + score=value, + strand=strand, + attrs={'stddev': stddev})) + + for peak in forward_peaks: + if process_bounds[0] < peak.index < process_bounds[1]: + write(cname, '+', peak) + for peak in reverse_peaks: + if process_bounds[0] < peak.index < process_bounds[1]: + write(cname, '-', peak) + + +def sort_chromosome_reads_by_index(input_path): + """ + Return a gff file with chromosome reads sorted by index. + """ + # Will this sort produce different results across platforms? + output_path = tempfile.NamedTemporaryFile(delete=False).name + command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path) + p = subprocess.Popen(command, shell=True) + p.wait() + return output_path