Mercurial > repos > artbio > small_rna_signatures
diff signature.py @ 7:07771982ef9b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 7276b6b73aef7af4058ad2c1e34c4557e9cccbe0
author | artbio |
---|---|
date | Sun, 10 Sep 2017 13:50:40 -0400 |
parents | a35e6f9c1d34 |
children | 8d3ca9652a5b |
line wrap: on
line diff
--- a/signature.py Sun Sep 10 10:27:19 2017 -0400 +++ b/signature.py Sun Sep 10 13:50:40 2017 -0400 @@ -40,11 +40,23 @@ class Map: - def __init__(self, bam_file): + def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23, + maxtarget=29, minscope=1, maxscope=19, output_h='', + output_z=''): self.bam_object = pysam.AlignmentFile(bam_file, 'rb') + self.query_range = range(minquery, maxquery + 1) + self.target_range = range(mintarget, maxtarget + 1) + self.scope = range(minscope, maxscope + 1) + self.H = open(output_h, 'w') + self.Z = open(output_z, 'w') self.chromosomes = dict(zip(self.bam_object.references, self.bam_object.lengths)) self.map_dict = self.create_map(self.bam_object) + self.query_positions = self.compute_query_positions() + self.Z.write(self.compute_signature_pairs()) + self.H.write(self.compute_signature_h()) + self.H.close() + self.Z.close() def create_map(self, bam_object): ''' @@ -58,18 +70,73 @@ map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] for chrom in self.chromosomes: for read in bam_object.fetch(chrom): - positions = read.positions # a list of covered positions if read.is_reverse: - map_dictionary[(chrom, positions[-1]+1, + map_dictionary[(chrom, read.reference_end, 'R')].append(read.query_alignment_length) else: - map_dictionary[(chrom, positions[0]+1, + map_dictionary[(chrom, read.reference_start+1, 'F')].append(read.query_alignment_length) return map_dictionary - def signature_tables(self, minquery, maxquery, mintarget, maxtarget): - query_range = range(minquery, maxquery + 1) - target_range = range(mintarget, maxtarget + 1) + def compute_query_positions(self): + ''' this method does not filter on read size, just forward reads + that overlap reverse reads in the overlap range''' + all_query_positions = defaultdict(list) + for genomicKey in self.map_dict.keys(): + chrom, coord, pol = genomicKey + for i in self.scope: + if pol == 'F' and len(self.map_dict[chrom, + coord+i-1, + 'R']) > 0: + all_query_positions[chrom].append(coord) + break + for chrom in all_query_positions: + all_query_positions[chrom] = sorted( + list(set(all_query_positions[chrom]))) + return all_query_positions + + def countpairs(self, uppers, lowers): + query_range = self.query_range + target_range = self.target_range + uppers = [size for size in uppers if size in query_range or size in + target_range] + lowers = [size for size in lowers if size in query_range or size in + target_range] + paired = [] + for upread in uppers: + for downread in lowers: + if (upread in query_range and downread in target_range) or ( + upread in target_range and downread in query_range): + paired.append(upread) + lowers.remove(downread) + break + return len(paired) + + def compute_signature_pairs(self): + frequency_table = defaultdict(dict) + scope = self.scope + for chrom in self.chromosomes: + for overlap in scope: + frequency_table[chrom][overlap] = 0 + for chrom in self.query_positions: + for coord in self.query_positions[chrom]: + for overlap in scope: + uppers = self.map_dict[chrom, coord, 'F'] + lowers = self.map_dict[chrom, coord+overlap-1, 'R'] + frequency_table[chrom][overlap] += self.countpairs(uppers, + lowers) + # compute overlaps for all chromosomes merged + for overlap in scope: + accumulator = [] + for chrom in frequency_table: + if chrom != 'all_chromosomes': + accumulator.append(frequency_table[chrom][overlap]) + frequency_table['all_chromosomes'][overlap] = sum(accumulator) + return self.stringify_table(frequency_table) + + def signature_tables(self): + query_range = self.query_range + target_range = self.target_range Query_table = defaultdict(dict) Target_table = defaultdict(dict) for key in self.map_dict: @@ -87,39 +154,9 @@ Target_table[key[0]].get(coordinate, 0) + 1 return Query_table, Target_table - def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget, - scope, zscore="no"): - Query_table, Target_table = self.signature_tables(minquery, maxquery, - mintarget, maxtarget) - frequency_table = defaultdict(dict) - for chrom in self.chromosomes: - for overlap in scope: - frequency_table[chrom][overlap] = 0 - for chrom in Query_table: - for coord in Query_table[chrom]: - for overlap in scope: - frequency_table[chrom][overlap] += min( - Query_table[chrom][coord], - Target_table[chrom].get(-coord - overlap + 1, 0)) - # since we want the number of pairs, not the number or paired reads - # to do: what in complex cases - # with query and target sizes partially overlap ? - for chrom in frequency_table: - for overlap in frequency_table[chrom]: - frequency_table[chrom][overlap] /= 2 - # compute overlaps for all chromosomes merged - for overlap in scope: - accumulator = [] - for chrom in frequency_table: - if chrom != 'all_chromosomes': - accumulator.append(frequency_table[chrom][overlap]) - frequency_table['all_chromosomes'][overlap] = sum(accumulator) - return self.stringify_table(frequency_table) - - def compute_signature_h(self, minquery, maxquery, mintarget, - maxtarget, scope): - Query_table, Target_table = self.signature_tables(minquery, maxquery, - mintarget, maxtarget) + def compute_signature_h(self): + scope = self.scope + Query_table, Target_table = self.signature_tables() frequency_table = defaultdict(dict) for chrom in self.chromosomes: for overlap in scope: @@ -187,23 +224,8 @@ return ''.join(tablestring) - -def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope, - output_h, output_z, genome_wide=False, zscore="no"): - H = open(output_h, 'w') - Z = open(output_z, 'w') - mapobj = Map(input) - scope = range(minscope, maxscope + 1) - Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget, - maxtarget, scope, zscore="no")) - H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget, - maxtarget, scope)) - H.close() - Z.close() - - if __name__ == "__main__": args = Parser() - main(args.input, args.minquery, args.maxquery, args.mintarget, - args.maxtarget, args.minscope, args.maxscope, args.output_h, - args.output_z) + mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget, + args.maxtarget, args.minscope, args.maxscope, args.output_h, + args.output_z)