diff signature.py @ 7:07771982ef9b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 7276b6b73aef7af4058ad2c1e34c4557e9cccbe0
author artbio
date Sun, 10 Sep 2017 13:50:40 -0400
parents a35e6f9c1d34
children 8d3ca9652a5b
line wrap: on
line diff
--- a/signature.py	Sun Sep 10 10:27:19 2017 -0400
+++ b/signature.py	Sun Sep 10 13:50:40 2017 -0400
@@ -40,11 +40,23 @@
 
 class Map:
 
-    def __init__(self, bam_file):
+    def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23,
+                 maxtarget=29, minscope=1, maxscope=19, output_h='',
+                 output_z=''):
         self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
+        self.query_range = range(minquery, maxquery + 1)
+        self.target_range = range(mintarget, maxtarget + 1)
+        self.scope = range(minscope, maxscope + 1)
+        self.H = open(output_h, 'w')
+        self.Z = open(output_z, 'w')
         self.chromosomes = dict(zip(self.bam_object.references,
                                 self.bam_object.lengths))
         self.map_dict = self.create_map(self.bam_object)
+        self.query_positions = self.compute_query_positions()
+        self.Z.write(self.compute_signature_pairs())
+        self.H.write(self.compute_signature_h())
+        self.H.close()
+        self.Z.close()
 
     def create_map(self, bam_object):
         '''
@@ -58,18 +70,73 @@
             map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
         for chrom in self.chromosomes:
             for read in bam_object.fetch(chrom):
-                positions = read.positions  # a list of covered positions
                 if read.is_reverse:
-                    map_dictionary[(chrom, positions[-1]+1,
+                    map_dictionary[(chrom, read.reference_end,
                                     'R')].append(read.query_alignment_length)
                 else:
-                    map_dictionary[(chrom, positions[0]+1,
+                    map_dictionary[(chrom, read.reference_start+1,
                                     'F')].append(read.query_alignment_length)
         return map_dictionary
 
-    def signature_tables(self, minquery, maxquery, mintarget, maxtarget):
-        query_range = range(minquery, maxquery + 1)
-        target_range = range(mintarget, maxtarget + 1)
+    def compute_query_positions(self):
+        ''' this method does not filter on read size, just forward reads
+        that overlap reverse reads in the overlap range'''
+        all_query_positions = defaultdict(list)
+        for genomicKey in self.map_dict.keys():
+            chrom, coord, pol = genomicKey
+            for i in self.scope:
+                if pol == 'F' and len(self.map_dict[chrom,
+                                                    coord+i-1,
+                                                    'R']) > 0:
+                    all_query_positions[chrom].append(coord)
+                    break
+        for chrom in all_query_positions:
+            all_query_positions[chrom] = sorted(
+                list(set(all_query_positions[chrom])))
+        return all_query_positions
+
+    def countpairs(self, uppers, lowers):
+        query_range = self.query_range
+        target_range = self.target_range
+        uppers = [size for size in uppers if size in query_range or size in
+                  target_range]
+        lowers = [size for size in lowers if size in query_range or size in
+                  target_range]
+        paired = []
+        for upread in uppers:
+            for downread in lowers:
+                if (upread in query_range and downread in target_range) or (
+                        upread in target_range and downread in query_range):
+                    paired.append(upread)
+                    lowers.remove(downread)
+                    break
+        return len(paired)
+
+    def compute_signature_pairs(self):
+        frequency_table = defaultdict(dict)
+        scope = self.scope
+        for chrom in self.chromosomes:
+            for overlap in scope:
+                frequency_table[chrom][overlap] = 0
+        for chrom in self.query_positions:
+            for coord in self.query_positions[chrom]:
+                for overlap in scope:
+                    uppers = self.map_dict[chrom, coord, 'F']
+                    lowers = self.map_dict[chrom, coord+overlap-1, 'R']
+                    frequency_table[chrom][overlap] += self.countpairs(uppers,
+                                                                       lowers)
+        # compute overlaps for all chromosomes merged
+        for overlap in scope:
+            accumulator = []
+            for chrom in frequency_table:
+                if chrom != 'all_chromosomes':
+                    accumulator.append(frequency_table[chrom][overlap])
+            frequency_table['all_chromosomes'][overlap] = sum(accumulator)
+        return self.stringify_table(frequency_table)
+
+    def signature_tables(self):
+        query_range = self.query_range
+        target_range = self.target_range
         Query_table = defaultdict(dict)
         Target_table = defaultdict(dict)
         for key in self.map_dict:
@@ -87,39 +154,9 @@
                         Target_table[key[0]].get(coordinate, 0) + 1
         return Query_table, Target_table
 
-    def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget,
-                            scope, zscore="no"):
-        Query_table, Target_table = self.signature_tables(minquery, maxquery,
-                                                     mintarget, maxtarget)
-        frequency_table = defaultdict(dict)
-        for chrom in self.chromosomes:
-            for overlap in scope:
-                frequency_table[chrom][overlap] = 0
-        for chrom in Query_table:
-            for coord in Query_table[chrom]:
-                for overlap in scope:
-                    frequency_table[chrom][overlap] += min(
-                        Query_table[chrom][coord],
-                        Target_table[chrom].get(-coord - overlap + 1, 0))
-        # since we want the number of pairs, not the number or paired reads
-        # to do: what in complex cases
-        # with query and target sizes partially overlap ?
-        for chrom in frequency_table:
-            for overlap in frequency_table[chrom]:
-                frequency_table[chrom][overlap] /= 2
-        # compute overlaps for all chromosomes merged
-        for overlap in scope:
-            accumulator = []
-            for chrom in frequency_table:
-                if chrom != 'all_chromosomes':
-                    accumulator.append(frequency_table[chrom][overlap])
-            frequency_table['all_chromosomes'][overlap] = sum(accumulator)
-        return self.stringify_table(frequency_table)
-
-    def compute_signature_h(self, minquery, maxquery, mintarget,
-                                   maxtarget, scope):
-        Query_table, Target_table = self.signature_tables(minquery, maxquery,
-                                                     mintarget, maxtarget)
+    def compute_signature_h(self):
+        scope = self.scope
+        Query_table, Target_table = self.signature_tables()
         frequency_table = defaultdict(dict)
         for chrom in self.chromosomes:
             for overlap in scope:
@@ -187,23 +224,8 @@
         return ''.join(tablestring)
 
 
-
-def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope,
-         output_h, output_z, genome_wide=False, zscore="no"):
-    H = open(output_h, 'w')
-    Z = open(output_z, 'w')
-    mapobj = Map(input)
-    scope = range(minscope, maxscope + 1)
-    Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget,
-            maxtarget, scope, zscore="no"))
-    H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget,
-            maxtarget, scope))
-    H.close()
-    Z.close()
-
-
 if __name__ == "__main__":
     args = Parser()
-    main(args.input, args.minquery, args.maxquery, args.mintarget,
-         args.maxtarget, args.minscope, args.maxscope, args.output_h,
-         args.output_z)
+    mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget,
+                 args.maxtarget, args.minscope, args.maxscope, args.output_h,
+                 args.output_z)