Mercurial > repos > artbio > small_rna_signatures
comparison overlapping_reads.py @ 1:6f1378738798 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 6133bb114c76a795fa12a4a11edb1a8b80fd104d
author | artbio |
---|---|
date | Tue, 29 Aug 2017 20:02:15 -0400 |
parents | |
children | 4d9682bd3a6b |
comparison
equal
deleted
inserted
replaced
0:a35e6f9c1d34 | 1:6f1378738798 |
---|---|
1 import argparse | |
2 from collections import defaultdict | |
3 | |
4 import pysam | |
5 | |
6 | |
7 def Parser(): | |
8 the_parser = argparse.ArgumentParser() | |
9 the_parser.add_argument( | |
10 '--input', action="store", type=str, help="bam alignment file") | |
11 the_parser.add_argument( | |
12 '--minquery', type=int, | |
13 help="Minimum readsize of query reads (nt) - must be an integer") | |
14 the_parser.add_argument( | |
15 '--maxquery', type=int, | |
16 help="Maximum readsize of query reads (nt) - must be an integer") | |
17 the_parser.add_argument( | |
18 '--mintarget', type=int, | |
19 help="Minimum readsize of target reads (nt) - must be an integer") | |
20 the_parser.add_argument( | |
21 '--maxtarget', type=int, | |
22 help="Maximum readsize of target reads (nt) - must be an integer") | |
23 the_parser.add_argument( | |
24 '--overlap', type=int, | |
25 help="Overlap analyzed (nt) - must be an integer") | |
26 the_parser.add_argument( | |
27 '--output', action="store", type=str, | |
28 help="Pairable sequences") | |
29 args = the_parser.parse_args() | |
30 return args | |
31 | |
32 | |
33 class Map: | |
34 | |
35 def __init__(self, bam_file): | |
36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | |
37 self.chromosomes = dict(zip(self.bam_object.references, | |
38 self.bam_object.lengths)) | |
39 self.map_dict = self.create_map(self.bam_object) | |
40 | |
41 def create_map(self, bam_object): | |
42 ''' | |
43 Returns a map_dictionary {(chromosome,read_position,polarity): | |
44 [read_length, ...]} | |
45 ''' | |
46 map_dictionary = defaultdict(list) | |
47 # get empty value for start and end of each chromosome | |
48 for chrom in self.chromosomes: | |
49 map_dictionary[(chrom, 1, 'F')] = [] | |
50 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] | |
51 for chrom in self.chromosomes: | |
52 for read in bam_object.fetch(chrom): | |
53 positions = read.positions # a list of covered positions | |
54 if read.is_reverse: | |
55 map_dictionary[(chrom, positions[-1]+1, | |
56 'R')].append(read.query_alignment_length) | |
57 else: | |
58 map_dictionary[(chrom, positions[0]+1, | |
59 'F')].append(read.query_alignment_length) | |
60 return map_dictionary | |
61 | |
62 def signature_tables(self, minquery, maxquery, mintarget, maxtarget): | |
63 query_range = range(minquery, maxquery + 1) | |
64 target_range = range(mintarget, maxtarget + 1) | |
65 Query_table = defaultdict(dict) | |
66 Target_table = defaultdict(dict) | |
67 for key in self.map_dict: | |
68 for size in self.map_dict[key]: | |
69 if size in query_range or size in target_range: | |
70 if key[2] == 'F': | |
71 coordinate = key[1] | |
72 else: | |
73 coordinate = -key[1] | |
74 if size in query_range: | |
75 Query_table[key[0]][coordinate] = Query_table[key[0]].get( | |
76 coordinate, 0) + 1 | |
77 if size in target_range: | |
78 Target_table[key[0]][coordinate] = \ | |
79 Target_table[key[0]].get(coordinate, 0) + 1 | |
80 return Query_table, Target_table | |
81 | |
82 def search_overlaps(self, minquery, maxquery, mintarget, maxtarget, | |
83 overlap=10): | |
84 Query_table, Target_table = self.signature_tables(minquery, maxquery, | |
85 mintarget, maxtarget) | |
86 overlap_groups = defaultdict(list) | |
87 for chrom in Query_table: | |
88 for coord in Query_table[chrom]: | |
89 if Target_table[chrom].get(-coord - overlap + 1, 0): | |
90 overlap_groups[chrom].append(coord) | |
91 return overlap_groups | |
92 | |
93 def feed_overlaps(self, overlap_groups, minquery, output, overlap=10): | |
94 F = open(output, 'w') | |
95 for chrom in sorted(overlap_groups): | |
96 for pos in sorted(overlap_groups[chrom]): | |
97 if pos > 0: # read are forward | |
98 reads = self.bam_object.fetch(chrom, start=pos-1, | |
99 end=pos-1+overlap-1) | |
100 for read in reads: | |
101 positions = read.positions | |
102 if pos-1 == positions[0] and \ | |
103 read.query_alignment_length >= minquery: | |
104 F.write('>%s|%s|%s|%s\n%s\n' % ( | |
105 chrom, pos, 'F', | |
106 read.query_alignment_length, | |
107 read.query_sequence)) | |
108 else: # reads are reverse | |
109 reads = self.bam_object.fetch(chrom, | |
110 start=-pos-1-overlap+1, | |
111 end=-pos-1) | |
112 for read in reads: | |
113 positions = read.positions | |
114 if -pos-1 == positions[-1] and \ | |
115 read.query_alignment_length >= minquery: | |
116 readseq = self.revcomp(read.query_sequence) | |
117 readsize = read.query_alignment_length | |
118 F.write('>%s|%s|%s|%s\n%s\n' % (chrom, | |
119 positions[0] + 1, | |
120 'R', readsize, readseq)) | |
121 F.close() | |
122 return | |
123 | |
124 def revcomp(self, sequence): | |
125 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} | |
126 revseq = sequence[::-1] | |
127 return "".join([antidict[i] for i in revseq]) | |
128 | |
129 | |
130 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): | |
131 mapobj = Map(input) | |
132 mapobj.feed_overlaps(mapobj.search_overlaps(minquery, maxquery, | |
133 mintarget, maxtarget, | |
134 overlap), minquery, output) | |
135 | |
136 | |
137 if __name__ == "__main__": | |
138 args = Parser() | |
139 main(args.input, args.minquery, args.maxquery, args.mintarget, | |
140 args.maxtarget, args.output) |