Mercurial > repos > artbio > small_rna_signatures
comparison overlapping_reads.py @ 3:4d9682bd3a6b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 96ed5824190aff281cc3aa47dc60fc66aac41db3
author | artbio |
---|---|
date | Sat, 02 Sep 2017 06:35:15 -0400 |
parents | 6f1378738798 |
children | 20d28cfdeefe |
comparison
equal
deleted
inserted
replaced
2:320e06bf99b9 | 3:4d9682bd3a6b |
---|---|
34 | 34 |
35 def __init__(self, bam_file): | 35 def __init__(self, bam_file): |
36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | 36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') |
37 self.chromosomes = dict(zip(self.bam_object.references, | 37 self.chromosomes = dict(zip(self.bam_object.references, |
38 self.bam_object.lengths)) | 38 self.bam_object.lengths)) |
39 self.map_dict = self.create_map(self.bam_object) | 39 self.all_query_positions = self.query_positions(self.bam_object) |
40 self.readdic = self.make_readdic(self.bam_object) | |
40 | 41 |
41 def create_map(self, bam_object): | 42 def make_readdic(self, bam_object): |
42 ''' | 43 readdic = defaultdict(int) |
43 Returns a map_dictionary {(chromosome,read_position,polarity): | 44 for read in bam_object.fetch(): |
44 [read_length, ...]} | 45 readdic[read.query_sequence] += 1 |
45 ''' | 46 return readdic |
46 map_dictionary = defaultdict(list) | 47 |
47 # get empty value for start and end of each chromosome | 48 def query_positions(self, bam_object): |
48 for chrom in self.chromosomes: | 49 all_query_positions = defaultdict(list) |
49 map_dictionary[(chrom, 1, 'F')] = [] | |
50 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] | |
51 for chrom in self.chromosomes: | 50 for chrom in self.chromosomes: |
52 for read in bam_object.fetch(chrom): | 51 for read in bam_object.fetch(chrom): |
53 positions = read.positions # a list of covered positions | 52 if not read.is_reverse: |
54 if read.is_reverse: | 53 all_query_positions[chrom].append( |
55 map_dictionary[(chrom, positions[-1]+1, | 54 read.get_reference_positions(full_length=True)[0]) |
56 'R')].append(read.query_alignment_length) | |
57 else: | 55 else: |
58 map_dictionary[(chrom, positions[0]+1, | 56 all_query_positions[chrom].append( |
59 'F')].append(read.query_alignment_length) | 57 read.get_reference_positions(full_length=True)[-1]) |
60 return map_dictionary | 58 all_query_positions[chrom] = sorted( |
59 list(set(all_query_positions[chrom]))) | |
60 return all_query_positions | |
61 | 61 |
62 def signature_tables(self, minquery, maxquery, mintarget, maxtarget): | 62 def direct_pairing(self, minquery, maxquery, mintarget, maxtarget, |
63 file, overlap=10): | |
64 F = open(file, 'w') | |
63 query_range = range(minquery, maxquery + 1) | 65 query_range = range(minquery, maxquery + 1) |
64 target_range = range(mintarget, maxtarget + 1) | 66 target_range = range(mintarget, maxtarget + 1) |
65 Query_table = defaultdict(dict) | 67 stringresult = [] |
66 Target_table = defaultdict(dict) | 68 for chrom in sorted(self.chromosomes): |
67 for key in self.map_dict: | 69 for pos in (self.all_query_positions[chrom]): |
68 for size in self.map_dict[key]: | 70 iterreads_1 = self.bam_object.fetch(chrom, |
69 if size in query_range or size in target_range: | 71 start=pos, end=pos+overlap-1) |
70 if key[2] == 'F': | 72 iterreads_2 = self.bam_object.fetch(chrom, |
71 coordinate = key[1] | 73 start=pos, end=pos+overlap-1) |
72 else: | 74 iterreads_3 = self.bam_object.fetch(chrom, |
73 coordinate = -key[1] | 75 start=pos, end=pos+overlap-1) |
74 if size in query_range: | 76 iterreads_4 = self.bam_object.fetch(chrom, |
75 Query_table[key[0]][coordinate] = Query_table[key[0]].get( | 77 start=pos, end=pos+overlap-1) |
76 coordinate, 0) + 1 | 78 # 1 |
77 if size in target_range: | 79 for queryread in iterreads_1: |
78 Target_table[key[0]][coordinate] = \ | 80 if queryread.get_reference_positions( |
79 Target_table[key[0]].get(coordinate, 0) + 1 | 81 full_length=True)[0] == pos and \ |
80 return Query_table, Target_table | 82 queryread.query_alignment_length in query_range \ |
81 | 83 and not queryread.is_reverse: |
82 def search_overlaps(self, minquery, maxquery, mintarget, maxtarget, | 84 for targetread in iterreads_2: |
83 overlap=10): | 85 if (targetread. |
84 Query_table, Target_table = self.signature_tables(minquery, maxquery, | 86 get_reference_positions(full_length=True)[-1] |
85 mintarget, maxtarget) | 87 == queryread.get_reference_positions( |
86 overlap_groups = defaultdict(list) | 88 full_length=True)[overlap-1] and |
87 for chrom in Query_table: | 89 targetread.query_alignment_length in |
88 for coord in Query_table[chrom]: | 90 target_range and targetread.is_reverse): |
89 if Target_table[chrom].get(-coord - overlap + 1, 0): | 91 targetreadseq = self.revcomp( |
90 overlap_groups[chrom].append(coord) | 92 targetread.query_sequence) |
91 return overlap_groups | 93 stringresult.append( |
92 | 94 '>%s|%s|%s|%s|n=%s\n%s\n' % |
93 def feed_overlaps(self, overlap_groups, minquery, output, overlap=10): | 95 (chrom, queryread.get_reference_positions( |
94 F = open(output, 'w') | 96 full_length=True)[0]+1, |
95 for chrom in sorted(overlap_groups): | 97 'F', queryread.query_alignment_length, |
96 for pos in sorted(overlap_groups[chrom]): | 98 self.readdic[queryread.query_sequence], |
97 if pos > 0: # read are forward | 99 queryread.query_sequence)) |
98 reads = self.bam_object.fetch(chrom, start=pos-1, | 100 stringresult.append( |
99 end=pos-1+overlap-1) | 101 '>%s|%s|%s|%s|n=%s\n%s\n' % |
100 for read in reads: | 102 (chrom, targetread.get_reference_positions( |
101 positions = read.positions | 103 full_length=True)[0]+1, |
102 if pos-1 == positions[0] and \ | 104 'R', targetread.query_alignment_length, |
103 read.query_alignment_length >= minquery: | 105 self.readdic[targetread.query_sequence], |
104 F.write('>%s|%s|%s|%s\n%s\n' % ( | 106 targetreadseq)) |
105 chrom, pos, 'F', | 107 # 2 |
106 read.query_alignment_length, | 108 for queryread in iterreads_3: |
107 read.query_sequence)) | 109 if queryread.get_reference_positions( |
108 else: # reads are reverse | 110 full_length=True)[-1] == pos+overlap-1 and \ |
109 reads = self.bam_object.fetch(chrom, | 111 queryread.query_alignment_length in query_range \ |
110 start=-pos-1-overlap+1, | 112 and queryread.is_reverse: |
111 end=-pos-1) | 113 for targetread in iterreads_4: |
112 for read in reads: | 114 if (targetread. |
113 positions = read.positions | 115 get_reference_positions(full_length=True)[0] |
114 if -pos-1 == positions[-1] and \ | 116 == pos and targetread.query_alignment_length |
115 read.query_alignment_length >= minquery: | 117 in target_range and not |
116 readseq = self.revcomp(read.query_sequence) | 118 targetread.is_reverse): |
117 readsize = read.query_alignment_length | 119 queryreadseq = self.revcomp( |
118 F.write('>%s|%s|%s|%s\n%s\n' % (chrom, | 120 queryread.query_sequence) |
119 positions[0] + 1, | 121 targetreadseq = targetread.query_sequence |
120 'R', readsize, readseq)) | 122 stringresult.append( |
121 F.close() | 123 '>%s|%s|%s|%s|n=%s\n%s\n' % |
122 return | 124 (chrom, queryread.get_reference_positions( |
125 full_length=True)[0]+1, 'R', | |
126 queryread.query_alignment_length, | |
127 self.readdic[queryread.query_sequence], | |
128 queryreadseq)) | |
129 stringresult.append( | |
130 '>%s|%s|%s|%s|n=%s\n%s\n' % | |
131 (chrom, targetread.get_reference_positions( | |
132 full_length=True)[0]+1, | |
133 'F', targetread.query_alignment_length, | |
134 self.readdic[targetread.query_sequence], | |
135 targetreadseq)) | |
136 stringresult = sorted(set(stringresult), | |
137 key=lambda x: stringresult.index(x)) | |
138 F.write(''.join(stringresult)) | |
123 | 139 |
124 def revcomp(self, sequence): | 140 def revcomp(self, sequence): |
125 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} | 141 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} |
126 revseq = sequence[::-1] | 142 revseq = sequence[::-1] |
127 return "".join([antidict[i] for i in revseq]) | 143 return "".join([antidict[i] for i in revseq]) |
128 | 144 |
129 | 145 |
130 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): | 146 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): |
131 mapobj = Map(input) | 147 mapobj = Map(input) |
132 mapobj.feed_overlaps(mapobj.search_overlaps(minquery, maxquery, | 148 mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget, |
133 mintarget, maxtarget, | 149 output, overlap) |
134 overlap), minquery, output) | |
135 | 150 |
136 | 151 |
137 if __name__ == "__main__": | 152 if __name__ == "__main__": |
138 args = Parser() | 153 args = Parser() |
139 main(args.input, args.minquery, args.maxquery, args.mintarget, | 154 main(args.input, args.minquery, args.maxquery, args.mintarget, |
140 args.maxtarget, args.output) | 155 args.maxtarget, args.output, args.overlap) |