comparison overlapping_reads.py @ 3:4d9682bd3a6b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 96ed5824190aff281cc3aa47dc60fc66aac41db3
author artbio
date Sat, 02 Sep 2017 06:35:15 -0400
parents 6f1378738798
children 20d28cfdeefe
comparison
equal deleted inserted replaced
2:320e06bf99b9 3:4d9682bd3a6b
34 34
35 def __init__(self, bam_file): 35 def __init__(self, bam_file):
36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') 36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
37 self.chromosomes = dict(zip(self.bam_object.references, 37 self.chromosomes = dict(zip(self.bam_object.references,
38 self.bam_object.lengths)) 38 self.bam_object.lengths))
39 self.map_dict = self.create_map(self.bam_object) 39 self.all_query_positions = self.query_positions(self.bam_object)
40 self.readdic = self.make_readdic(self.bam_object)
40 41
41 def create_map(self, bam_object): 42 def make_readdic(self, bam_object):
42 ''' 43 readdic = defaultdict(int)
43 Returns a map_dictionary {(chromosome,read_position,polarity): 44 for read in bam_object.fetch():
44 [read_length, ...]} 45 readdic[read.query_sequence] += 1
45 ''' 46 return readdic
46 map_dictionary = defaultdict(list) 47
47 # get empty value for start and end of each chromosome 48 def query_positions(self, bam_object):
48 for chrom in self.chromosomes: 49 all_query_positions = defaultdict(list)
49 map_dictionary[(chrom, 1, 'F')] = []
50 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
51 for chrom in self.chromosomes: 50 for chrom in self.chromosomes:
52 for read in bam_object.fetch(chrom): 51 for read in bam_object.fetch(chrom):
53 positions = read.positions # a list of covered positions 52 if not read.is_reverse:
54 if read.is_reverse: 53 all_query_positions[chrom].append(
55 map_dictionary[(chrom, positions[-1]+1, 54 read.get_reference_positions(full_length=True)[0])
56 'R')].append(read.query_alignment_length)
57 else: 55 else:
58 map_dictionary[(chrom, positions[0]+1, 56 all_query_positions[chrom].append(
59 'F')].append(read.query_alignment_length) 57 read.get_reference_positions(full_length=True)[-1])
60 return map_dictionary 58 all_query_positions[chrom] = sorted(
59 list(set(all_query_positions[chrom])))
60 return all_query_positions
61 61
62 def signature_tables(self, minquery, maxquery, mintarget, maxtarget): 62 def direct_pairing(self, minquery, maxquery, mintarget, maxtarget,
63 file, overlap=10):
64 F = open(file, 'w')
63 query_range = range(minquery, maxquery + 1) 65 query_range = range(minquery, maxquery + 1)
64 target_range = range(mintarget, maxtarget + 1) 66 target_range = range(mintarget, maxtarget + 1)
65 Query_table = defaultdict(dict) 67 stringresult = []
66 Target_table = defaultdict(dict) 68 for chrom in sorted(self.chromosomes):
67 for key in self.map_dict: 69 for pos in (self.all_query_positions[chrom]):
68 for size in self.map_dict[key]: 70 iterreads_1 = self.bam_object.fetch(chrom,
69 if size in query_range or size in target_range: 71 start=pos, end=pos+overlap-1)
70 if key[2] == 'F': 72 iterreads_2 = self.bam_object.fetch(chrom,
71 coordinate = key[1] 73 start=pos, end=pos+overlap-1)
72 else: 74 iterreads_3 = self.bam_object.fetch(chrom,
73 coordinate = -key[1] 75 start=pos, end=pos+overlap-1)
74 if size in query_range: 76 iterreads_4 = self.bam_object.fetch(chrom,
75 Query_table[key[0]][coordinate] = Query_table[key[0]].get( 77 start=pos, end=pos+overlap-1)
76 coordinate, 0) + 1 78 # 1
77 if size in target_range: 79 for queryread in iterreads_1:
78 Target_table[key[0]][coordinate] = \ 80 if queryread.get_reference_positions(
79 Target_table[key[0]].get(coordinate, 0) + 1 81 full_length=True)[0] == pos and \
80 return Query_table, Target_table 82 queryread.query_alignment_length in query_range \
81 83 and not queryread.is_reverse:
82 def search_overlaps(self, minquery, maxquery, mintarget, maxtarget, 84 for targetread in iterreads_2:
83 overlap=10): 85 if (targetread.
84 Query_table, Target_table = self.signature_tables(minquery, maxquery, 86 get_reference_positions(full_length=True)[-1]
85 mintarget, maxtarget) 87 == queryread.get_reference_positions(
86 overlap_groups = defaultdict(list) 88 full_length=True)[overlap-1] and
87 for chrom in Query_table: 89 targetread.query_alignment_length in
88 for coord in Query_table[chrom]: 90 target_range and targetread.is_reverse):
89 if Target_table[chrom].get(-coord - overlap + 1, 0): 91 targetreadseq = self.revcomp(
90 overlap_groups[chrom].append(coord) 92 targetread.query_sequence)
91 return overlap_groups 93 stringresult.append(
92 94 '>%s|%s|%s|%s|n=%s\n%s\n' %
93 def feed_overlaps(self, overlap_groups, minquery, output, overlap=10): 95 (chrom, queryread.get_reference_positions(
94 F = open(output, 'w') 96 full_length=True)[0]+1,
95 for chrom in sorted(overlap_groups): 97 'F', queryread.query_alignment_length,
96 for pos in sorted(overlap_groups[chrom]): 98 self.readdic[queryread.query_sequence],
97 if pos > 0: # read are forward 99 queryread.query_sequence))
98 reads = self.bam_object.fetch(chrom, start=pos-1, 100 stringresult.append(
99 end=pos-1+overlap-1) 101 '>%s|%s|%s|%s|n=%s\n%s\n' %
100 for read in reads: 102 (chrom, targetread.get_reference_positions(
101 positions = read.positions 103 full_length=True)[0]+1,
102 if pos-1 == positions[0] and \ 104 'R', targetread.query_alignment_length,
103 read.query_alignment_length >= minquery: 105 self.readdic[targetread.query_sequence],
104 F.write('>%s|%s|%s|%s\n%s\n' % ( 106 targetreadseq))
105 chrom, pos, 'F', 107 # 2
106 read.query_alignment_length, 108 for queryread in iterreads_3:
107 read.query_sequence)) 109 if queryread.get_reference_positions(
108 else: # reads are reverse 110 full_length=True)[-1] == pos+overlap-1 and \
109 reads = self.bam_object.fetch(chrom, 111 queryread.query_alignment_length in query_range \
110 start=-pos-1-overlap+1, 112 and queryread.is_reverse:
111 end=-pos-1) 113 for targetread in iterreads_4:
112 for read in reads: 114 if (targetread.
113 positions = read.positions 115 get_reference_positions(full_length=True)[0]
114 if -pos-1 == positions[-1] and \ 116 == pos and targetread.query_alignment_length
115 read.query_alignment_length >= minquery: 117 in target_range and not
116 readseq = self.revcomp(read.query_sequence) 118 targetread.is_reverse):
117 readsize = read.query_alignment_length 119 queryreadseq = self.revcomp(
118 F.write('>%s|%s|%s|%s\n%s\n' % (chrom, 120 queryread.query_sequence)
119 positions[0] + 1, 121 targetreadseq = targetread.query_sequence
120 'R', readsize, readseq)) 122 stringresult.append(
121 F.close() 123 '>%s|%s|%s|%s|n=%s\n%s\n' %
122 return 124 (chrom, queryread.get_reference_positions(
125 full_length=True)[0]+1, 'R',
126 queryread.query_alignment_length,
127 self.readdic[queryread.query_sequence],
128 queryreadseq))
129 stringresult.append(
130 '>%s|%s|%s|%s|n=%s\n%s\n' %
131 (chrom, targetread.get_reference_positions(
132 full_length=True)[0]+1,
133 'F', targetread.query_alignment_length,
134 self.readdic[targetread.query_sequence],
135 targetreadseq))
136 stringresult = sorted(set(stringresult),
137 key=lambda x: stringresult.index(x))
138 F.write(''.join(stringresult))
123 139
124 def revcomp(self, sequence): 140 def revcomp(self, sequence):
125 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} 141 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
126 revseq = sequence[::-1] 142 revseq = sequence[::-1]
127 return "".join([antidict[i] for i in revseq]) 143 return "".join([antidict[i] for i in revseq])
128 144
129 145
130 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): 146 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10):
131 mapobj = Map(input) 147 mapobj = Map(input)
132 mapobj.feed_overlaps(mapobj.search_overlaps(minquery, maxquery, 148 mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget,
133 mintarget, maxtarget, 149 output, overlap)
134 overlap), minquery, output)
135 150
136 151
137 if __name__ == "__main__": 152 if __name__ == "__main__":
138 args = Parser() 153 args = Parser()
139 main(args.input, args.minquery, args.maxquery, args.mintarget, 154 main(args.input, args.minquery, args.maxquery, args.mintarget,
140 args.maxtarget, args.output) 155 args.maxtarget, args.output, args.overlap)