Mercurial > repos > iuc > ngsutils_bam_filter
comparison ngsutils/support/regions.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 class RangeMatch(object): | |
2 ''' | |
3 Simple genomic ranges. You can define chrom:start-end ranges, then ask if a | |
4 particular genomic coordinate maps to any of those ranges. This is less- | |
5 efficient than an R-Tree, but easier to code. | |
6 ''' | |
7 def __init__(self, name): | |
8 self.ranges = {} | |
9 self.name = name | |
10 | |
11 def add_range(self, chrom, strand, start, end): | |
12 if not chrom in self.ranges: | |
13 self.ranges[chrom] = {} | |
14 | |
15 bin = start / 100000 | |
16 if not bin in self.ranges[chrom]: | |
17 self.ranges[chrom][bin] = [] | |
18 self.ranges[chrom][bin].insert(0, (start, end, strand)) | |
19 | |
20 if (end / 100000) != bin: | |
21 for bin in xrange(bin + 1, (end / 100000) + 1): | |
22 if not bin in self.ranges[chrom]: | |
23 self.ranges[chrom][bin] = [] | |
24 self.ranges[chrom][bin].insert(0, (start, end, strand)) | |
25 | |
26 def get_tag(self, chrom, strand, pos, ignore_strand=False): | |
27 ''' | |
28 returns (region, is_reverse_orientation) | |
29 ''' | |
30 if not chrom in self.ranges: | |
31 return None, False | |
32 bin = pos / 100000 | |
33 if not bin in self.ranges[chrom]: | |
34 return None, False | |
35 for start, end, r_strand in self.ranges[chrom][bin]: | |
36 if pos >= start and pos <= end: | |
37 if ignore_strand or strand == r_strand: | |
38 return self.name, False | |
39 return self.name, True | |
40 return None, False | |
41 | |
42 | |
43 class RegionTagger(object): | |
44 def __init__(self, gtf, valid_chroms=None, only_first_fragment=True): | |
45 self.regions = [] | |
46 self.counts = {} | |
47 self.only_first_fragment = only_first_fragment | |
48 | |
49 coding = RangeMatch('coding') | |
50 exons = RangeMatch('other-exon') | |
51 utr_5 = RangeMatch('utr-5') | |
52 utr_3 = RangeMatch('utr-3') | |
53 introns = RangeMatch('intron') | |
54 promoters = RangeMatch('promoter') | |
55 | |
56 for gene in gtf.genes: | |
57 if valid_chroms and not gene.chrom in valid_chroms: | |
58 continue | |
59 if gene.strand == '+': | |
60 promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start) | |
61 else: | |
62 promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000) | |
63 | |
64 for transcript in gene.transcripts: | |
65 if transcript.has_cds: | |
66 for start, end in transcript.cds: | |
67 coding.add_range(gene.chrom, gene.strand, start, end) | |
68 | |
69 # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS | |
70 for s, e in transcript.utr_5: | |
71 utr_5.add_range(gene.chrom, gene.strand, s, e) | |
72 for s, e in transcript.utr_3: | |
73 utr_3.add_range(gene.chrom, gene.strand, s, e) | |
74 | |
75 last_end = None | |
76 for start, end in transcript.exons: | |
77 if last_end: | |
78 introns.add_range(gene.chrom, gene.strand, last_end, start) | |
79 exons.add_range(gene.chrom, gene.strand, start, end) | |
80 last_end = end | |
81 | |
82 | |
83 self.regions.append(coding) | |
84 self.regions.append(utr_5) | |
85 self.regions.append(utr_3) | |
86 self.regions.append(exons) | |
87 self.regions.append(introns) | |
88 self.regions.append(promoters) | |
89 | |
90 self.counts['coding'] = 0 | |
91 self.counts['coding-rev'] = 0 | |
92 self.counts['other-exon'] = 0 | |
93 self.counts['utr-5'] = 0 | |
94 self.counts['utr-3'] = 0 | |
95 self.counts['utr-5-rev'] = 0 | |
96 self.counts['utr-3-rev'] = 0 | |
97 self.counts['intron'] = 0 | |
98 self.counts['promoter'] = 0 | |
99 self.counts['other-exon-rev'] = 0 | |
100 self.counts['intron-rev'] = 0 | |
101 self.counts['promoter-rev'] = 0 | |
102 self.counts['junction'] = 0 | |
103 self.counts['intergenic'] = 0 | |
104 self.counts['mitochondrial'] = 0 | |
105 | |
106 def add_read(self, read, chrom): | |
107 if read.is_unmapped: | |
108 return | |
109 | |
110 if self.only_first_fragment and read.is_paired and not read.is_read1: | |
111 return | |
112 | |
113 tag = None | |
114 is_rev = False | |
115 | |
116 strand = '-' if read.is_reverse else '+' | |
117 | |
118 if chrom == 'chrM': | |
119 tag = 'mitochondrial' | |
120 | |
121 if not tag: | |
122 for op, length in read.cigar: | |
123 if op == 3: | |
124 tag = 'junction' | |
125 break | |
126 | |
127 if not tag: | |
128 for region in self.regions: | |
129 tag, is_rev = region.get_tag(chrom, strand, read.pos) | |
130 if tag: | |
131 break | |
132 | |
133 if not tag: | |
134 tag = 'intergenic' | |
135 | |
136 if tag: | |
137 if is_rev: | |
138 self.counts['%s-rev' % tag] += 1 | |
139 else: | |
140 self.counts[tag] += 1 | |
141 | |
142 return tag | |
143 | |
144 def tag_region(self, chrom, start, end, strand): | |
145 tag = None | |
146 is_rev = False | |
147 | |
148 if chrom == 'chrM' or chrom == 'M': | |
149 tag = 'mitochondrial' | |
150 | |
151 if not tag: | |
152 for region in self.regions: | |
153 tag, is_rev = region.get_tag(chrom, strand, start) | |
154 if is_rev: | |
155 tag = '%s-rev' % tag | |
156 | |
157 if start != end: | |
158 endtag, is_rev = region.get_tag(chrom, strand, end) | |
159 if is_rev: | |
160 endtag = '%s-rev' % endtag | |
161 | |
162 if tag and endtag and endtag != tag: | |
163 tag = '%s/%s' % (tag, endtag) | |
164 | |
165 if not tag: | |
166 tag = 'intergenic' |