comparison ngsutils/support/regions.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
comparison
equal deleted inserted replaced
-1:000000000000 0:4e4e4093d65d
1 class RangeMatch(object):
2 '''
3 Simple genomic ranges. You can define chrom:start-end ranges, then ask if a
4 particular genomic coordinate maps to any of those ranges. This is less-
5 efficient than an R-Tree, but easier to code.
6 '''
7 def __init__(self, name):
8 self.ranges = {}
9 self.name = name
10
11 def add_range(self, chrom, strand, start, end):
12 if not chrom in self.ranges:
13 self.ranges[chrom] = {}
14
15 bin = start / 100000
16 if not bin in self.ranges[chrom]:
17 self.ranges[chrom][bin] = []
18 self.ranges[chrom][bin].insert(0, (start, end, strand))
19
20 if (end / 100000) != bin:
21 for bin in xrange(bin + 1, (end / 100000) + 1):
22 if not bin in self.ranges[chrom]:
23 self.ranges[chrom][bin] = []
24 self.ranges[chrom][bin].insert(0, (start, end, strand))
25
26 def get_tag(self, chrom, strand, pos, ignore_strand=False):
27 '''
28 returns (region, is_reverse_orientation)
29 '''
30 if not chrom in self.ranges:
31 return None, False
32 bin = pos / 100000
33 if not bin in self.ranges[chrom]:
34 return None, False
35 for start, end, r_strand in self.ranges[chrom][bin]:
36 if pos >= start and pos <= end:
37 if ignore_strand or strand == r_strand:
38 return self.name, False
39 return self.name, True
40 return None, False
41
42
43 class RegionTagger(object):
44 def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
45 self.regions = []
46 self.counts = {}
47 self.only_first_fragment = only_first_fragment
48
49 coding = RangeMatch('coding')
50 exons = RangeMatch('other-exon')
51 utr_5 = RangeMatch('utr-5')
52 utr_3 = RangeMatch('utr-3')
53 introns = RangeMatch('intron')
54 promoters = RangeMatch('promoter')
55
56 for gene in gtf.genes:
57 if valid_chroms and not gene.chrom in valid_chroms:
58 continue
59 if gene.strand == '+':
60 promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
61 else:
62 promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
63
64 for transcript in gene.transcripts:
65 if transcript.has_cds:
66 for start, end in transcript.cds:
67 coding.add_range(gene.chrom, gene.strand, start, end)
68
69 # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
70 for s, e in transcript.utr_5:
71 utr_5.add_range(gene.chrom, gene.strand, s, e)
72 for s, e in transcript.utr_3:
73 utr_3.add_range(gene.chrom, gene.strand, s, e)
74
75 last_end = None
76 for start, end in transcript.exons:
77 if last_end:
78 introns.add_range(gene.chrom, gene.strand, last_end, start)
79 exons.add_range(gene.chrom, gene.strand, start, end)
80 last_end = end
81
82
83 self.regions.append(coding)
84 self.regions.append(utr_5)
85 self.regions.append(utr_3)
86 self.regions.append(exons)
87 self.regions.append(introns)
88 self.regions.append(promoters)
89
90 self.counts['coding'] = 0
91 self.counts['coding-rev'] = 0
92 self.counts['other-exon'] = 0
93 self.counts['utr-5'] = 0
94 self.counts['utr-3'] = 0
95 self.counts['utr-5-rev'] = 0
96 self.counts['utr-3-rev'] = 0
97 self.counts['intron'] = 0
98 self.counts['promoter'] = 0
99 self.counts['other-exon-rev'] = 0
100 self.counts['intron-rev'] = 0
101 self.counts['promoter-rev'] = 0
102 self.counts['junction'] = 0
103 self.counts['intergenic'] = 0
104 self.counts['mitochondrial'] = 0
105
106 def add_read(self, read, chrom):
107 if read.is_unmapped:
108 return
109
110 if self.only_first_fragment and read.is_paired and not read.is_read1:
111 return
112
113 tag = None
114 is_rev = False
115
116 strand = '-' if read.is_reverse else '+'
117
118 if chrom == 'chrM':
119 tag = 'mitochondrial'
120
121 if not tag:
122 for op, length in read.cigar:
123 if op == 3:
124 tag = 'junction'
125 break
126
127 if not tag:
128 for region in self.regions:
129 tag, is_rev = region.get_tag(chrom, strand, read.pos)
130 if tag:
131 break
132
133 if not tag:
134 tag = 'intergenic'
135
136 if tag:
137 if is_rev:
138 self.counts['%s-rev' % tag] += 1
139 else:
140 self.counts[tag] += 1
141
142 return tag
143
144 def tag_region(self, chrom, start, end, strand):
145 tag = None
146 is_rev = False
147
148 if chrom == 'chrM' or chrom == 'M':
149 tag = 'mitochondrial'
150
151 if not tag:
152 for region in self.regions:
153 tag, is_rev = region.get_tag(chrom, strand, start)
154 if is_rev:
155 tag = '%s-rev' % tag
156
157 if start != end:
158 endtag, is_rev = region.get_tag(chrom, strand, end)
159 if is_rev:
160 endtag = '%s-rev' % endtag
161
162 if tag and endtag and endtag != tag:
163 tag = '%s/%s' % (tag, endtag)
164
165 if not tag:
166 tag = 'intergenic'