6
|
1 import re
|
|
2 from xml.sax.handler import ContentHandler
|
|
3
|
|
class FindRep(ContentHandler):
    """SAX content handler that writes one tab-separated line per repeat.

    The handler reacts to elements named ``window``, ``sequence-name``,
    ``repeat``, ``start``, ``end``, ``period``, ``unit`` and ``score``.
    Character data of the leaf elements is accumulated as text; when a
    ``repeat`` element closes, a record is written to the output file:

        count <TAB> (unit)copies <TAB> seqname <TAB> start <TAB> end

    where ``unit`` is the most frequent ``unit`` text seen inside the
    repeat and ``copies`` is ``(abs(start - end) + 1) // period``.

    Parameters:
        outfileName: path of the file to write; it is opened (mode "w")
            in ``startDocument`` and closed in ``endDocument``.
        filter: score threshold; repeats whose score is strictly greater
            than this value are skipped.
        count: initial value of the running record counter ``self.count``.
    """

    # Sequence names of the form "<digits> <name> {Cut}<offset>..<end>".
    # Raw string: "\{" is not a valid escape in an ordinary string literal.
    _CUT_NAME = re.compile(r"^[0-9]+.+\{Cut\}")

    def __init__(self, outfileName, filter=0, count=0):
        # ContentHandler may be a classic class on Python 2, so call the
        # base initialiser explicitly instead of through super().
        ContentHandler.__init__(self)

        # "Currently inside element X" flags, kept as 0/1 like the original.
        self.inWindowContent = 0
        self.inSeqNameContent = 0
        self.inRepContent = 0
        self.inStartContent = 0
        self.inEndContent = 0
        self.inPeriodContent = 0
        self.inUnitContent = 0
        self.inScoreContent = 0

        self.count = count
        self._outfileName = outfileName
        self.filter = filter
        self._fileout = None

        # Text buffers. Initialised here so that an element appearing in
        # an unexpected position cannot trigger an AttributeError.
        self.seqname = ""
        self.start = ""
        self.end = ""
        self.period = ""
        self.unit = ""
        self.score = ""
        # Maps unit text -> number of occurrences inside the current repeat.
        self.type = {}

    def startDocument(self):
        """Open the output file for writing."""
        self._fileout = open(self._outfileName, "w")

    def startElement(self, name, attrs):
        """Raise the flag for *name* and reset the buffers it owns."""
        if name == "window":
            self.inWindowContent = 1
        elif name == "sequence-name":
            self.inSeqNameContent = 1
            self.seqname = ""
        elif name == "repeat":
            self.inRepContent = 1
            self.start = ""
            self.end = ""
            self.period = ""
            self.type = {}
        elif name == "start":
            self.inStartContent = 1
        elif name == "end":
            self.inEndContent = 1
        elif name == "period":
            self.inPeriodContent = 1
        elif name == "unit":
            self.inUnitContent = 1
            self.unit = ""
        elif name == "score":
            self.inScoreContent = 1
            self.score = ""

    def characters(self, ch):
        """Append *ch* to the buffer of the element currently being read."""
        # The parser may deliver one text node in several chunks, hence +=.
        if self.inSeqNameContent:
            self.seqname += ch
        elif self.inStartContent:
            self.start += ch
        elif self.inEndContent:
            self.end += ch
        elif self.inPeriodContent:
            self.period += ch
        elif self.inUnitContent:
            self.unit += ch
        elif self.inScoreContent:
            self.score += ch

    def endElement(self, name):
        """Lower the flag for *name*; emit a record when a repeat closes.

        Raises:
            ValueError: if start/end/period/score text of a closing
                ``repeat`` cannot be converted to a number.
            ZeroDivisionError: if the period of an emitted repeat is 0.
        """
        if name == "window":
            self.inWindowContent = 0
        elif name == "sequence-name":
            self.inSeqNameContent = 0
        elif name == "repeat":
            self.inRepContent = 0
            self._writeRepeat()
        elif name == "start":
            self.inStartContent = 0
        elif name == "end":
            self.inEndContent = 0
        elif name == "period":
            self.inPeriodContent = 0
        elif name == "score":
            self.inScoreContent = 0
        elif name == "unit":
            self.inUnitContent = 0
            # dict.get works on Python 2 and 3 (has_key is Python 2 only).
            self.type[self.unit] = self.type.get(self.unit, 0) + 1

    def endDocument(self):
        """Close the output file if it was opened."""
        if self._fileout is not None:
            self._fileout.close()
            self._fileout = None

    def _dominantUnit(self):
        """Return the most frequent unit of the current repeat.

        The first unit (in dict iteration order) with the highest count
        wins. Returns "" when the repeat contained no unit element.
        """
        best_unit = ""
        best_n = 0
        for unit, n in self.type.items():
            if n > best_n:
                best_unit = unit
                best_n = n
        return best_unit

    def _resolveLocation(self, start, end):
        """Return (seqname, start, end) to report for the current repeat.

        For a "{Cut}" sequence name, the leading number and the "{Cut}"
        marker are stripped from the name, and the number that follows the
        marker (up to "..") is used as a 1-based offset added to both
        coordinates. Any other name is reported unchanged.
        """
        m = self._CUT_NAME.match(self.seqname)
        if m is None:
            return self.seqname, start, end
        # Drop the trailing "{Cut}" (5 characters) from the matched prefix.
        seqname = self.seqname[m.start(0):m.end(0) - 5].rstrip()
        seqname = re.sub("^[0-9]+ ", "", seqname).lstrip()
        tok = self.seqname[m.end(0):].split("..")
        offset = int(tok[0]) - 1
        return seqname, start + offset, end + offset

    def _writeRepeat(self):
        """Write the record for the repeat that has just been closed."""
        start = int(self.start)
        end = int(self.end)
        period = int(self.period)
        score = float(self.score)
        if score > self.filter:
            return
        self.count += 1

        unit = self._dominantUnit()
        seqname, astart, aend = self._resolveLocation(start, end)

        # Abbreviate very long units: first 48 chars, "...", last 51 chars.
        if len(unit) > 100:
            unit = unit[:48] + "..." + unit[-51:]

        # Floor division keeps the integer copy count on Python 2 and 3.
        copies = (abs(start - end) + 1) // period
        strout = "%d\t(%s)%d\t%s\t%d\t%d" % (
            self.count, unit, copies, seqname, astart, aend)
        self._fileout.write("%s\n" % (strout))