Mercurial > repos > yufei-luo > s_mart
diff commons/core/parsing/FindRep.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
line wrap: on
line diff
# Reconstructed from the diff of commons/core/parsing/FindRep.py (new file).
import re
from xml.sax.handler import ContentHandler


class FindRep(ContentHandler):
    """SAX handler that writes one tab-separated line per accepted repeat.

    The handler reacts to these element names: ``window``,
    ``sequence-name``, ``repeat``, ``start``, ``end``, ``period``, ``unit``
    and ``score``.  Each time a ``repeat`` element closes, and its score is
    not greater than ``filter``, a line with the following columns is
    appended to the output file:

    1. running repeat counter,
    2. ``(<unit>)<copies>`` where ``<unit>`` is the unit text seen most often
       inside the repeat and ``<copies>`` is
       ``(abs(start - end) + 1) // period``,
    3. sequence name,
    4. start coordinate,
    5. end coordinate.

    When the sequence name looks like ``<digits> <label> {Cut} <offset>..``,
    the label alone is reported and both coordinates are shifted by
    ``offset - 1``.

    The code is kept compatible with both Python 2 and Python 3.
    """

    # Leading numeric id, any text, then the literal "{Cut}" marker.
    _CUT_NAME = re.compile(r"^[0-9]+.+\{Cut\}")
    # Leading numeric id followed by a single space.
    _LEADING_ID = re.compile(r"^[0-9]+ ")
    # Units longer than this are abbreviated in the output.
    _MAX_UNIT_LENGTH = 100

    def __init__(self, outfileName, filter=0, count=0):
        """Prepare the handler.

        outfileName -- path of the file opened for writing in startDocument().
        filter      -- repeats whose score is strictly greater than this
                       value are skipped.
        count       -- initial value of the running repeat counter; it is
                       incremented before each line is written.
        """
        ContentHandler.__init__(self)
        # One flag per element of interest; 1 while inside that element.
        self.inWindowContent = 0  # maintained but not consulted by this class
        self.inSeqNameContent = 0
        self.inRepContent = 0
        self.inStartContent = 0
        self.inEndContent = 0
        self.inPeriodContent = 0
        self.inUnitContent = 0
        self.inScoreContent = 0
        self.count = count
        self._outfileName = outfileName
        self.filter = filter
        # Text buffers and the per-repeat unit counter are created up front
        # so that elements appearing in an unexpected order cannot trigger
        # an AttributeError.
        self.seqname = ""
        self.start = ""
        self.end = ""
        self.period = ""
        self.unit = ""
        self.score = ""
        self.type = {}

    def startDocument(self):
        """Open the output file for writing (closed in endDocument())."""
        self._fileout = open(self._outfileName, "w")

    def startElement(self, name, attrs):
        """Raise the flag for ``name`` and reset the buffers it owns."""
        if name == "window":
            self.inWindowContent = 1
        elif name == "sequence-name":
            self.inSeqNameContent = 1
            self.seqname = ""
        elif name == "repeat":
            self.inRepContent = 1
            self.start = ""
            self.end = ""
            self.period = ""
            self.type = {}
        elif name == "start":
            self.inStartContent = 1
        elif name == "end":
            self.inEndContent = 1
        elif name == "period":
            self.inPeriodContent = 1
        elif name == "unit":
            self.inUnitContent = 1
            self.unit = ""
        elif name == "score":
            self.inScoreContent = 1
            self.score = ""

    def characters(self, ch):
        """Append ``ch`` to the buffer of the element currently open.

        Text is accumulated rather than assigned because the parser may
        deliver the content of a single element in several chunks.
        """
        if self.inSeqNameContent:
            self.seqname += ch
        elif self.inStartContent:
            self.start += ch
        elif self.inEndContent:
            self.end += ch
        elif self.inPeriodContent:
            self.period += ch
        elif self.inUnitContent:
            self.unit += ch
        elif self.inScoreContent:
            self.score += ch

    def endElement(self, name):
        """Lower the flag for ``name``; emit a line when a repeat closes."""
        if name == "window":
            self.inWindowContent = 0
        elif name == "sequence-name":
            self.inSeqNameContent = 0
        elif name == "repeat":
            self.inRepContent = 0
            self._writeRepeat()
        elif name == "start":
            self.inStartContent = 0
        elif name == "end":
            self.inEndContent = 0
        elif name == "period":
            self.inPeriodContent = 0
        elif name == "score":
            self.inScoreContent = 0
        elif name == "unit":
            self.inUnitContent = 0
            # dict.has_key() does not exist on Python 3; get() works on both.
            self.type[self.unit] = self.type.get(self.unit, 0) + 1

    def endDocument(self):
        """Close the output file opened in startDocument()."""
        self._fileout.close()

    def _writeRepeat(self):
        """Convert the buffered repeat fields and write its output line.

        Raises ValueError when start, end, period or score is not numeric.
        """
        start = int(self.start)
        end = int(self.end)
        period = int(self.period)
        score = float(self.score)
        if score > self.filter:
            return
        self.count += 1

        unit = self._dominantUnit()
        seqname, astart, aend = self._locate(start, end)
        if len(unit) > self._MAX_UNIT_LENGTH:
            # Keep the head and the tail of an over-long unit.
            unit = unit[:48] + "..." + unit[-51:]
        # Floor division keeps the integer result on both Python 2 and 3.
        copies = (abs(start - end) + 1) // period
        strout = "%d\t(%s)%d\t%s\t%d\t%d" % (
            self.count, unit, copies, seqname, astart, aend)
        self._fileout.write("%s\n" % strout)

    def _dominantUnit(self):
        """Return the most frequent unit of the current repeat.

        Ties go to the unit encountered first when iterating the counter;
        an empty string is returned when the repeat contained no unit.
        """
        if not self.type:
            return ""
        return max(self.type, key=self.type.get)

    def _locate(self, start, end):
        """Return (sequence name, start, end) for the output line.

        For a "{Cut}" sequence name the numeric id and the marker are
        stripped from the name, and the number following the marker (up to
        "..") is used as a 1-based offset added to both coordinates.
        """
        m = self._CUT_NAME.match(self.seqname)
        if m is None:
            return self.seqname, start, end
        # Drop the 5-character "{Cut}" marker, then the leading numeric id.
        label = self.seqname[m.start(0):m.end(0) - 5].rstrip()
        label = self._LEADING_ID.sub("", label).lstrip()
        tok = self.seqname[m.end(0):].split("..")
        offset = int(tok[0]) - 1
        return label, start + offset, end + offset