diff commons/core/parsing/FindRep.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/parsing/FindRep.py	Tue Apr 30 15:02:29 2013 -0400
@@ -0,0 +1,113 @@
+import re
+from xml.sax.handler import ContentHandler
+
+class FindRep( ContentHandler ):
+    def __init__(self,outfileName, filter=0,count=0):
+        self.inWindowContent = 0
+        self.inSeqNameContent = 0
+        self.inStartContent = 0
+        self.inEndContent = 0
+        self.inPeriodContent = 0
+        self.inUnitContent = 0
+        self.inScoreContent = 0
+        self.count = count
+        self._outfileName = outfileName
+        self.filter=filter
+    
+    def startDocument(self):
+        self._fileout = open(self._outfileName,"w")
+        
+    def startElement(self,name,attrs):
+        if name=="window":
+            self.inWindowContent=1
+        elif name=="sequence-name":
+            self.inSeqNameContent=1
+            self.seqname=""
+        elif name=="repeat":
+            self.inRepContent=1
+            self.start=""
+            self.end=""
+            self.period=""
+            self.type={}
+        elif name=="start":
+            self.inStartContent=1
+        elif name=="end":
+            self.inEndContent=1
+        elif name=="period":
+            self.inPeriodContent=1
+        elif name=="unit":
+            self.inUnitContent=1
+            self.unit=""
+        elif name=="score":
+            self.inScoreContent=1
+            self.score=""
+
+    def characters(self,ch):
+        if self.inSeqNameContent:
+            self.seqname+=ch
+        elif self.inStartContent:
+            self.start+=ch
+        elif self.inEndContent:
+            self.end+=ch
+        elif self.inPeriodContent:
+            self.period+=ch            
+        elif self.inUnitContent:
+            self.unit+=ch            
+        elif self.inScoreContent:
+            self.score+=ch            
+
+    def endElement(self,name):
+        if name=="window":
+            self.inWindowContent=0
+        elif name=="sequence-name":
+            self.inSeqNameContent=0
+        elif name=="repeat":
+            self.inRepContent=0
+            start=int(self.start)
+            end=int(self.end)
+            period=int(self.period)
+            score=float(self.score)
+            if score>self.filter:
+                return
+            max = 0
+            self.count+=1
+            for k,n in self.type.items():
+                if n>max:
+                    max = n
+                    k_max = k
+
+            m=re.match("^[0-9]+.+\{Cut\}",self.seqname)
+            if m!=None:
+                seqname=self.seqname[m.start(0):m.end(0)-5].rstrip()
+                seqname=re.sub("^[0-9]+ ","",seqname).lstrip()
+                tok=self.seqname[m.end(0):].split("..")
+                astart=start+int(tok[0])-1
+                aend=end+int(tok[0])-1
+            else:
+                astart=start
+                aend=end
+                seqname=self.seqname
+            if len(k_max) > 100:
+                k_max=k_max[:48]+"..."+k_max[-51:]
+            strout="%d\t(%s)%d\t%s\t%d\t%d"%\
+                               (self.count,k_max,(abs(start-end)+1)/period,\
+                                seqname,astart,aend)
+            self._fileout.write("%s\n"%(strout))
+
+        elif name=="start":
+            self.inStartContent=0
+        elif name=="end":
+            self.inEndContent=0
+        elif name=="period":
+            self.inPeriodContent=0
+        elif name=="score":
+            self.inScoreContent=0
+        elif name=="unit":
+            self.inUnitContent=0
+            if self.type.has_key(self.unit):
+                self.type[self.unit]+=1
+            else:
+                self.type[self.unit]=1
+                
+    def endDocument(self):  
+        self._fileout.close()
\ No newline at end of file