diff bedClean.py @ 20:16ba480adf96

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:31:22 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bedClean.py	Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,43 @@
+import sys
+
+def readChrSize(filename):
+    f = open(filename)
+    chrSize = {}
+    for line in f:
+        chrom,size = line.strip().split()
+        chrSize[chrom]=int(size)
+    f.close()
+    return chrSize
+
+def cleanFile(filename,chrSize,outfile):
+    f = open(filename)
+    out = open(outfile,'w')
+    i = 0
+    for line in f:
+        i = i + 1
+        flds = line.strip().split('\t')
+        if len(flds) < 3:
+            print 'line',i,'incomplete line:\n',line
+        elif chrSize.has_key(flds[0]):
+            if int(flds[1]) > int(flds[2]):
+                tmp = flds[1]
+                flds[1] = flds[2]
+                flds[2] = tmp
+            if int( flds[1]) < 0 or int(flds[2]) <0:
+                print 'line',i,'negative coordinates:\n',line
+            elif int(flds[2]) > chrSize[flds[0]]:
+                print 'line',i,'end larger than chr size:\n',line
+            else:
+                if flds[5] == '*':
+                    flds[5] = '+'
+                    print 'line',i,' strand * changed to +\n', line
+                out.write('\t'.join(flds)+'\n')
+        else:
+            print 'line',i,'chromosome',flds[0],'not found!\n',line
+    f.close()
+    out.close()
+
+if len(sys.argv) < 4:
+    print "python bedClean.py in.bed chrsizefile out.bed"
+    exit()
+cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])