# view bedClean.py @ 12:2f4ea569f048
#
# Uploaded
# author xuebing
# date Sat, 10 Mar 2012 08:10:44 -0500
# parents b7f1d9f8f3bc
# children
# line wrap: on
# line source

import sys

def readChrSize(filename):
    """Load chromosome sizes from a whitespace-delimited two-column file.

    Every non-blank line must hold exactly two fields: a chromosome name
    followed by its integer size. Blank or whitespace-only lines are
    skipped.

    Args:
        filename: path of the chromosome-size file.

    Returns:
        dict mapping chromosome name (str) to size (int). If a name is
        listed more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields,
            or its second field is not an integer.
    """
    chrSize = {}
    # 'with' closes the handle even when a malformed line raises.
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate empty lines (e.g. a trailing newline at EOF)
                # instead of failing on the two-value unpack below.
                continue
            chrom, size = fields
            chrSize[chrom] = int(size)
    return chrSize

def cleanFile(filename,chrSize,outfile):
    """Copy the valid records of a tab-delimited interval file to outfile.

    Each input line is split on tabs; the first three fields are read as
    chromosome, start and end. A record is written to ``outfile`` only if
    it passes every check below; otherwise a diagnostic naming the 1-based
    line number is printed to standard output and the record is dropped:

    * fewer than three fields             -> "incomplete line"
    * chromosome not a key of ``chrSize`` -> "chromosome ... not found!"
    * start or end is not an integer      -> "non-integer coordinates"
    * a coordinate is negative            -> "negative coordinates"
    * end exceeds the chromosome size     -> "end larger than chr size"

    If start is greater than end the two fields are swapped before the
    range checks. Kept records are written with their fields re-joined by
    tabs; any fields beyond the third are passed through unchanged.

    Args:
        filename: path of the input file.
        chrSize: dict mapping chromosome name to its size (int).
        outfile: path of the file to write; it is truncated if it exists.
    """
    # Each diagnostic is built as ONE string so that print emits identical
    # text whether it is parsed as a statement or called as a function.
    with open(filename) as f:
        with open(outfile, 'w') as out:
            for i, line in enumerate(f, 1):
                flds = line.strip().split('\t')
                if len(flds) < 3:
                    print('line %s incomplete line:\n%s' % (i, line))
                    continue
                chrom = flds[0]
                if chrom not in chrSize:
                    print('line %s chromosome %s not found!\n%s'
                          % (i, chrom, line))
                    continue
                # Parse each coordinate once; a non-numeric value is
                # reported and skipped rather than aborting the whole run.
                try:
                    start = int(flds[1])
                    end = int(flds[2])
                except ValueError:
                    print('line %s non-integer coordinates:\n%s' % (i, line))
                    continue
                if start > end:
                    # Swap the original text so the output keeps the
                    # input's formatting of the numbers.
                    flds[1], flds[2] = flds[2], flds[1]
                    start, end = end, start
                if start < 0:
                    # start <= end here, so this catches any negative value.
                    print('line %s negative coordinates:\n%s' % (i, line))
                elif end > chrSize[chrom]:
                    print('line %s end larger than chr size:\n%s' % (i, line))
                else:
                    out.write('\t'.join(flds) + '\n')

# Command-line entry point. Arguments, in order: the interval file to clean,
# the chromosome-size file, and the output path. Guarded so that importing
# this module does not run the tool against the importer's sys.argv.
if __name__ == '__main__':
    if len(sys.argv) < 4:
        # Exit with a usage message instead of a bare IndexError.
        sys.exit('usage: %s <input.bed> <chrom.sizes> <output.bed>'
                 % sys.argv[0])
    cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])