view bed_clean.py @ 8:c887ed5d2c51

Uploaded
author xuebing
date Sat, 31 Mar 2012 13:08:29 -0400
parents
children
line wrap: on
line source

import sys

def readChrSize(filename):
    """Load chromosome sizes from a whitespace-delimited text file.

    Each non-blank line must start with a chromosome name followed by its
    size as an integer.  Any further columns on the line are ignored and
    blank (or whitespace-only) lines are skipped.

    Args:
        filename: path of the chromosome-size file.

    Returns:
        dict mapping chromosome name (str) to size (int).  If a name occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line has fewer than two columns, or if
            its size column is not an integer.
    """
    chrSize = {}
    # `with` guarantees the file is closed even when a malformed line raises.
    with open(filename) as f:
        for lineno, line in enumerate(f, 1):
            # split() with no argument already discards surrounding whitespace.
            flds = line.split()
            if not flds:
                continue
            if len(flds) < 2:
                raise ValueError('%s line %d: expected "chrom size", got %r'
                                 % (filename, lineno, line))
            chrSize[flds[0]] = int(flds[1])
    return chrSize

def _report(message):
    """Write one diagnostic message, newline-terminated, to standard output."""
    sys.stdout.write(message + '\n')


def cleanFile(filename,chrSize,outfile):
    """Copy the valid records of a tab-delimited BED file to a new file.

    Every input line is stripped and split on tabs.  A record is dropped,
    with a diagnostic written to standard output, when it

    * has fewer than three fields,
    * names a chromosome that is not a key of ``chrSize``,
    * has a start or end field that is not an integer,
    * has a negative coordinate, or
    * ends beyond the size recorded for its chromosome.

    Records that are kept are normalised before being written:

    * if start > end, the two fields are swapped;
    * if a sixth (strand) field is present and equals ``*``, it is replaced
      by ``+`` and the change is reported.

    Records with three to five fields are written as they are; the strand
    fix is applied only when a sixth field exists.

    Args:
        filename: path of the input BED file.
        chrSize: dict mapping chromosome name to its size (int).
        outfile: path of the output file; it is opened with mode ``'w'``.

    Returns:
        None.
    """
    # Nested `with` blocks close both files even if an error interrupts
    # processing part-way through.
    with open(filename) as f:
        with open(outfile, 'w') as out:
            for i, line in enumerate(f, 1):
                flds = line.strip().split('\t')
                if len(flds) < 3:
                    _report('line %d incomplete line:\n%s' % (i, line))
                    continue

                chrom = flds[0]
                if chrom not in chrSize:
                    _report('line %d chromosome %s not found!\n%s'
                            % (i, chrom, line))
                    continue

                try:
                    start = int(flds[1])
                    end = int(flds[2])
                except ValueError:
                    # Report and skip instead of aborting the whole run and
                    # leaving a truncated output file behind.
                    _report('line %d non-integer coordinates:\n%s' % (i, line))
                    continue

                if start > end:
                    # Swap the original text fields (not the parsed ints) so
                    # the written values keep their original formatting.
                    flds[1], flds[2] = flds[2], flds[1]
                    start, end = end, start

                # After the swap start <= end, so any negative coordinate
                # shows up in `start`.
                if start < 0:
                    _report('line %d negative coordinates:\n%s' % (i, line))
                    continue
                if end > chrSize[chrom]:
                    _report('line %d end larger than chr size:\n%s' % (i, line))
                    continue

                # The strand column is optional; only touch it when present.
                if len(flds) > 5 and flds[5] == '*':
                    flds[5] = '+'
                    _report('line %d  strand * changed to +\n%s' % (i, line))
                out.write('\t'.join(flds) + '\n')

# Command-line entry point: <in.bed> <chrsizefile> <out.bed>.
# Guarded so that importing this module does not read sys.argv, exit the
# interpreter, or start processing files.
if __name__ == '__main__':
    if len(sys.argv) < 4:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so a bad invocation is reported as a failure.
        sys.exit("python bedClean.py in.bed chrsizefile out.bed")
    cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])