view random_interval.py @ 12:2f4ea569f048

Uploaded
author xuebing
date Sat, 10 Mar 2012 08:10:44 -0500
parents b7f1d9f8f3bc
children
line wrap: on
line source

'''
simulate a random interval set that mimics the size and strand of a reference set 
'''

def inferSizeFromRefBed(filename):
    '''
    read reference interval set, get chrom size information
    '''
    chrSize = {}
    f = open(filename)
    for line in f:
        flds = line.strip().split('\t')
        if not chrSize.has_key(flds[0]):
            chrSize[flds[0]] = int(flds[2])
        elif chrSize[flds[0]] < int(flds[2]):
            chrSize[flds[0]] = int(flds[2])
    f.close()
    return chrSize 

def getChrSize(filename):
    chrSize = {}
    f = open(filename)
    for line in f:
        flds = line.strip().split()
        if len(flds) >1:
            chrSize[flds[0]] = int(flds[1])
    f.close()
    return chrSize
    
def makeWeightedChrom(chrSize):
    '''
    make a list of chr_id, the freq is proportional to its length
    '''
     
    genome_len = 0
    
    for chrom in chrSize:
        chrom_len = chrSize[chrom]
        genome_len += chrom_len

    weighted_chrom = []
    for chrom in chrSize:
        weight = int(round(1000*float(chrSize[chrom])/genome_len))
        weighted_chrom += [chrom]*weight

    return weighted_chrom            

def randomIntervalWithinChrom(infile,outfile,chrSize):
    '''
    '''
    fin = open(infile)
    fout = open(outfile,'w')
    n = 0
    for line in fin:
        n = n + 1
        flds = line.strip().split('\t')
        interval_size = int(flds[2]) - int(flds[1])
        flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
        flds[2] = str(int(flds[1])+interval_size)
        fout.write('\t'.join(flds)+'\n')
    fin.close()
    fout.close()   

def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom):
    '''
    '''
    fin = open(infile)
    fout = open(outfile,'w')
    n = 0
    for line in fin:
        n = n + 1
        flds = line.strip().split('\t')
        interval_size = int(flds[2]) - int(flds[1])
        # find a random chrom
        flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
        flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
        flds[2] = str(int(flds[1])+interval_size)
        fout.write('\t'.join(flds)+'\n')
    fin.close()
    fout.close()            

import sys,random
def main():
    # python random_interval.py test100.bed testout.bed across human.hg18.genome 

    reference_interval_file = sys.argv[1]
    output_file = sys.argv[2]
    across_or_within_chrom = sys.argv[3] # within or across 
    chrom_size_file = sys.argv[4]
    chrSize = getChrSize(chrom_size_file)
    print chrSize.keys()
    if across_or_within_chrom == 'within':            
        randomIntervalWithinChrom(reference_interval_file,output_file,chrSize)
    else:
        randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize))   
main()