# HG changeset patch # User xuebing # Date 1333216856 14400 # Node ID 0a4959804274222b47c13fbf091bf2b8c4ab1230 # Parent 11e3609fa73c1d6c858c038368736d3791116353 Uploaded diff -r 11e3609fa73c -r 0a4959804274 bed_shuffle_chrom.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_shuffle_chrom.py Sat Mar 31 14:00:56 2012 -0400 @@ -0,0 +1,96 @@ +''' +simulate a random interval set that mimics the size and strand of a reference set +''' + +def inferSizeFromRefBed(filename): + ''' + read reference interval set, get chrom size information + ''' + chrSize = {} + f = open(filename) + for line in f: + flds = line.strip().split('\t') + if not chrSize.has_key(flds[0]): + chrSize[flds[0]] = int(flds[2]) + elif chrSize[flds[0]] < int(flds[2]): + chrSize[flds[0]] = int(flds[2]) + f.close() + return chrSize + +def getChrSize(filename): + chrSize = {} + f = open(filename) + for line in f: + flds = line.strip().split() + if len(flds) >1: + chrSize[flds[0]] = int(flds[1]) + f.close() + return chrSize + +def makeWeightedChrom(chrSize): + ''' + make a list of chr_id, the freq is proportional to its length + ''' + + genome_len = 0 + + for chrom in chrSize: + chrom_len = chrSize[chrom] + genome_len += chrom_len + + weighted_chrom = [] + for chrom in chrSize: + weight = int(round(1000*float(chrSize[chrom])/genome_len)) + weighted_chrom += [chrom]*weight + + return weighted_chrom + +def randomIntervalWithinChrom(infile,outfile,chrSize): + ''' + ''' + fin = open(infile) + fout = open(outfile,'w') + n = 0 + for line in fin: + n = n + 1 + flds = line.strip().split('\t') + interval_size = int(flds[2]) - int(flds[1]) + flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) + flds[2] = str(int(flds[1])+interval_size) + fout.write('\t'.join(flds)+'\n') + fin.close() + fout.close() + +def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom): + ''' + ''' + fin = open(infile) + fout = open(outfile,'w') + n = 0 + for line in fin: + n = n + 1 + flds = line.strip().split('\t') + interval_size = int(flds[2]) - int(flds[1]) + # find a random chrom + flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] + flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) + flds[2] = str(int(flds[1])+interval_size) + fout.write('\t'.join(flds)+'\n') + fin.close() + fout.close() + +import sys,random +def main(): + # python random_interval.py test100.bed testout.bed across human.hg18.genome + + reference_interval_file = sys.argv[1] + output_file = sys.argv[2] + across_or_within_chrom = sys.argv[3] # within or across + chrom_size_file = sys.argv[4] + chrSize = getChrSize(chrom_size_file) + print chrSize.keys() + if across_or_within_chrom == 'within': + randomIntervalWithinChrom(reference_interval_file,output_file,chrSize) + else: + randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize)) +main()