Mercurial > repos > xuebing > sharplab_interval_analysis
view shuffleBed.py @ 22:869c7664e584
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 11:52:14 -0400 |
parents | 16ba480adf96 |
children |
line wrap: on
line source
''' simulate a random interval set that mimics the size and strand of a reference set ''' def inferSizeFromRefBed(filename,header): ''' read reference interval set, get chrom size information ''' chrSize = {} f = open(filename) if header: header = f.readline() for line in f: flds = line.strip().split('\t') if not chrSize.has_key(flds[0]): chrSize[flds[0]] = int(flds[2]) elif chrSize[flds[0]] < int(flds[2]): chrSize[flds[0]] = int(flds[2]) f.close() return chrSize def getChrSize(filename): chrSize = {} f = open(filename) for line in f: flds = line.strip().split('\t') if len(flds) >1: chrSize[flds[0]] = int(flds[1]) f.close() return chrSize def makeWeightedChrom(chrSize): ''' make a list of chr_id, the freq is proportional to its length ''' genome_len = 0 for chrom in chrSize: chrom_len = chrSize[chrom] genome_len += chrom_len weighted_chrom = [] for chrom in chrSize: weight = int(round(1000*float(chrSize[chrom])/genome_len)) weighted_chrom += [chrom]*weight return weighted_chrom def randomIntervalWithinChrom(infile,outfile,chrSize,header): ''' ''' fin = open(infile) if header: header = fin.readline() fout = open(outfile,'w') n = 0 for line in fin: n = n + 1 flds = line.strip().split('\t') interval_size = int(flds[2]) - int(flds[1]) rstart = random.randint(0,chrSize[flds[0]]-interval_size) fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') fin.close() fout.close() def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header): ''' ''' fin = open(infile) if header: header = fin.readline() fout = open(outfile,'w') n = 0 for line in fin: n = n + 1 flds = line.strip().split('\t') interval_size = int(flds[2]) - int(flds[1]) # find a random chrom flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] # random start in the chrom rstart = random.randint(0,chrSize[flds[0]]-interval_size) fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') fin.close() fout.close() import sys,random def main(): # python random_interval.py test100.bed testout.bed across header human.hg18.genome reference_interval_file = sys.argv[1] output_file = sys.argv[2] across_or_within_chrom = sys.argv[3] # within or across if sys.argv[4] == 'header': header = True else: header = False if len(sys.argv) == 6: chrom_size_file = sys.argv[5] chrSize = getChrSize(chrom_size_file) else: chrSize = inferSizeFromRefBed(reference_interval_file,header) if across_or_within_chrom == 'within': randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header) else: randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header) main()