Mercurial > repos > xuebing > sharplabtool
diff shuffleBed.py @ 14:76e1b1b21cce default tip
Deleted selected files
author | xuebing |
---|---|
date | Tue, 13 Mar 2012 19:05:10 -0400 |
parents | 292186c14b08 |
children |
line wrap: on
line diff
--- a/shuffleBed.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -''' -simulate a random interval set that mimics the size and strand of a reference set -''' - -def inferSizeFromRefBed(filename,header): - ''' - read reference interval set, get chrom size information - ''' - chrSize = {} - f = open(filename) - if header: - header = f.readline() - for line in f: - flds = line.strip().split('\t') - if not chrSize.has_key(flds[0]): - chrSize[flds[0]] = int(flds[2]) - elif chrSize[flds[0]] < int(flds[2]): - chrSize[flds[0]] = int(flds[2]) - f.close() - return chrSize - -def getChrSize(filename): - chrSize = {} - f = open(filename) - for line in f: - flds = line.strip().split('\t') - if len(flds) >1: - chrSize[flds[0]] = int(flds[1]) - f.close() - return chrSize - -def makeWeightedChrom(chrSize): - ''' - make a list of chr_id, the freq is proportional to its length - ''' - - genome_len = 0 - - for chrom in chrSize: - chrom_len = chrSize[chrom] - genome_len += chrom_len - - weighted_chrom = [] - for chrom in chrSize: - weight = int(round(1000*float(chrSize[chrom])/genome_len)) - weighted_chrom += [chrom]*weight - - return weighted_chrom - -def randomIntervalWithinChrom(infile,outfile,chrSize,header): - ''' - ''' - fin = open(infile) - if header: - header = fin.readline() - fout = open(outfile,'w') - n = 0 - for line in fin: - n = n + 1 - flds = line.strip().split('\t') - interval_size = int(flds[2]) - int(flds[1]) - rstart = random.randint(0,chrSize[flds[0]]-interval_size) - fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') - fin.close() - fout.close() - -def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header): - ''' - ''' - fin = open(infile) - if header: - header = fin.readline() - fout = open(outfile,'w') - n = 0 - for line in fin: - n = n + 1 - flds = line.strip().split('\t') - interval_size = int(flds[2]) - int(flds[1]) - # find a random chrom - flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] - # random start in the chrom - rstart = random.randint(0,chrSize[flds[0]]-interval_size) - fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') - fin.close() - fout.close() - -import sys,random -def main(): - # python random_interval.py test100.bed testout.bed across header human.hg18.genome - - reference_interval_file = sys.argv[1] - output_file = sys.argv[2] - across_or_within_chrom = sys.argv[3] # within or across - if sys.argv[4] == 'header': - header = True - else: - header = False - if len(sys.argv) == 6: - chrom_size_file = sys.argv[5] - chrSize = getChrSize(chrom_size_file) - else: - chrSize = inferSizeFromRefBed(reference_interval_file,header) - if across_or_within_chrom == 'within': - randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header) - else: - randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header) -main()