Mercurial > repos > xuebing > sharplabtool
comparison shuffleBed.py @ 11:b7f1d9f8f3bc
Uploaded
| author | xuebing |
|---|---|
| date | Sat, 10 Mar 2012 07:59:27 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 10:1558594a3c2e | 11:b7f1d9f8f3bc |
|---|---|
| 1 ''' | |
| 2 simulate a random interval set that mimics the size and strand of a reference set | |
| 3 ''' | |
| 4 | |
| 5 def inferSizeFromRefBed(filename,header): | |
| 6 ''' | |
| 7 read reference interval set, get chrom size information | |
| 8 ''' | |
| 9 chrSize = {} | |
| 10 f = open(filename) | |
| 11 if header: | |
| 12 header = f.readline() | |
| 13 for line in f: | |
| 14 flds = line.strip().split('\t') | |
| 15 if not chrSize.has_key(flds[0]): | |
| 16 chrSize[flds[0]] = int(flds[2]) | |
| 17 elif chrSize[flds[0]] < int(flds[2]): | |
| 18 chrSize[flds[0]] = int(flds[2]) | |
| 19 f.close() | |
| 20 return chrSize | |
| 21 | |
| 22 def getChrSize(filename): | |
| 23 chrSize = {} | |
| 24 f = open(filename) | |
| 25 for line in f: | |
| 26 flds = line.strip().split('\t') | |
| 27 if len(flds) >1: | |
| 28 chrSize[flds[0]] = int(flds[1]) | |
| 29 f.close() | |
| 30 return chrSize | |
| 31 | |
| 32 def makeWeightedChrom(chrSize): | |
| 33 ''' | |
| 34 make a list of chr_id, the freq is proportional to its length | |
| 35 ''' | |
| 36 | |
| 37 genome_len = 0 | |
| 38 | |
| 39 for chrom in chrSize: | |
| 40 chrom_len = chrSize[chrom] | |
| 41 genome_len += chrom_len | |
| 42 | |
| 43 weighted_chrom = [] | |
| 44 for chrom in chrSize: | |
| 45 weight = int(round(1000*float(chrSize[chrom])/genome_len)) | |
| 46 weighted_chrom += [chrom]*weight | |
| 47 | |
| 48 return weighted_chrom | |
| 49 | |
| 50 def randomIntervalWithinChrom(infile,outfile,chrSize,header): | |
| 51 ''' | |
| 52 ''' | |
| 53 fin = open(infile) | |
| 54 if header: | |
| 55 header = fin.readline() | |
| 56 fout = open(outfile,'w') | |
| 57 n = 0 | |
| 58 for line in fin: | |
| 59 n = n + 1 | |
| 60 flds = line.strip().split('\t') | |
| 61 interval_size = int(flds[2]) - int(flds[1]) | |
| 62 rstart = random.randint(0,chrSize[flds[0]]-interval_size) | |
| 63 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') | |
| 64 fin.close() | |
| 65 fout.close() | |
| 66 | |
| 67 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header): | |
| 68 ''' | |
| 69 ''' | |
| 70 fin = open(infile) | |
| 71 if header: | |
| 72 header = fin.readline() | |
| 73 fout = open(outfile,'w') | |
| 74 n = 0 | |
| 75 for line in fin: | |
| 76 n = n + 1 | |
| 77 flds = line.strip().split('\t') | |
| 78 interval_size = int(flds[2]) - int(flds[1]) | |
| 79 # find a random chrom | |
| 80 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] | |
| 81 # random start in the chrom | |
| 82 rstart = random.randint(0,chrSize[flds[0]]-interval_size) | |
| 83 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') | |
| 84 fin.close() | |
| 85 fout.close() | |
| 86 | |
| 87 import sys,random | |
| 88 def main(): | |
| 89 # python random_interval.py test100.bed testout.bed across header human.hg18.genome | |
| 90 | |
| 91 reference_interval_file = sys.argv[1] | |
| 92 output_file = sys.argv[2] | |
| 93 across_or_within_chrom = sys.argv[3] # within or across | |
| 94 if sys.argv[4] == 'header': | |
| 95 header = True | |
| 96 else: | |
| 97 header = False | |
| 98 if len(sys.argv) == 6: | |
| 99 chrom_size_file = sys.argv[5] | |
| 100 chrSize = getChrSize(chrom_size_file) | |
| 101 else: | |
| 102 chrSize = inferSizeFromRefBed(reference_interval_file,header) | |
| 103 if across_or_within_chrom == 'within': | |
| 104 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header) | |
| 105 else: | |
| 106 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header) | |
| 107 main() |
