Mercurial > repos > xuebing > sharplabtool
comparison mytools/random_interval.py @ 7:f0dc65e7f6c0
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:59:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 6:094f377db4e5 | 7:f0dc65e7f6c0 |
|---|---|
| 1 ''' | |
| 2 simulate a random interval set that mimics the size and strand of a reference set | |
| 3 ''' | |
| 4 | |
| 5 def inferSizeFromRefBed(filename): | |
| 6 ''' | |
| 7 read reference interval set, get chrom size information | |
| 8 ''' | |
| 9 chrSize = {} | |
| 10 f = open(filename) | |
| 11 for line in f: | |
| 12 flds = line.strip().split('\t') | |
| 13 if not chrSize.has_key(flds[0]): | |
| 14 chrSize[flds[0]] = int(flds[2]) | |
| 15 elif chrSize[flds[0]] < int(flds[2]): | |
| 16 chrSize[flds[0]] = int(flds[2]) | |
| 17 f.close() | |
| 18 return chrSize | |
| 19 | |
| 20 def getChrSize(filename): | |
| 21 chrSize = {} | |
| 22 f = open(filename) | |
| 23 for line in f: | |
| 24 flds = line.strip().split() | |
| 25 if len(flds) >1: | |
| 26 chrSize[flds[0]] = int(flds[1]) | |
| 27 f.close() | |
| 28 return chrSize | |
| 29 | |
| 30 def makeWeightedChrom(chrSize): | |
| 31 ''' | |
| 32 make a list of chr_id, the freq is proportional to its length | |
| 33 ''' | |
| 34 | |
| 35 genome_len = 0 | |
| 36 | |
| 37 for chrom in chrSize: | |
| 38 chrom_len = chrSize[chrom] | |
| 39 genome_len += chrom_len | |
| 40 | |
| 41 weighted_chrom = [] | |
| 42 for chrom in chrSize: | |
| 43 weight = int(round(1000*float(chrSize[chrom])/genome_len)) | |
| 44 weighted_chrom += [chrom]*weight | |
| 45 | |
| 46 return weighted_chrom | |
| 47 | |
| 48 def randomIntervalWithinChrom(infile,outfile,chrSize): | |
| 49 ''' | |
| 50 ''' | |
| 51 fin = open(infile) | |
| 52 fout = open(outfile,'w') | |
| 53 n = 0 | |
| 54 for line in fin: | |
| 55 n = n + 1 | |
| 56 flds = line.strip().split('\t') | |
| 57 interval_size = int(flds[2]) - int(flds[1]) | |
| 58 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) | |
| 59 flds[2] = str(int(flds[1])+interval_size) | |
| 60 fout.write('\t'.join(flds)+'\n') | |
| 61 fin.close() | |
| 62 fout.close() | |
| 63 | |
| 64 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom): | |
| 65 ''' | |
| 66 ''' | |
| 67 fin = open(infile) | |
| 68 fout = open(outfile,'w') | |
| 69 n = 0 | |
| 70 for line in fin: | |
| 71 n = n + 1 | |
| 72 flds = line.strip().split('\t') | |
| 73 interval_size = int(flds[2]) - int(flds[1]) | |
| 74 # find a random chrom | |
| 75 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] | |
| 76 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) | |
| 77 flds[2] = str(int(flds[1])+interval_size) | |
| 78 fout.write('\t'.join(flds)+'\n') | |
| 79 fin.close() | |
| 80 fout.close() | |
| 81 | |
| 82 import sys,random | |
| 83 def main(): | |
| 84 # python random_interval.py test100.bed testout.bed across human.hg18.genome | |
| 85 | |
| 86 reference_interval_file = sys.argv[1] | |
| 87 output_file = sys.argv[2] | |
| 88 across_or_within_chrom = sys.argv[3] # within or across | |
| 89 chrom_size_file = sys.argv[4] | |
| 90 chrSize = getChrSize(chrom_size_file) | |
| 91 print chrSize.keys() | |
| 92 if across_or_within_chrom == 'within': | |
| 93 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize) | |
| 94 else: | |
| 95 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize)) | |
| 96 main() |
