Mercurial > repos > jason-ellul > calculatezprimefactor
diff test-data/generate_reads.py @ 4:796653c6376b draft
Uploaded
author | jason-ellul |
---|---|
date | Wed, 01 Jun 2016 02:36:11 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/generate_reads.py Wed Jun 01 02:36:11 2016 -0400 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + + +import random +import math + + +class Region: + def __init__(self,start,stop,sequence): + self.start = start + self.stop = stop + self.sequence = sequence.strip().replace("\n","").replace(" ","") + if(len(self.sequence) != self.getSpanningLength()): + print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) + import sys + sys.exit() + + def getSpanningLength(self): + return abs(self.stop-self.start+1) + +class ReadSynthesizer: + def __init__(self,chromosome): + self.regions = [] + self.chromosome = chromosome + + def addRegion(self,region): + self.regions.append(region) + + def produceReads(self,readDensity = 1,read_length = 50): + """ + Produces uniform reads by walking iteratively over self.regions + """ + + mRNA = self.getTotalmRNA() + spanning_length = self.getRegionSpanningLength() + n = spanning_length['total'] - read_length + 1 + + j = 0 + k = 0 + + for i in range(n): + # "alpha is playing the role of k and beta is playing the role of theta" + dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! + + for d in range(dd): + sequence = mRNA[i:i+read_length] + + if(random.randint(0,1) == 0): + strand = 0 + else: + strand = 16 + flag = strand + 0 + + print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" + + spanning_length['iter'][j] -= 1 + if(k >= self.regions[j].getSpanningLength()-1): + j += 1 + k = 0 + else: + k += 1 + + def getMappingString(self,length,j,offset): + m = 0 + + out = "" + + for i in range(length): + k = i + offset + + if(k >= self.regions[j].getSpanningLength()): + j += 1 + + out += str(m)+"M" + out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" + m = 1 + + offset = -k + else: + m += 1 + + out += str(m) + "M" + + + return out + + def getRegionSpanningLength(self): + length = {'total':0,'iter':[]} + for r in self.regions: + l = r.getSpanningLength() + length['iter'].append(l) + length['total'] += l + return length + + def getTotalmRNA(self): + mRNA = "" + for r in self.regions: + mRNA += r.sequence + return mRNA + +#rs = ReadSynthesizer('chr6') +#rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) +#rs.addRegion(Region(151,152,'at')) +#rs.produceReads(3,50) + + +rs = ReadSynthesizer('chr6') +rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) +rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) +rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) +rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) +rs.produceReads(3,50) + + + + + +