diff test-data/generate_reads.py @ 4:796653c6376b draft

Uploaded
author jason-ellul
date Wed, 01 Jun 2016 02:36:11 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/generate_reads.py	Wed Jun 01 02:36:11 2016 -0400
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+
+import random
+import math
+
+
+class Region:
+	def __init__(self,start,stop,sequence):
+		self.start = start
+		self.stop = stop
+		self.sequence = sequence.strip().replace("\n","").replace(" ","")
+		if(len(self.sequence) != self.getSpanningLength()):
+			print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength())
+			import sys
+			sys.exit()
+	
+	def getSpanningLength(self):
+		return abs(self.stop-self.start+1)
+
+class ReadSynthesizer:
+	def __init__(self,chromosome):
+		self.regions = []
+		self.chromosome = chromosome
+	
+	def addRegion(self,region):
+		self.regions.append(region)
+	
+	def produceReads(self,readDensity = 1,read_length = 50):
+		"""
+		Produces uniform reads by walking iteratively over self.regions
+		"""
+		
+		mRNA = self.getTotalmRNA()
+		spanning_length = self.getRegionSpanningLength()
+		n = spanning_length['total'] - read_length + 1
+		
+		j = 0
+		k = 0
+		
+		for i in range(n):
+			#  "alpha is playing the role of k and beta is playing the role of theta"
+			dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!!
+			
+			for d in range(dd):
+				sequence = mRNA[i:i+read_length]
+				
+				if(random.randint(0,1) == 0):
+					strand = 0
+				else:
+					strand = 16
+				flag = strand + 0
+				
+				print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*"
+			
+			spanning_length['iter'][j] -= 1
+			if(k >= self.regions[j].getSpanningLength()-1):
+				j += 1
+				k = 0
+			else:
+				k += 1
+	
+	def getMappingString(self,length,j,offset):
+		m = 0
+		
+		out = ""
+		
+		for i in range(length):
+			k = i + offset
+			
+			if(k >= self.regions[j].getSpanningLength()):
+				j += 1
+				
+				out += str(m)+"M"
+				out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N"
+				m = 1
+				
+				offset = -k
+			else:
+				m += 1
+		
+		out += str(m) + "M"
+		
+		
+		return out
+	
+	def getRegionSpanningLength(self):
+		length = {'total':0,'iter':[]}
+		for r in self.regions:
+			l = r.getSpanningLength()
+			length['iter'].append(l)
+			length['total'] += l
+		return length
+	
+	def getTotalmRNA(self):
+		mRNA = ""
+		for r in self.regions:
+			mRNA += r.sequence
+		return mRNA
+
+#rs = ReadSynthesizer('chr6')
+#rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag'))
+#rs.addRegion(Region(151,152,'at'))
+#rs.produceReads(3,50)
+
+
+rs = ReadSynthesizer('chr6')
+rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag'))
+rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag'))
+rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag'))
+rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc'))
+rs.produceReads(3,50)
+
+
+
+
+
+