annotate test-data/generate_reads.py @ 4:796653c6376b draft

Uploaded
author jason-ellul
date Wed, 01 Jun 2016 02:36:11 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
1 #!/usr/bin/env python
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
2
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
3
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
4 import random
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
5 import math
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
6
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
7
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
8 class Region:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
9 def __init__(self,start,stop,sequence):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
10 self.start = start
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
11 self.stop = stop
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
12 self.sequence = sequence.strip().replace("\n","").replace(" ","")
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
13 if(len(self.sequence) != self.getSpanningLength()):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
14 print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength())
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
15 import sys
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
16 sys.exit()
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
17
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
18 def getSpanningLength(self):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
19 return abs(self.stop-self.start+1)
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
20
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
21 class ReadSynthesizer:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
22 def __init__(self,chromosome):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
23 self.regions = []
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
24 self.chromosome = chromosome
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
25
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
26 def addRegion(self,region):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
27 self.regions.append(region)
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
28
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
29 def produceReads(self,readDensity = 1,read_length = 50):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
30 """
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
31 Produces uniform reads by walking iteratively over self.regions
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
32 """
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
33
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
34 mRNA = self.getTotalmRNA()
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
35 spanning_length = self.getRegionSpanningLength()
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
36 n = spanning_length['total'] - read_length + 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
37
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
38 j = 0
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
39 k = 0
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
40
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
41 for i in range(n):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
42 # "alpha is playing the role of k and beta is playing the role of theta"
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
43 dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!!
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
44
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
45 for d in range(dd):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
46 sequence = mRNA[i:i+read_length]
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
47
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
48 if(random.randint(0,1) == 0):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
49 strand = 0
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
50 else:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
51 strand = 16
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
52 flag = strand + 0
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
53
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
54 print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*"
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
55
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
56 spanning_length['iter'][j] -= 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
57 if(k >= self.regions[j].getSpanningLength()-1):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
58 j += 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
59 k = 0
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
60 else:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
61 k += 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
62
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
63 def getMappingString(self,length,j,offset):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
64 m = 0
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
65
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
66 out = ""
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
67
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
68 for i in range(length):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
69 k = i + offset
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
70
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
71 if(k >= self.regions[j].getSpanningLength()):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
72 j += 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
73
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
74 out += str(m)+"M"
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
75 out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N"
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
76 m = 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
77
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
78 offset = -k
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
79 else:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
80 m += 1
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
81
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
82 out += str(m) + "M"
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
83
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
84
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
85 return out
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
86
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
87 def getRegionSpanningLength(self):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
88 length = {'total':0,'iter':[]}
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
89 for r in self.regions:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
90 l = r.getSpanningLength()
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
91 length['iter'].append(l)
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
92 length['total'] += l
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
93 return length
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
94
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
95 def getTotalmRNA(self):
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
96 mRNA = ""
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
97 for r in self.regions:
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
98 mRNA += r.sequence
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
99 return mRNA
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
100
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
101 #rs = ReadSynthesizer('chr6')
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
102 #rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag'))
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
103 #rs.addRegion(Region(151,152,'at'))
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
104 #rs.produceReads(3,50)
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
105
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
106
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
107 rs = ReadSynthesizer('chr6')
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
108 rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag'))
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
109 rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag'))
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
110 rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag'))
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
111 rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc'))
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
112 rs.produceReads(3,50)
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
113
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
114
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
115
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
116
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
117
796653c6376b Uploaded
jason-ellul
parents:
diff changeset
118