Mercurial > repos > jason-ellul > calculatezprimefactor
comparison test-data/generate_reads.py @ 5:d7efb5d5352a draft default tip
Uploaded
author | jason-ellul |
---|---|
date | Wed, 01 Jun 2016 02:40:16 -0400 |
parents | 796653c6376b |
children |
comparison
equal
deleted
inserted
replaced
4:796653c6376b | 5:d7efb5d5352a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 | |
4 import random | |
5 import math | |
6 | |
7 | |
8 class Region: | |
9 def __init__(self,start,stop,sequence): | |
10 self.start = start | |
11 self.stop = stop | |
12 self.sequence = sequence.strip().replace("\n","").replace(" ","") | |
13 if(len(self.sequence) != self.getSpanningLength()): | |
14 print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) | |
15 import sys | |
16 sys.exit() | |
17 | |
18 def getSpanningLength(self): | |
19 return abs(self.stop-self.start+1) | |
20 | |
21 class ReadSynthesizer: | |
22 def __init__(self,chromosome): | |
23 self.regions = [] | |
24 self.chromosome = chromosome | |
25 | |
26 def addRegion(self,region): | |
27 self.regions.append(region) | |
28 | |
29 def produceReads(self,readDensity = 1,read_length = 50): | |
30 """ | |
31 Produces uniform reads by walking iteratively over self.regions | |
32 """ | |
33 | |
34 mRNA = self.getTotalmRNA() | |
35 spanning_length = self.getRegionSpanningLength() | |
36 n = spanning_length['total'] - read_length + 1 | |
37 | |
38 j = 0 | |
39 k = 0 | |
40 | |
41 for i in range(n): | |
42 # "alpha is playing the role of k and beta is playing the role of theta" | |
43 dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! | |
44 | |
45 for d in range(dd): | |
46 sequence = mRNA[i:i+read_length] | |
47 | |
48 if(random.randint(0,1) == 0): | |
49 strand = 0 | |
50 else: | |
51 strand = 16 | |
52 flag = strand + 0 | |
53 | |
54 print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" | |
55 | |
56 spanning_length['iter'][j] -= 1 | |
57 if(k >= self.regions[j].getSpanningLength()-1): | |
58 j += 1 | |
59 k = 0 | |
60 else: | |
61 k += 1 | |
62 | |
63 def getMappingString(self,length,j,offset): | |
64 m = 0 | |
65 | |
66 out = "" | |
67 | |
68 for i in range(length): | |
69 k = i + offset | |
70 | |
71 if(k >= self.regions[j].getSpanningLength()): | |
72 j += 1 | |
73 | |
74 out += str(m)+"M" | |
75 out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" | |
76 m = 1 | |
77 | |
78 offset = -k | |
79 else: | |
80 m += 1 | |
81 | |
82 out += str(m) + "M" | |
83 | |
84 | |
85 return out | |
86 | |
87 def getRegionSpanningLength(self): | |
88 length = {'total':0,'iter':[]} | |
89 for r in self.regions: | |
90 l = r.getSpanningLength() | |
91 length['iter'].append(l) | |
92 length['total'] += l | |
93 return length | |
94 | |
95 def getTotalmRNA(self): | |
96 mRNA = "" | |
97 for r in self.regions: | |
98 mRNA += r.sequence | |
99 return mRNA | |
100 | |
101 #rs = ReadSynthesizer('chr6') | |
102 #rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) | |
103 #rs.addRegion(Region(151,152,'at')) | |
104 #rs.produceReads(3,50) | |
105 | |
106 | |
107 rs = ReadSynthesizer('chr6') | |
108 rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) | |
109 rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) | |
110 rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) | |
111 rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) | |
112 rs.produceReads(3,50) | |
113 | |
114 | |
115 | |
116 | |
117 | |
118 |