comparison test-data/generate_reads.py @ 0:460f0749aac5 draft default tip

planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/samtools_parallel_mpileup_galaxy_wrapper commit ede01f67a8def5be7c88d5c31c2435b3946f1523-dirty
author yhoogstrate
date Thu, 05 Nov 2015 07:49:02 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:460f0749aac5
1 #!/usr/bin/env python
2
3
4 import random
5 import math
6
7
8 __version_info__ = ('1', '0', '0')
9 __version__ = '.'.join(__version_info__)
10
11
12 class Region:
13 def __init__(self,start,stop,sequence):
14 self.start = start
15 self.stop = stop
16 self.sequence = sequence.strip().replace("\n","").replace(" ","")
17 if(len(self.sequence) != self.getSpanningLength()):
18 print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength())
19 import sys
20 sys.exit()
21
22 def getSpanningLength(self):
23 return abs(self.stop-self.start+1)
24
25 class ReadSynthesizer:
26 def __init__(self,chromosome):
27 self.regions = []
28 self.chromosome = chromosome
29
30 def addRegion(self,region):
31 self.regions.append(region)
32
33 def produceReads(self,readDensity = 1,read_length = 50):
34 """
35 Produces uniform reads by walking iteratively over self.regions
36 """
37
38 mRNA = self.getTotalmRNA()
39 spanning_length = self.getRegionSpanningLength()
40 n = spanning_length['total'] - read_length + 1
41
42 j = 0
43 k = 0
44
45 for i in range(n):
46 # "alpha is playing the role of k and beta is playing the role of theta"
47 dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!!
48
49 for d in range(dd):
50 sequence = mRNA[i:i+read_length]
51
52 if(random.randint(0,1) == 0):
53 strand = 0
54 else:
55 strand = 16
56 flag = strand + 0
57
58 print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*"
59
60 spanning_length['iter'][j] -= 1
61 if(k >= self.regions[j].getSpanningLength()-1):
62 j += 1
63 k = 0
64 else:
65 k += 1
66
67 def getMappingString(self,length,j,offset):
68 m = 0
69
70 out = ""
71
72 for i in range(length):
73 k = i + offset
74
75 if(k >= self.regions[j].getSpanningLength()):
76 j += 1
77
78 out += str(m)+"M"
79 out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N"
80 m = 1
81
82 offset = -k
83 else:
84 m += 1
85
86 out += str(m) + "M"
87
88
89 return out
90
91 def getRegionSpanningLength(self):
92 length = {'total':0,'iter':[]}
93 for r in self.regions:
94 l = r.getSpanningLength()
95 length['iter'].append(l)
96 length['total'] += l
97 return length
98
99 def getTotalmRNA(self):
100 mRNA = ""
101 for r in self.regions:
102 mRNA += r.sequence
103 return mRNA
104
105
106
107 if __name__ == "__main__":
108 # Artificial SNP
109 rs = ReadSynthesizer('chr1')
110 rs.addRegion(Region( 0+1, 59+1,'aaataggtcccaaacgttacgca'+'G'+'tctatgcctgacaaagttgcgaccacttcctctgcc'))#c -> G
111 rs.addRegion(Region( 60+1,119+1,'ttgtgtgacacgccggagatagg'+'A'+'catcagcaagtacgttaagtacactgaacgaactgg'))#g -> A
112 rs.addRegion(Region(120+1,179+1,'aggtttctacatcgtgcgtgatggc'+'C'+'ctaggagaagtgggtgtatctgcacagcataagt'))#t -> C
113 rs.addRegion(Region(180+1,239+1,'tataagacggaagtaaagcgtcttc'+'G'+'ccgttcagcaccccacgctcatagtcaatgctgg'))#a -> G
114 #rs.addRegion(Region(240+1,299+1,'ttcagcatagtcaagcgccggtggcctccaaaaagacgcactgagtagcttagctacttt'))
115 #rs.addRegion(Region(300+1,359+1,'gctccgcttgcggaagcactaagaggagattgaatttccaaatcccccccgatacctgtg'))
116 #rs.addRegion(Region(360+1,419+1,'cggtcgctacgtaagtgcgaagttctgttagatacgctccccttagtatatgggcgttaa'))
117 #rs.addRegion(Region(420+1,479+1,'tcggaccgtcggtactcactgcattccaggtctcatatagttcgccctagaagcctggga'))
118 rs.addRegion(Region(480+1,539+1,'tgaacgttgaacta'+'GCC'+'ctgatgtaaaccccgcgtgccaattccaggcgtcatgggggca'))#tag -> gcc
119 #rs.addRegion(Region(540+1,599+1,'acccctcgcagcctccctcttgctgttggtgcctagtatttcatgatttcgagccgacat'))
120 rs.produceReads(2,35)