5
|
1 #!/usr/bin/python
|
|
2
|
|
3 __author__ = 'Abdelrahman Hosny'
|
|
4
|
|
5 import random
|
|
6 import subprocess
|
|
7 import os
|
|
8 import sys
|
|
9 import datetime
|
|
10 import shutil
|
|
11
|
|
12 from . import fileio
|
|
13
|
|
14
|
|
15 def _log(message):
|
|
16 print '[CNV SIM {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + "] " + message
|
|
17
|
|
18
|
|
19 def getScriptPath():
|
|
20 return os.path.dirname(os.path.realpath(__file__))
|
|
21
|
|
22
|
|
23 def _generateCNVMatrix(targets_list, regions_count):
|
|
24 '''
|
|
25 A function to randomly divide sequential exonic targets into whole CNV regions
|
|
26 :param targets_list: a list of target exons loaded from the file provided by the user
|
|
27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
|
|
28 '''
|
|
29 regions_count -= 1
|
|
30 number_of_targets = len(targets_list)
|
|
31 comine_size = number_of_targets / regions_count
|
|
32 cnv_matrix = []
|
|
33
|
|
34 i = 0
|
|
35 while True:
|
|
36 if i == len(targets_list):
|
|
37 break
|
|
38
|
|
39 first_target = i
|
|
40 i += comine_size
|
|
41 if i > len(targets_list):
|
|
42 i = len(targets_list)
|
|
43 last_target = i
|
|
44 cnv_matrix.append(targets_list[first_target:last_target])
|
|
45
|
|
46 return cnv_matrix
|
|
47
|
|
48
|
|
49 def _loadCNVMatrix(targets_list, cnv_regions):
|
|
50 '''
|
|
51 A function to map targets to their corresponding CNV regions
|
|
52 :param targets_list: a list of target exons loaded from the file provided by the user
|
|
53 :param cnv_regions: a list of CNV regions loaded from the CNV file provided by the user
|
|
54 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
|
|
55 '''
|
|
56 cnv_matrix = []
|
|
57 for cnv in cnv_regions:
|
|
58 region_chromosome = cnv[0]
|
|
59 region_start = cnv[1]
|
|
60 region_end = cnv[2]
|
|
61
|
|
62 region_targets = []
|
|
63 for target in targets_list:
|
|
64 target_chromosome = target[0]
|
|
65 target_start = target[1]
|
|
66 target_end = target[2]
|
|
67 if target_chromosome == region_chromosome and target_start >= region_start and target_end <= region_end:
|
|
68 region_targets.append(target)
|
|
69
|
|
70 cnv_matrix.append(region_targets)
|
|
71
|
|
72 return cnv_matrix
|
|
73
|
|
74
|
|
75 def _generateCNVMask(number_of_exons, p_amplify, p_delete, min_variation, max_variation):
|
|
76 '''
|
|
77 This function generates random Copy Number Variations mask list
|
|
78 :param number_of_exons: the length of the target regions.
|
|
79 :param p_amplify: percentage of amplifications introduced
|
|
80 :param p_delete: percentage of deletions introduced
|
|
81 :return: a list of the same length as the exons list. each list item
|
|
82 indicates the variation to be added to the exon in the same position.
|
|
83 Positive number: number of amplification
|
|
84 Zero: no change
|
|
85 -1: delete this exon
|
|
86 '''
|
|
87
|
|
88 number_of_amplifications = int(p_amplify * number_of_exons)
|
|
89 number_of_deletions = int(p_delete * number_of_exons)
|
|
90 cnv_mask = [0] * number_of_exons
|
|
91
|
|
92 # generate CNV mask (a list of amplifications and deletions to be applied to the exons)
|
|
93 while number_of_amplifications > 0:
|
|
94 choice = random.randrange(0, number_of_exons)
|
|
95 while cnv_mask[choice] != 0:
|
|
96 choice = random.randrange(0, number_of_exons)
|
|
97 cnv_mask[choice] = random.randrange(min_variation, max_variation) # random amplifications in the range [min_variation, max_variation)
|
|
98 number_of_amplifications -= 1
|
|
99 random.shuffle(cnv_mask)
|
|
100 while number_of_deletions > 0:
|
|
101 choice = random.randrange(0, number_of_exons)
|
|
102 while cnv_mask[choice] != 0:
|
|
103 choice = random.randrange(0, number_of_exons)
|
|
104 cnv_mask[choice] = -1*random.randrange(min_variation, max_variation) # random deletions in the range [min_variation, max_variation)
|
|
105 number_of_deletions -= 1
|
|
106 random.shuffle(cnv_mask)
|
|
107
|
|
108 return cnv_mask
|
|
109
|
|
110
|
|
111 def _simulateCNV(genome, cnv_matrix, mask, read_length):
|
|
112 '''
|
|
113 This function creates two new genome references and target files for control and CNV
|
|
114 :param genome: text string of the genome
|
|
115 :param cnv_matrix: A matrix where rows represent the region index and the first column as a list of targets in this region
|
|
116 :param mask: a list to indicate the variations introduced to each CNV region
|
|
117 :return: control genome, control target list, modified genome and modified target list (in order)
|
|
118 '''
|
|
119
|
|
120 control_genome = []
|
|
121 control_targets = [] # initialize control target list
|
|
122 cnv_genome = []
|
|
123 cnv_targets = [] # initialize cnv target list
|
|
124
|
|
125 CONTROL_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
|
|
126 CNV_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
|
|
127
|
|
128 for k, cnv_region in enumerate(cnv_matrix):
|
|
129
|
|
130 number_of_copies = mask[k]
|
|
131
|
|
132 if number_of_copies > 0:
|
|
133 # if amplification
|
|
134 for target in cnv_region:
|
|
135 for _ in range(number_of_copies):
|
|
136 amplification = genome[target[1]-read_length:target[2]+read_length]
|
|
137 cnv_genome.append(amplification)
|
|
138 chromosome = target[0]
|
|
139 start = CNV_APPEND_INDEX + read_length
|
|
140 end = CNV_APPEND_INDEX + len(amplification) - read_length
|
|
141 cnv_targets.append((chromosome, start, end))
|
|
142 CNV_APPEND_INDEX += len(amplification)
|
|
143 elif number_of_copies < 0:
|
|
144 # if deletion
|
|
145 for target in cnv_region:
|
|
146 for _ in range(abs(number_of_copies)):
|
|
147 deletion = genome[target[1]-read_length:target[2]+read_length]
|
|
148 control_genome.append(deletion)
|
|
149 chromosome = target[0]
|
|
150 start = CONTROL_APPEND_INDEX + read_length
|
|
151 end = CONTROL_APPEND_INDEX + len(deletion) - read_length
|
|
152 control_targets.append((chromosome, start, end))
|
|
153 CONTROL_APPEND_INDEX += len(deletion)
|
|
154
|
|
155 return ''.join(control_genome), control_targets, ''.join(cnv_genome), cnv_targets
|
|
156
|
|
157
|
|
158 def _callWessim(genome_file, target_file, output_file, number_of_reads, read_length, model_file="models/ill100v4_p.gzip", number_of_threads=4):
|
|
159 '''
|
|
160 Calls Wessim to generate artificial reads for the targets on the reference genome
|
|
161 :param genome_file: reference genome file in FASTA format
|
|
162 :param target_file: target regions file in BED format
|
|
163 :param output_file: output file name
|
|
164 :param number_of_reads: the number of reads to generate for this simulation
|
|
165 :param read_length: the read length
|
|
166 :param model_file: GemSim's empirical error models used by Wessim for NGS read generation
|
|
167 :param number_of_threads: the number of threads to run this simulation
|
|
168 :return: None
|
|
169 '''
|
|
170 os.chdir(os.path.join(getScriptPath(), "Wessim" ))
|
|
171 subprocess.call(["python", "Wessim1.py", \
|
|
172 "-R" + genome_file, \
|
|
173 "-B" + target_file, \
|
|
174 "-n" + str(number_of_reads), \
|
|
175 "-l" + str(read_length), \
|
|
176 "-M" + model_file, \
|
|
177 "-o" + output_file, \
|
|
178 "-t" + str(number_of_threads), \
|
|
179 "-p"], stderr=None)
|
|
180 os.chdir('..')
|
|
181
|
|
182
|
|
183 def simulate_exome_cnv(simulation_parameters, cnv_list_parameters=None):
|
|
184 '''
|
|
185 Simulate copy number variations on the passed reference genome based on the given simulation parameters
|
|
186 :param genome: path to the FASTA file containing the reference genome
|
|
187 :param simulation_parameters: a dictionary containing parameters for simulation
|
|
188 :return: simulation status {'simulation_status': bool, 'message': str}
|
|
189 '''
|
|
190 _log('simulation type: whole exome')
|
|
191
|
|
192 # create a temporary directory for intermediate files
|
|
193 if not os.path.exists(simulation_parameters['tmp_dir']):
|
|
194 os.makedirs(simulation_parameters['tmp_dir'])
|
|
195
|
|
196 # create output directory for final results
|
|
197 if not os.path.exists(simulation_parameters['output_dir']):
|
|
198 os.makedirs(simulation_parameters['output_dir'])
|
|
199
|
|
200 if simulation_parameters['target_file'] is None:
|
|
201 _log("ERROR: target file cannot be None for whole exome simulation!")
|
|
202 fileio.clean(simulation_parameters['output_dir'])
|
|
203 fileio.clean(simulation_parameters['tmp_dir'])
|
|
204 exit()
|
|
205 target_file = simulation_parameters['target_file']
|
|
206
|
|
207 # copy genome and target to the tmp folder
|
|
208 shutil.copyfile(simulation_parameters['genome_file'], os.path.join(simulation_parameters['tmp_dir'], "reference.fa"))
|
|
209 shutil.copyfile(target_file, os.path.join(simulation_parameters['tmp_dir'], "target.bed"))
|
|
210 genome_file = os.path.join(simulation_parameters['tmp_dir'], "reference.fa")
|
|
211 target_file = os.path.join(simulation_parameters['tmp_dir'], "target.bed")
|
|
212
|
|
213 # initialize variables for temporary files
|
|
214 control_genome_file = os.path.join(simulation_parameters['tmp_dir'], "ControlGenome.fa")
|
|
215 control_target_file = os.path.join(simulation_parameters['tmp_dir'], "ControlTarget.bed")
|
|
216 cnv_genome_file = os.path.join(simulation_parameters['tmp_dir'], "CNVGenome.fa")
|
|
217 cnv_target_file = os.path.join(simulation_parameters['tmp_dir'], "CNVTarget.bed")
|
|
218 base_reads_file = os.path.join(simulation_parameters['tmp_dir'], "base")
|
|
219 control_reads_file = os.path.join(simulation_parameters['tmp_dir'], "control")
|
|
220 cnv_reads_file = os.path.join(simulation_parameters['tmp_dir'], "cnv")
|
|
221
|
|
222 _log("loading genome file ..")
|
|
223 header, genome = fileio.readGenome(genome_file)
|
|
224 _log("successfully loaded a genome of length " + `len(genome)`)
|
|
225
|
|
226 _log("loading target file ..")
|
|
227 _log("sorting and merging targets ..")
|
|
228 target_file = fileio.prepareTargetFile(target_file)
|
|
229 targets = fileio.readTargets(target_file)
|
|
230 _log("successfully loaded " + `len(targets)` + " targets ..")
|
|
231
|
|
232 if simulation_parameters['cnv_list_file'] is None:
|
|
233 # CNV list file
|
|
234 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed")
|
|
235
|
|
236 _log("generating CNV list ..")
|
|
237 cnv_matrix = _generateCNVMatrix(targets, cnv_list_parameters['regions_count'])
|
|
238 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \
|
|
239 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \
|
|
240 cnv_list_parameters['maximum_variations'])
|
|
241
|
|
242 with open(cnv_list_file, 'w') as f:
|
|
243 for i, cnv_region in enumerate(cnv_matrix):
|
|
244 line = cnv_region[0][0] + '\t' \
|
|
245 + `cnv_region[0][1]` + '\t' \
|
|
246 + `cnv_region[-1][2]` + '\t' \
|
|
247 + `mask[i]` + '\n'
|
|
248 f.write(line)
|
|
249 _log("randomly generated CNV list saved to " + cnv_list_file)
|
|
250 else:
|
|
251 _log("loading CNV list ..")
|
|
252 with open(simulation_parameters['cnv_list_file'], "r") as f:
|
|
253 cnv_list = []
|
|
254 lines = f.readlines()
|
|
255 for line in lines:
|
|
256 chromosome, region_start, region_end, variation = line.strip().split("\t")
|
|
257 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
|
|
258
|
|
259 cnv_matrix = _loadCNVMatrix(targets, cnv_list)
|
|
260 mask = map(lambda x: x[3], cnv_list)
|
|
261 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")
|
|
262
|
|
263 # call Wessim to generate reads from the genome file and the target list
|
|
264 _log("generating reads for the target exons ..")
|
|
265 _log("delegating job to Wessim ...")
|
|
266 _callWessim(genome_file, target_file, base_reads_file, simulation_parameters['number_of_reads'], simulation_parameters['read_length'])
|
|
267
|
|
268 # generate genome and target files for the control and the CNV
|
|
269 _log("simulating copy number variations (amplifications/deletions)")
|
|
270 control_genome, control_targets, cnv_genome, cnv_targets = _simulateCNV(genome, cnv_matrix, mask, simulation_parameters['read_length'])
|
|
271
|
|
272 _log("saving to the control genome file ..")
|
|
273 with open(control_genome_file, 'w') as fw:
|
|
274 fw.write(header + "\n")
|
|
275 n = 50
|
|
276 l = [control_genome[i:i + n] for i in range(0, len(control_genome), n)]
|
|
277 for line in l:
|
|
278 fw.write(line + "\n")
|
|
279
|
|
280 _log("saving to the control target file ..")
|
|
281 with open(control_target_file, 'w') as tw:
|
|
282 for target in control_targets:
|
|
283 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
|
|
284 tw.write(line)
|
|
285
|
|
286 _log("delegating job to Wessim ...")
|
|
287 _callWessim(control_genome_file, control_target_file, control_reads_file,
|
|
288 int(cnv_list_parameters['deletions'] * simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
|
|
289
|
|
290 _log("saving to the CNV genome file ..")
|
|
291 with open(cnv_genome_file, 'w') as fw:
|
|
292 fw.write(header + "\n")
|
|
293 n = 50
|
|
294 l = [cnv_genome[i:i + n] for i in range(0, len(cnv_genome), n)]
|
|
295 for line in l:
|
|
296 fw.write(line + "\n")
|
|
297
|
|
298 _log("saving to the CNV target file ..")
|
|
299 with open(cnv_target_file, 'w') as tw:
|
|
300 for target in cnv_targets:
|
|
301 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
|
|
302 tw.write(line)
|
|
303
|
|
304 _log("delegating job to Wessim ...")
|
|
305 _callWessim(cnv_genome_file, cnv_target_file, cnv_reads_file,
|
|
306 int(cnv_list_parameters['amplifications']* simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
|
|
307
|
|
308 _log("merging results ..")
|
|
309 fileio.mergeWessimReads(simulation_parameters['tmp_dir'], simulation_parameters['output_dir'])
|
|
310
|
|
311 _log("cleaning temporary files ..")
|
|
312 fileio.clean(simulation_parameters['tmp_dir'])
|
|
313
|
|
314 _log("simulation completed. find results in " + simulation_parameters['output_dir']) |