5
|
1 #!/usr/bin/python
|
|
2
|
|
3 __author__ = 'Abdelrahman Hosny'
|
|
4
|
|
5 import random
|
|
6 import subprocess
|
|
7 import os
|
|
8 import sys
|
|
9 import datetime
|
|
10 import shutil
|
|
11
|
|
12 from . import fileio
|
|
13
|
|
14
|
|
15 def _log(message):
|
|
16 print '[CNV SIM {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + "] " + message
|
|
17
|
|
18
|
|
19 def getScriptPath():
|
|
20 return os.path.dirname(os.path.realpath(__file__))
|
|
21
|
|
22
|
17
|
23 def _generateCNVMatrix(targets_list, regions_count, region_min_length, region_max_length):
|
5
|
24 '''
|
|
25 A function to randomly divide sequential exonic targets into whole CNV regions
|
|
26 :param targets_list: a list of target exons loaded from the file provided by the user
|
|
27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
|
|
28 '''
|
|
29 regions_count -= 1
|
|
30 number_of_targets = len(targets_list)
|
17
|
31 combine_size = number_of_targets / regions_count
|
|
32 cnv_regions = []
|
5
|
33
|
|
34 i = 0
|
|
35 while True:
|
|
36 if i == len(targets_list):
|
|
37 break
|
|
38
|
17
|
39 region_start = targets_list[i][1]
|
|
40 region_end = region_start + random.randint(region_min_length, region_max_length) - 1
|
|
41 cnv_regions.append((targets_list[i][0], region_start, region_end))
|
|
42
|
|
43 i += combine_size
|
5
|
44 if i > len(targets_list):
|
|
45 i = len(targets_list)
|
|
46
|
17
|
47 cnv_matrix = _loadCNVMatrix(targets_list, cnv_regions)
|
|
48
|
|
49 return cnv_matrix, cnv_regions
|
5
|
50
|
|
51
|
|
52 def _loadCNVMatrix(targets_list, cnv_regions):
|
|
53 '''
|
|
54 A function to map targets to their corresponding CNV regions
|
|
55 :param targets_list: a list of target exons loaded from the file provided by the user
|
|
56 :param cnv_regions: a list of CNV regions loaded from the CNV file provided by the user
|
|
57 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
|
|
58 '''
|
|
59 cnv_matrix = []
|
|
60 for cnv in cnv_regions:
|
|
61 region_chromosome = cnv[0]
|
|
62 region_start = cnv[1]
|
|
63 region_end = cnv[2]
|
|
64
|
|
65 region_targets = []
|
|
66 for target in targets_list:
|
|
67 target_chromosome = target[0]
|
|
68 target_start = target[1]
|
|
69 target_end = target[2]
|
17
|
70 if target_chromosome == region_chromosome and target_start >= region_start:
|
|
71 if target_end <= region_end:
|
|
72 region_targets.append(target)
|
|
73 elif target_start <= region_end:
|
|
74 truncated_target = (target_chromosome, target_start, region_end)
|
|
75 region_targets.append(truncated_target)
|
5
|
76
|
|
77 cnv_matrix.append(region_targets)
|
|
78
|
|
79 return cnv_matrix
|
|
80
|
|
81
|
|
82 def _generateCNVMask(number_of_exons, p_amplify, p_delete, min_variation, max_variation):
|
|
83 '''
|
|
84 This function generates random Copy Number Variations mask list
|
|
85 :param number_of_exons: the length of the target regions.
|
|
86 :param p_amplify: percentage of amplifications introduced
|
|
87 :param p_delete: percentage of deletions introduced
|
|
88 :return: a list of the same length as the exons list. each list item
|
|
89 indicates the variation to be added to the exon in the same position.
|
|
90 Positive number: number of amplification
|
|
91 Zero: no change
|
|
92 -1: delete this exon
|
|
93 '''
|
|
94
|
|
95 number_of_amplifications = int(p_amplify * number_of_exons)
|
|
96 number_of_deletions = int(p_delete * number_of_exons)
|
|
97 cnv_mask = [0] * number_of_exons
|
|
98
|
|
99 # generate CNV mask (a list of amplifications and deletions to be applied to the exons)
|
|
100 while number_of_amplifications > 0:
|
|
101 choice = random.randrange(0, number_of_exons)
|
|
102 while cnv_mask[choice] != 0:
|
|
103 choice = random.randrange(0, number_of_exons)
|
|
104 cnv_mask[choice] = random.randrange(min_variation, max_variation) # random amplifications in the range [min_variation, max_variation)
|
|
105 number_of_amplifications -= 1
|
|
106 random.shuffle(cnv_mask)
|
|
107 while number_of_deletions > 0:
|
|
108 choice = random.randrange(0, number_of_exons)
|
|
109 while cnv_mask[choice] != 0:
|
|
110 choice = random.randrange(0, number_of_exons)
|
|
111 cnv_mask[choice] = -1*random.randrange(min_variation, max_variation) # random deletions in the range [min_variation, max_variation)
|
|
112 number_of_deletions -= 1
|
|
113 random.shuffle(cnv_mask)
|
|
114
|
|
115 return cnv_mask
|
|
116
|
|
117
|
|
118 def _simulateCNV(genome, cnv_matrix, mask, read_length):
|
|
119 '''
|
|
120 This function creates two new genome references and target files for control and CNV
|
|
121 :param genome: text string of the genome
|
|
122 :param cnv_matrix: A matrix where rows represent the region index and the first column as a list of targets in this region
|
|
123 :param mask: a list to indicate the variations introduced to each CNV region
|
|
124 :return: control genome, control target list, modified genome and modified target list (in order)
|
|
125 '''
|
|
126
|
|
127 control_genome = []
|
|
128 control_targets = [] # initialize control target list
|
|
129 cnv_genome = []
|
|
130 cnv_targets = [] # initialize cnv target list
|
|
131
|
|
132 CONTROL_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
|
|
133 CNV_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
|
|
134
|
|
135 for k, cnv_region in enumerate(cnv_matrix):
|
|
136
|
|
137 number_of_copies = mask[k]
|
|
138
|
|
139 if number_of_copies > 0:
|
|
140 # if amplification
|
|
141 for target in cnv_region:
|
|
142 for _ in range(number_of_copies):
|
|
143 amplification = genome[target[1]-read_length:target[2]+read_length]
|
|
144 cnv_genome.append(amplification)
|
|
145 chromosome = target[0]
|
|
146 start = CNV_APPEND_INDEX + read_length
|
|
147 end = CNV_APPEND_INDEX + len(amplification) - read_length
|
|
148 cnv_targets.append((chromosome, start, end))
|
|
149 CNV_APPEND_INDEX += len(amplification)
|
|
150 elif number_of_copies < 0:
|
|
151 # if deletion
|
|
152 for target in cnv_region:
|
|
153 for _ in range(abs(number_of_copies)):
|
|
154 deletion = genome[target[1]-read_length:target[2]+read_length]
|
|
155 control_genome.append(deletion)
|
|
156 chromosome = target[0]
|
|
157 start = CONTROL_APPEND_INDEX + read_length
|
|
158 end = CONTROL_APPEND_INDEX + len(deletion) - read_length
|
|
159 control_targets.append((chromosome, start, end))
|
|
160 CONTROL_APPEND_INDEX += len(deletion)
|
|
161
|
|
162 return ''.join(control_genome), control_targets, ''.join(cnv_genome), cnv_targets
|
|
163
|
|
164
|
|
165 def _callWessim(genome_file, target_file, output_file, number_of_reads, read_length, model_file="models/ill100v4_p.gzip", number_of_threads=4):
|
|
166 '''
|
|
167 Calls Wessim to generate artificial reads for the targets on the reference genome
|
|
168 :param genome_file: reference genome file in FASTA format
|
|
169 :param target_file: target regions file in BED format
|
|
170 :param output_file: output file name
|
|
171 :param number_of_reads: the number of reads to generate for this simulation
|
|
172 :param read_length: the read length
|
|
173 :param model_file: GemSim's empirical error models used by Wessim for NGS read generation
|
|
174 :param number_of_threads: the number of threads to run this simulation
|
|
175 :return: None
|
|
176 '''
|
|
177 os.chdir(os.path.join(getScriptPath(), "Wessim" ))
|
|
178 subprocess.call(["python", "Wessim1.py", \
|
|
179 "-R" + genome_file, \
|
|
180 "-B" + target_file, \
|
|
181 "-n" + str(number_of_reads), \
|
|
182 "-l" + str(read_length), \
|
|
183 "-M" + model_file, \
|
|
184 "-o" + output_file, \
|
|
185 "-t" + str(number_of_threads), \
|
|
186 "-p"], stderr=None)
|
|
187 os.chdir('..')
|
|
188
|
|
189
|
|
190 def simulate_exome_cnv(simulation_parameters, cnv_list_parameters=None):
|
|
191 '''
|
|
192 Simulate copy number variations on the passed reference genome based on the given simulation parameters
|
|
193 :param genome: path to the FASTA file containing the reference genome
|
|
194 :param simulation_parameters: a dictionary containing parameters for simulation
|
|
195 :return: simulation status {'simulation_status': bool, 'message': str}
|
|
196 '''
|
17
|
197 _log('simulation type: targeted exome sequencing')
|
5
|
198
|
|
199 # create a temporary directory for intermediate files
|
|
200 if not os.path.exists(simulation_parameters['tmp_dir']):
|
|
201 os.makedirs(simulation_parameters['tmp_dir'])
|
|
202
|
|
203 # create output directory for final results
|
|
204 if not os.path.exists(simulation_parameters['output_dir']):
|
|
205 os.makedirs(simulation_parameters['output_dir'])
|
|
206
|
|
207 if simulation_parameters['target_file'] is None:
|
17
|
208 _log("ERROR: target file cannot be None for target exome simulation!")
|
5
|
209 fileio.clean(simulation_parameters['output_dir'])
|
|
210 fileio.clean(simulation_parameters['tmp_dir'])
|
|
211 exit()
|
|
212 target_file = simulation_parameters['target_file']
|
|
213
|
|
214 # copy genome and target to the tmp folder
|
|
215 shutil.copyfile(simulation_parameters['genome_file'], os.path.join(simulation_parameters['tmp_dir'], "reference.fa"))
|
|
216 shutil.copyfile(target_file, os.path.join(simulation_parameters['tmp_dir'], "target.bed"))
|
|
217 genome_file = os.path.join(simulation_parameters['tmp_dir'], "reference.fa")
|
|
218 target_file = os.path.join(simulation_parameters['tmp_dir'], "target.bed")
|
|
219
|
|
220 # initialize variables for temporary files
|
|
221 control_genome_file = os.path.join(simulation_parameters['tmp_dir'], "ControlGenome.fa")
|
|
222 control_target_file = os.path.join(simulation_parameters['tmp_dir'], "ControlTarget.bed")
|
|
223 cnv_genome_file = os.path.join(simulation_parameters['tmp_dir'], "CNVGenome.fa")
|
|
224 cnv_target_file = os.path.join(simulation_parameters['tmp_dir'], "CNVTarget.bed")
|
|
225 base_reads_file = os.path.join(simulation_parameters['tmp_dir'], "base")
|
|
226 control_reads_file = os.path.join(simulation_parameters['tmp_dir'], "control")
|
|
227 cnv_reads_file = os.path.join(simulation_parameters['tmp_dir'], "cnv")
|
|
228
|
|
229 _log("loading genome file ..")
|
|
230 header, genome = fileio.readGenome(genome_file)
|
|
231 _log("successfully loaded a genome of length " + `len(genome)`)
|
|
232
|
|
233 _log("loading target file ..")
|
|
234 _log("sorting and merging targets ..")
|
|
235 target_file = fileio.prepareTargetFile(target_file)
|
|
236 targets = fileio.readTargets(target_file)
|
|
237 _log("successfully loaded " + `len(targets)` + " targets ..")
|
|
238
|
|
239 if simulation_parameters['cnv_list_file'] is None:
|
|
240 # CNV list file
|
17
|
241 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed")
|
5
|
242
|
|
243 _log("generating CNV list ..")
|
17
|
244 cnv_matrix, cnv_regions = _generateCNVMatrix(targets, cnv_list_parameters['regions_count'], \
|
|
245 cnv_list_parameters['minimum_length'], cnv_list_parameters['maximum_length'])
|
5
|
246 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \
|
|
247 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \
|
|
248 cnv_list_parameters['maximum_variations'])
|
|
249
|
|
250 with open(cnv_list_file, 'w') as f:
|
17
|
251 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n'
|
|
252 f.write(line)
|
|
253 for i, cnv_region in enumerate(cnv_regions):
|
|
254 num_positions = cnv_region[2] - cnv_region[1] + 1
|
|
255 line = cnv_region[0] + '\t' \
|
|
256 + `cnv_region[1]` + '\t' \
|
|
257 + `cnv_region[2]` + '\t' \
|
|
258 + `num_positions` + '\t' \
|
5
|
259 + `mask[i]` + '\n'
|
|
260 f.write(line)
|
|
261 _log("randomly generated CNV list saved to " + cnv_list_file)
|
|
262 else:
|
|
263 _log("loading CNV list ..")
|
|
264 with open(simulation_parameters['cnv_list_file'], "r") as f:
|
|
265 cnv_list = []
|
|
266 lines = f.readlines()
|
17
|
267 lines.pop(0)
|
5
|
268 for line in lines:
|
17
|
269 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t")
|
5
|
270 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
|
|
271
|
|
272 cnv_matrix = _loadCNVMatrix(targets, cnv_list)
|
|
273 mask = map(lambda x: x[3], cnv_list)
|
|
274 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")
|
|
275
|
|
276 # call Wessim to generate reads from the genome file and the target list
|
|
277 _log("generating reads for the target exons ..")
|
|
278 _log("delegating job to Wessim ...")
|
|
279 _callWessim(genome_file, target_file, base_reads_file, simulation_parameters['number_of_reads'], simulation_parameters['read_length'])
|
|
280
|
|
281 # generate genome and target files for the control and the CNV
|
|
282 _log("simulating copy number variations (amplifications/deletions)")
|
|
283 control_genome, control_targets, cnv_genome, cnv_targets = _simulateCNV(genome, cnv_matrix, mask, simulation_parameters['read_length'])
|
|
284
|
|
285 _log("saving to the control genome file ..")
|
|
286 with open(control_genome_file, 'w') as fw:
|
|
287 fw.write(header + "\n")
|
|
288 n = 50
|
|
289 l = [control_genome[i:i + n] for i in range(0, len(control_genome), n)]
|
|
290 for line in l:
|
|
291 fw.write(line + "\n")
|
|
292
|
|
293 _log("saving to the control target file ..")
|
|
294 with open(control_target_file, 'w') as tw:
|
|
295 for target in control_targets:
|
|
296 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
|
|
297 tw.write(line)
|
|
298
|
|
299 _log("delegating job to Wessim ...")
|
|
300 _callWessim(control_genome_file, control_target_file, control_reads_file,
|
|
301 int(cnv_list_parameters['deletions'] * simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
|
|
302
|
|
303 _log("saving to the CNV genome file ..")
|
|
304 with open(cnv_genome_file, 'w') as fw:
|
|
305 fw.write(header + "\n")
|
|
306 n = 50
|
|
307 l = [cnv_genome[i:i + n] for i in range(0, len(cnv_genome), n)]
|
|
308 for line in l:
|
|
309 fw.write(line + "\n")
|
|
310
|
|
311 _log("saving to the CNV target file ..")
|
|
312 with open(cnv_target_file, 'w') as tw:
|
|
313 for target in cnv_targets:
|
|
314 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
|
|
315 tw.write(line)
|
|
316
|
|
317 _log("delegating job to Wessim ...")
|
|
318 _callWessim(cnv_genome_file, cnv_target_file, cnv_reads_file,
|
|
319 int(cnv_list_parameters['amplifications']* simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
|
|
320
|
|
321 _log("merging results ..")
|
|
322 fileio.mergeWessimReads(simulation_parameters['tmp_dir'], simulation_parameters['output_dir'])
|
|
323
|
|
324 _log("cleaning temporary files ..")
|
|
325 fileio.clean(simulation_parameters['tmp_dir'])
|
|
326
|
|
327 _log("simulation completed. find results in " + simulation_parameters['output_dir']) |