comparison cnvsim/exome_simulator.py @ 17:e4ebf3435054 draft

Uploaded
author ahosny
date Wed, 07 Sep 2016 09:36:31 -0400
parents 63955244b2fa
children
comparison
equal deleted inserted replaced
16:7e8047d291e4 17:e4ebf3435054
18 18
19 def getScriptPath(): 19 def getScriptPath():
20 return os.path.dirname(os.path.realpath(__file__)) 20 return os.path.dirname(os.path.realpath(__file__))
21 21
22 22
23 def _generateCNVMatrix(targets_list, regions_count): 23 def _generateCNVMatrix(targets_list, regions_count, region_min_length, region_max_length):
24 ''' 24 '''
25 A function to randomly divide sequential exonic targets into whole CNV regions 25 A function to randomly divide sequential exonic targets into whole CNV regions
26 :param targets_list: a list of target exons loaded from the file provided by the user 26 :param targets_list: a list of target exons loaded from the file provided by the user
27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region 27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
28 ''' 28 '''
29 regions_count -= 1 29 regions_count -= 1
30 number_of_targets = len(targets_list) 30 number_of_targets = len(targets_list)
31 comine_size = number_of_targets / regions_count 31 combine_size = number_of_targets / regions_count
32 cnv_matrix = [] 32 cnv_regions = []
33 33
34 i = 0 34 i = 0
35 while True: 35 while True:
36 if i == len(targets_list): 36 if i == len(targets_list):
37 break 37 break
38 38
39 first_target = i 39 region_start = targets_list[i][1]
40 i += comine_size 40 region_end = region_start + random.randint(region_min_length, region_max_length) - 1
41 cnv_regions.append((targets_list[i][0], region_start, region_end))
42
43 i += combine_size
41 if i > len(targets_list): 44 if i > len(targets_list):
42 i = len(targets_list) 45 i = len(targets_list)
43 last_target = i 46
44 cnv_matrix.append(targets_list[first_target:last_target]) 47 cnv_matrix = _loadCNVMatrix(targets_list, cnv_regions)
45 48
46 return cnv_matrix 49 return cnv_matrix, cnv_regions
47 50
48 51
49 def _loadCNVMatrix(targets_list, cnv_regions): 52 def _loadCNVMatrix(targets_list, cnv_regions):
50 ''' 53 '''
51 A function to map targets to their corresponding CNV regions 54 A function to map targets to their corresponding CNV regions
62 region_targets = [] 65 region_targets = []
63 for target in targets_list: 66 for target in targets_list:
64 target_chromosome = target[0] 67 target_chromosome = target[0]
65 target_start = target[1] 68 target_start = target[1]
66 target_end = target[2] 69 target_end = target[2]
67 if target_chromosome == region_chromosome and target_start >= region_start and target_end <= region_end: 70 if target_chromosome == region_chromosome and target_start >= region_start:
68 region_targets.append(target) 71 if target_end <= region_end:
72 region_targets.append(target)
73 elif target_start <= region_end:
74 truncated_target = (target_chromosome, target_start, region_end)
75 region_targets.append(truncated_target)
69 76
70 cnv_matrix.append(region_targets) 77 cnv_matrix.append(region_targets)
71 78
72 return cnv_matrix 79 return cnv_matrix
73 80
185 Simulate copy number variations on the passed reference genome based on the given simulation parameters 192 Simulate copy number variations on the passed reference genome based on the given simulation parameters
186 :param genome: path to the FASTA file containing the reference genome 193 :param genome: path to the FASTA file containing the reference genome
187 :param simulation_parameters: a dictionary containing parameters for simulation 194 :param simulation_parameters: a dictionary containing parameters for simulation
188 :return: simulation status {'simulation_status': bool, 'message': str} 195 :return: simulation status {'simulation_status': bool, 'message': str}
189 ''' 196 '''
190 _log('simulation type: whole exome') 197 _log('simulation type: targeted exome sequencing')
191 198
192 # create a temporary directory for intermediate files 199 # create a temporary directory for intermediate files
193 if not os.path.exists(simulation_parameters['tmp_dir']): 200 if not os.path.exists(simulation_parameters['tmp_dir']):
194 os.makedirs(simulation_parameters['tmp_dir']) 201 os.makedirs(simulation_parameters['tmp_dir'])
195 202
196 # create output directory for final results 203 # create output directory for final results
197 if not os.path.exists(simulation_parameters['output_dir']): 204 if not os.path.exists(simulation_parameters['output_dir']):
198 os.makedirs(simulation_parameters['output_dir']) 205 os.makedirs(simulation_parameters['output_dir'])
199 206
200 if simulation_parameters['target_file'] is None: 207 if simulation_parameters['target_file'] is None:
201 _log("ERROR: target file cannot be None for whole exome simulation!") 208 _log("ERROR: target file cannot be None for target exome simulation!")
202 fileio.clean(simulation_parameters['output_dir']) 209 fileio.clean(simulation_parameters['output_dir'])
203 fileio.clean(simulation_parameters['tmp_dir']) 210 fileio.clean(simulation_parameters['tmp_dir'])
204 exit() 211 exit()
205 target_file = simulation_parameters['target_file'] 212 target_file = simulation_parameters['target_file']
206 213
229 targets = fileio.readTargets(target_file) 236 targets = fileio.readTargets(target_file)
230 _log("successfully loaded " + `len(targets)` + " targets ..") 237 _log("successfully loaded " + `len(targets)` + " targets ..")
231 238
232 if simulation_parameters['cnv_list_file'] is None: 239 if simulation_parameters['cnv_list_file'] is None:
233 # CNV list file 240 # CNV list file
234 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed") 241 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed")
235 242
236 _log("generating CNV list ..") 243 _log("generating CNV list ..")
237 cnv_matrix = _generateCNVMatrix(targets, cnv_list_parameters['regions_count']) 244 cnv_matrix, cnv_regions = _generateCNVMatrix(targets, cnv_list_parameters['regions_count'], \
245 cnv_list_parameters['minimum_length'], cnv_list_parameters['maximum_length'])
238 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \ 246 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \
239 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \ 247 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \
240 cnv_list_parameters['maximum_variations']) 248 cnv_list_parameters['maximum_variations'])
241 249
242 with open(cnv_list_file, 'w') as f: 250 with open(cnv_list_file, 'w') as f:
243 for i, cnv_region in enumerate(cnv_matrix): 251 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n'
244 line = cnv_region[0][0] + '\t' \ 252 f.write(line)
245 + `cnv_region[0][1]` + '\t' \ 253 for i, cnv_region in enumerate(cnv_regions):
246 + `cnv_region[-1][2]` + '\t' \ 254 num_positions = cnv_region[2] - cnv_region[1] + 1
255 line = cnv_region[0] + '\t' \
256 + `cnv_region[1]` + '\t' \
257 + `cnv_region[2]` + '\t' \
258 + `num_positions` + '\t' \
247 + `mask[i]` + '\n' 259 + `mask[i]` + '\n'
248 f.write(line) 260 f.write(line)
249 _log("randomly generated CNV list saved to " + cnv_list_file) 261 _log("randomly generated CNV list saved to " + cnv_list_file)
250 else: 262 else:
251 _log("loading CNV list ..") 263 _log("loading CNV list ..")
252 with open(simulation_parameters['cnv_list_file'], "r") as f: 264 with open(simulation_parameters['cnv_list_file'], "r") as f:
253 cnv_list = [] 265 cnv_list = []
254 lines = f.readlines() 266 lines = f.readlines()
267 lines.pop(0)
255 for line in lines: 268 for line in lines:
256 chromosome, region_start, region_end, variation = line.strip().split("\t") 269 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t")
257 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation))) 270 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
258 271
259 cnv_matrix = _loadCNVMatrix(targets, cnv_list) 272 cnv_matrix = _loadCNVMatrix(targets, cnv_list)
260 mask = map(lambda x: x[3], cnv_list) 273 mask = map(lambda x: x[3], cnv_list)
261 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..") 274 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")