Mercurial > repos > ahosny > cnvsim
comparison cnvsim/exome_simulator.py @ 17:e4ebf3435054 draft
Uploaded
author | ahosny |
---|---|
date | Wed, 07 Sep 2016 09:36:31 -0400 |
parents | 63955244b2fa |
children |
comparison
equal
deleted
inserted
replaced
16:7e8047d291e4 | 17:e4ebf3435054 |
---|---|
18 | 18 |
19 def getScriptPath(): | 19 def getScriptPath(): |
20 return os.path.dirname(os.path.realpath(__file__)) | 20 return os.path.dirname(os.path.realpath(__file__)) |
21 | 21 |
22 | 22 |
23 def _generateCNVMatrix(targets_list, regions_count): | 23 def _generateCNVMatrix(targets_list, regions_count, region_min_length, region_max_length): |
24 ''' | 24 ''' |
25 A function to randomly divide sequential exonic targets into whole CNV regions | 25 A function to randomly divide sequential exonic targets into whole CNV regions |
26 :param targets_list: a list of target exons loaded from the file provided by the user | 26 :param targets_list: a list of target exons loaded from the file provided by the user |
27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region | 27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region |
28 ''' | 28 ''' |
29 regions_count -= 1 | 29 regions_count -= 1 |
30 number_of_targets = len(targets_list) | 30 number_of_targets = len(targets_list) |
31 comine_size = number_of_targets / regions_count | 31 combine_size = number_of_targets / regions_count |
32 cnv_matrix = [] | 32 cnv_regions = [] |
33 | 33 |
34 i = 0 | 34 i = 0 |
35 while True: | 35 while True: |
36 if i == len(targets_list): | 36 if i == len(targets_list): |
37 break | 37 break |
38 | 38 |
39 first_target = i | 39 region_start = targets_list[i][1] |
40 i += comine_size | 40 region_end = region_start + random.randint(region_min_length, region_max_length) - 1 |
41 cnv_regions.append((targets_list[i][0], region_start, region_end)) | |
42 | |
43 i += combine_size | |
41 if i > len(targets_list): | 44 if i > len(targets_list): |
42 i = len(targets_list) | 45 i = len(targets_list) |
43 last_target = i | 46 |
44 cnv_matrix.append(targets_list[first_target:last_target]) | 47 cnv_matrix = _loadCNVMatrix(targets_list, cnv_regions) |
45 | 48 |
46 return cnv_matrix | 49 return cnv_matrix, cnv_regions |
47 | 50 |
48 | 51 |
49 def _loadCNVMatrix(targets_list, cnv_regions): | 52 def _loadCNVMatrix(targets_list, cnv_regions): |
50 ''' | 53 ''' |
51 A function to map targets to their corresponding CNV regions | 54 A function to map targets to their corresponding CNV regions |
62 region_targets = [] | 65 region_targets = [] |
63 for target in targets_list: | 66 for target in targets_list: |
64 target_chromosome = target[0] | 67 target_chromosome = target[0] |
65 target_start = target[1] | 68 target_start = target[1] |
66 target_end = target[2] | 69 target_end = target[2] |
67 if target_chromosome == region_chromosome and target_start >= region_start and target_end <= region_end: | 70 if target_chromosome == region_chromosome and target_start >= region_start: |
68 region_targets.append(target) | 71 if target_end <= region_end: |
72 region_targets.append(target) | |
73 elif target_start <= region_end: | |
74 truncated_target = (target_chromosome, target_start, region_end) | |
75 region_targets.append(truncated_target) | |
69 | 76 |
70 cnv_matrix.append(region_targets) | 77 cnv_matrix.append(region_targets) |
71 | 78 |
72 return cnv_matrix | 79 return cnv_matrix |
73 | 80 |
185 Simulate copy number variations on the passed reference genome based on the given simulation parameters | 192 Simulate copy number variations on the passed reference genome based on the given simulation parameters |
186 :param genome: path to the FASTA file containing the reference genome | 193 :param genome: path to the FASTA file containing the reference genome |
187 :param simulation_parameters: a dictionary containing parameters for simulation | 194 :param simulation_parameters: a dictionary containing parameters for simulation |
188 :return: simulation status {'simulation_status': bool, 'message': str} | 195 :return: simulation status {'simulation_status': bool, 'message': str} |
189 ''' | 196 ''' |
190 _log('simulation type: whole exome') | 197 _log('simulation type: targeted exome sequencing') |
191 | 198 |
192 # create a temporary directory for intermediate files | 199 # create a temporary directory for intermediate files |
193 if not os.path.exists(simulation_parameters['tmp_dir']): | 200 if not os.path.exists(simulation_parameters['tmp_dir']): |
194 os.makedirs(simulation_parameters['tmp_dir']) | 201 os.makedirs(simulation_parameters['tmp_dir']) |
195 | 202 |
196 # create output directory for final results | 203 # create output directory for final results |
197 if not os.path.exists(simulation_parameters['output_dir']): | 204 if not os.path.exists(simulation_parameters['output_dir']): |
198 os.makedirs(simulation_parameters['output_dir']) | 205 os.makedirs(simulation_parameters['output_dir']) |
199 | 206 |
200 if simulation_parameters['target_file'] is None: | 207 if simulation_parameters['target_file'] is None: |
201 _log("ERROR: target file cannot be None for whole exome simulation!") | 208 _log("ERROR: target file cannot be None for target exome simulation!") |
202 fileio.clean(simulation_parameters['output_dir']) | 209 fileio.clean(simulation_parameters['output_dir']) |
203 fileio.clean(simulation_parameters['tmp_dir']) | 210 fileio.clean(simulation_parameters['tmp_dir']) |
204 exit() | 211 exit() |
205 target_file = simulation_parameters['target_file'] | 212 target_file = simulation_parameters['target_file'] |
206 | 213 |
229 targets = fileio.readTargets(target_file) | 236 targets = fileio.readTargets(target_file) |
230 _log("successfully loaded " + `len(targets)` + " targets ..") | 237 _log("successfully loaded " + `len(targets)` + " targets ..") |
231 | 238 |
232 if simulation_parameters['cnv_list_file'] is None: | 239 if simulation_parameters['cnv_list_file'] is None: |
233 # CNV list file | 240 # CNV list file |
234 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed") | 241 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed") |
235 | 242 |
236 _log("generating CNV list ..") | 243 _log("generating CNV list ..") |
237 cnv_matrix = _generateCNVMatrix(targets, cnv_list_parameters['regions_count']) | 244 cnv_matrix, cnv_regions = _generateCNVMatrix(targets, cnv_list_parameters['regions_count'], \ |
245 cnv_list_parameters['minimum_length'], cnv_list_parameters['maximum_length']) | |
238 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \ | 246 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \ |
239 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \ | 247 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \ |
240 cnv_list_parameters['maximum_variations']) | 248 cnv_list_parameters['maximum_variations']) |
241 | 249 |
242 with open(cnv_list_file, 'w') as f: | 250 with open(cnv_list_file, 'w') as f: |
243 for i, cnv_region in enumerate(cnv_matrix): | 251 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n' |
244 line = cnv_region[0][0] + '\t' \ | 252 f.write(line) |
245 + `cnv_region[0][1]` + '\t' \ | 253 for i, cnv_region in enumerate(cnv_regions): |
246 + `cnv_region[-1][2]` + '\t' \ | 254 num_positions = cnv_region[2] - cnv_region[1] + 1 |
255 line = cnv_region[0] + '\t' \ | |
256 + `cnv_region[1]` + '\t' \ | |
257 + `cnv_region[2]` + '\t' \ | |
258 + `num_positions` + '\t' \ | |
247 + `mask[i]` + '\n' | 259 + `mask[i]` + '\n' |
248 f.write(line) | 260 f.write(line) |
249 _log("randomly generated CNV list saved to " + cnv_list_file) | 261 _log("randomly generated CNV list saved to " + cnv_list_file) |
250 else: | 262 else: |
251 _log("loading CNV list ..") | 263 _log("loading CNV list ..") |
252 with open(simulation_parameters['cnv_list_file'], "r") as f: | 264 with open(simulation_parameters['cnv_list_file'], "r") as f: |
253 cnv_list = [] | 265 cnv_list = [] |
254 lines = f.readlines() | 266 lines = f.readlines() |
267 lines.pop(0) | |
255 for line in lines: | 268 for line in lines: |
256 chromosome, region_start, region_end, variation = line.strip().split("\t") | 269 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t") |
257 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation))) | 270 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation))) |
258 | 271 |
259 cnv_matrix = _loadCNVMatrix(targets, cnv_list) | 272 cnv_matrix = _loadCNVMatrix(targets, cnv_list) |
260 mask = map(lambda x: x[3], cnv_list) | 273 mask = map(lambda x: x[3], cnv_list) |
261 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..") | 274 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..") |