annotate cnvsim/exome_simulator.py @ 18:eca72016b5b3 draft default tip

Uploaded
author ahosny
date Wed, 07 Sep 2016 09:37:49 -0400
parents e4ebf3435054
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
1 #!/usr/bin/python
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
2
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
3 __author__ = 'Abdelrahman Hosny'
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
4
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
5 import random
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
6 import subprocess
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
7 import os
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
8 import sys
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
9 import datetime
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
10 import shutil
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
11
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
12 from . import fileio
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
13
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
14
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
15 def _log(message):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
16 print '[CNV SIM {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + "] " + message
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
17
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
18
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
19 def getScriptPath():
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
20 return os.path.dirname(os.path.realpath(__file__))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
21
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
22
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
23 def _generateCNVMatrix(targets_list, regions_count, region_min_length, region_max_length):
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
24 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
25 A function to randomly divide sequential exonic targets into whole CNV regions
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
26 :param targets_list: a list of target exons loaded from the file provided by the user
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
28 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
29 regions_count -= 1
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
30 number_of_targets = len(targets_list)
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
31 combine_size = number_of_targets / regions_count
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
32 cnv_regions = []
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
33
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
34 i = 0
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
35 while True:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
36 if i == len(targets_list):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
37 break
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
38
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
39 region_start = targets_list[i][1]
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
40 region_end = region_start + random.randint(region_min_length, region_max_length) - 1
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
41 cnv_regions.append((targets_list[i][0], region_start, region_end))
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
42
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
43 i += combine_size
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
44 if i > len(targets_list):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
45 i = len(targets_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
46
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
47 cnv_matrix = _loadCNVMatrix(targets_list, cnv_regions)
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
48
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
49 return cnv_matrix, cnv_regions
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
50
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
51
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
52 def _loadCNVMatrix(targets_list, cnv_regions):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
53 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
54 A function to map targets to their corresponding CNV regions
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
55 :param targets_list: a list of target exons loaded from the file provided by the user
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
56 :param cnv_regions: a list of CNV regions loaded from the CNV file provided by the user
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
57 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
58 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
59 cnv_matrix = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
60 for cnv in cnv_regions:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
61 region_chromosome = cnv[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
62 region_start = cnv[1]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
63 region_end = cnv[2]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
64
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
65 region_targets = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
66 for target in targets_list:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
67 target_chromosome = target[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
68 target_start = target[1]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
69 target_end = target[2]
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
70 if target_chromosome == region_chromosome and target_start >= region_start:
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
71 if target_end <= region_end:
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
72 region_targets.append(target)
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
73 elif target_start <= region_end:
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
74 truncated_target = (target_chromosome, target_start, region_end)
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
75 region_targets.append(truncated_target)
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
76
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
77 cnv_matrix.append(region_targets)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
78
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
79 return cnv_matrix
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
80
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
81
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
82 def _generateCNVMask(number_of_exons, p_amplify, p_delete, min_variation, max_variation):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
83 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
84 This function generates random Copy Number Variations mask list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
85 :param number_of_exons: the length of the target regions.
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
86 :param p_amplify: percentage of amplifications introduced
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
87 :param p_delete: percentage of deletions introduced
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
88 :return: a list of the same length as the exons list. each list item
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
89 indicates the variation to be added to the exon in the same position.
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
90 Positive number: number of amplification
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
91 Zero: no change
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
92 -1: delete this exon
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
93 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
94
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
95 number_of_amplifications = int(p_amplify * number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
96 number_of_deletions = int(p_delete * number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
97 cnv_mask = [0] * number_of_exons
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
98
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
99 # generate CNV mask (a list of amplifications and deletions to be applied to the exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
100 while number_of_amplifications > 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
101 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
102 while cnv_mask[choice] != 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
103 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
104 cnv_mask[choice] = random.randrange(min_variation, max_variation) # random amplifications in the range [min_variation, max_variation)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
105 number_of_amplifications -= 1
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
106 random.shuffle(cnv_mask)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
107 while number_of_deletions > 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
108 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
109 while cnv_mask[choice] != 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
110 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
111 cnv_mask[choice] = -1*random.randrange(min_variation, max_variation) # random deletions in the range [min_variation, max_variation)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
112 number_of_deletions -= 1
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
113 random.shuffle(cnv_mask)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
114
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
115 return cnv_mask
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
116
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
117
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
118 def _simulateCNV(genome, cnv_matrix, mask, read_length):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
119 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
120 This function creates two new genome references and target files for control and CNV
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
121 :param genome: text string of the genome
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
122 :param cnv_matrix: A matrix where rows represent the region index and the first column as a list of targets in this region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
123 :param mask: a list to indicate the variations introduced to each CNV region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
124 :return: control genome, control target list, modified genome and modified target list (in order)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
125 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
126
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
127 control_genome = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
128 control_targets = [] # initialize control target list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
129 cnv_genome = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
130 cnv_targets = [] # initialize cnv target list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
131
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
132 CONTROL_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
133 CNV_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
134
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
135 for k, cnv_region in enumerate(cnv_matrix):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
136
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
137 number_of_copies = mask[k]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
138
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
139 if number_of_copies > 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
140 # if amplification
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
141 for target in cnv_region:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
142 for _ in range(number_of_copies):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
143 amplification = genome[target[1]-read_length:target[2]+read_length]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
144 cnv_genome.append(amplification)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
145 chromosome = target[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
146 start = CNV_APPEND_INDEX + read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
147 end = CNV_APPEND_INDEX + len(amplification) - read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
148 cnv_targets.append((chromosome, start, end))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
149 CNV_APPEND_INDEX += len(amplification)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
150 elif number_of_copies < 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
151 # if deletion
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
152 for target in cnv_region:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
153 for _ in range(abs(number_of_copies)):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
154 deletion = genome[target[1]-read_length:target[2]+read_length]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
155 control_genome.append(deletion)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
156 chromosome = target[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
157 start = CONTROL_APPEND_INDEX + read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
158 end = CONTROL_APPEND_INDEX + len(deletion) - read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
159 control_targets.append((chromosome, start, end))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
160 CONTROL_APPEND_INDEX += len(deletion)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
161
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
162 return ''.join(control_genome), control_targets, ''.join(cnv_genome), cnv_targets
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
163
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
164
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
165 def _callWessim(genome_file, target_file, output_file, number_of_reads, read_length, model_file="models/ill100v4_p.gzip", number_of_threads=4):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
166 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
167 Calls Wessim to generate artificial reads for the targets on the reference genome
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
168 :param genome_file: reference genome file in FASTA format
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
169 :param target_file: target regions file in BED format
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
170 :param output_file: output file name
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
171 :param number_of_reads: the number of reads to generate for this simulation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
172 :param read_length: the read length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
173 :param model_file: GemSim's empirical error models used by Wessim for NGS read generation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
174 :param number_of_threads: the number of threads to run this simulation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
175 :return: None
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
176 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
177 os.chdir(os.path.join(getScriptPath(), "Wessim" ))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
178 subprocess.call(["python", "Wessim1.py", \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
179 "-R" + genome_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
180 "-B" + target_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
181 "-n" + str(number_of_reads), \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
182 "-l" + str(read_length), \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
183 "-M" + model_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
184 "-o" + output_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
185 "-t" + str(number_of_threads), \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
186 "-p"], stderr=None)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
187 os.chdir('..')
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
188
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
189
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
190 def simulate_exome_cnv(simulation_parameters, cnv_list_parameters=None):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
191 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
192 Simulate copy number variations on the passed reference genome based on the given simulation parameters
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
193 :param genome: path to the FASTA file containing the reference genome
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
194 :param simulation_parameters: a dictionary containing parameters for simulation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
195 :return: simulation status {'simulation_status': bool, 'message': str}
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
196 '''
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
197 _log('simulation type: targeted exome sequencing')
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
198
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
199 # create a temporary directory for intermediate files
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
200 if not os.path.exists(simulation_parameters['tmp_dir']):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
201 os.makedirs(simulation_parameters['tmp_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
202
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
203 # create output directory for final results
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
204 if not os.path.exists(simulation_parameters['output_dir']):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
205 os.makedirs(simulation_parameters['output_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
206
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
207 if simulation_parameters['target_file'] is None:
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
208 _log("ERROR: target file cannot be None for target exome simulation!")
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
209 fileio.clean(simulation_parameters['output_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
210 fileio.clean(simulation_parameters['tmp_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
211 exit()
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
212 target_file = simulation_parameters['target_file']
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
213
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
214 # copy genome and target to the tmp folder
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
215 shutil.copyfile(simulation_parameters['genome_file'], os.path.join(simulation_parameters['tmp_dir'], "reference.fa"))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
216 shutil.copyfile(target_file, os.path.join(simulation_parameters['tmp_dir'], "target.bed"))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
217 genome_file = os.path.join(simulation_parameters['tmp_dir'], "reference.fa")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
218 target_file = os.path.join(simulation_parameters['tmp_dir'], "target.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
219
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
220 # initialize variables for temporary files
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
221 control_genome_file = os.path.join(simulation_parameters['tmp_dir'], "ControlGenome.fa")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
222 control_target_file = os.path.join(simulation_parameters['tmp_dir'], "ControlTarget.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
223 cnv_genome_file = os.path.join(simulation_parameters['tmp_dir'], "CNVGenome.fa")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
224 cnv_target_file = os.path.join(simulation_parameters['tmp_dir'], "CNVTarget.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
225 base_reads_file = os.path.join(simulation_parameters['tmp_dir'], "base")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
226 control_reads_file = os.path.join(simulation_parameters['tmp_dir'], "control")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
227 cnv_reads_file = os.path.join(simulation_parameters['tmp_dir'], "cnv")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
228
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
229 _log("loading genome file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
230 header, genome = fileio.readGenome(genome_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
231 _log("successfully loaded a genome of length " + `len(genome)`)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
232
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
233 _log("loading target file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
234 _log("sorting and merging targets ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
235 target_file = fileio.prepareTargetFile(target_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
236 targets = fileio.readTargets(target_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
237 _log("successfully loaded " + `len(targets)` + " targets ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
238
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
239 if simulation_parameters['cnv_list_file'] is None:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
240 # CNV list file
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
241 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed")
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
242
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
243 _log("generating CNV list ..")
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
244 cnv_matrix, cnv_regions = _generateCNVMatrix(targets, cnv_list_parameters['regions_count'], \
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
245 cnv_list_parameters['minimum_length'], cnv_list_parameters['maximum_length'])
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
246 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
247 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
248 cnv_list_parameters['maximum_variations'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
249
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
250 with open(cnv_list_file, 'w') as f:
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
251 line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n'
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
252 f.write(line)
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
253 for i, cnv_region in enumerate(cnv_regions):
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
254 num_positions = cnv_region[2] - cnv_region[1] + 1
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
255 line = cnv_region[0] + '\t' \
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
256 + `cnv_region[1]` + '\t' \
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
257 + `cnv_region[2]` + '\t' \
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
258 + `num_positions` + '\t' \
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
259 + `mask[i]` + '\n'
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
260 f.write(line)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
261 _log("randomly generated CNV list saved to " + cnv_list_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
262 else:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
263 _log("loading CNV list ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
264 with open(simulation_parameters['cnv_list_file'], "r") as f:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
265 cnv_list = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
266 lines = f.readlines()
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
267 lines.pop(0)
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
268 for line in lines:
17
e4ebf3435054 Uploaded
ahosny
parents: 5
diff changeset
269 chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t")
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
270 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
271
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
272 cnv_matrix = _loadCNVMatrix(targets, cnv_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
273 mask = map(lambda x: x[3], cnv_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
274 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
275
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
276 # call Wessim to generate reads from the genome file and the target list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
277 _log("generating reads for the target exons ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
278 _log("delegating job to Wessim ...")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
279 _callWessim(genome_file, target_file, base_reads_file, simulation_parameters['number_of_reads'], simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
280
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
281 # generate genome and target files for the control and the CNV
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
282 _log("simulating copy number variations (amplifications/deletions)")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
283 control_genome, control_targets, cnv_genome, cnv_targets = _simulateCNV(genome, cnv_matrix, mask, simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
284
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
285 _log("saving to the control genome file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
286 with open(control_genome_file, 'w') as fw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
287 fw.write(header + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
288 n = 50
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
289 l = [control_genome[i:i + n] for i in range(0, len(control_genome), n)]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
290 for line in l:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
291 fw.write(line + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
292
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
293 _log("saving to the control target file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
294 with open(control_target_file, 'w') as tw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
295 for target in control_targets:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
296 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
297 tw.write(line)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
298
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
299 _log("delegating job to Wessim ...")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
300 _callWessim(control_genome_file, control_target_file, control_reads_file,
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
301 int(cnv_list_parameters['deletions'] * simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
302
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
303 _log("saving to the CNV genome file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
304 with open(cnv_genome_file, 'w') as fw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
305 fw.write(header + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
306 n = 50
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
307 l = [cnv_genome[i:i + n] for i in range(0, len(cnv_genome), n)]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
308 for line in l:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
309 fw.write(line + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
310
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
311 _log("saving to the CNV target file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
312 with open(cnv_target_file, 'w') as tw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
313 for target in cnv_targets:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
314 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
315 tw.write(line)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
316
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
317 _log("delegating job to Wessim ...")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
318 _callWessim(cnv_genome_file, cnv_target_file, cnv_reads_file,
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
319 int(cnv_list_parameters['amplifications']* simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
320
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
321 _log("merging results ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
322 fileio.mergeWessimReads(simulation_parameters['tmp_dir'], simulation_parameters['output_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
323
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
324 _log("cleaning temporary files ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
325 fileio.clean(simulation_parameters['tmp_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
326
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
327 _log("simulation completed. find results in " + simulation_parameters['output_dir'])