annotate cnvsim/exome_simulator.py @ 5:63955244b2fa draft

Uploaded cnvsim library package
author ahosny
date Sat, 06 Aug 2016 15:27:30 -0400
parents
children e4ebf3435054
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
1 #!/usr/bin/python
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
2
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
3 __author__ = 'Abdelrahman Hosny'
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
4
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
5 import random
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
6 import subprocess
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
7 import os
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
8 import sys
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
9 import datetime
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
10 import shutil
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
11
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
12 from . import fileio
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
13
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
14
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
15 def _log(message):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
16 print '[CNV SIM {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + "] " + message
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
17
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
18
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
19 def getScriptPath():
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
20 return os.path.dirname(os.path.realpath(__file__))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
21
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
22
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
23 def _generateCNVMatrix(targets_list, regions_count):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
24 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
25 A function to randomly divide sequential exonic targets into whole CNV regions
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
26 :param targets_list: a list of target exons loaded from the file provided by the user
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
27 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
28 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
29 regions_count -= 1
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
30 number_of_targets = len(targets_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
31 comine_size = number_of_targets / regions_count
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
32 cnv_matrix = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
33
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
34 i = 0
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
35 while True:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
36 if i == len(targets_list):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
37 break
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
38
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
39 first_target = i
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
40 i += comine_size
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
41 if i > len(targets_list):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
42 i = len(targets_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
43 last_target = i
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
44 cnv_matrix.append(targets_list[first_target:last_target])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
45
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
46 return cnv_matrix
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
47
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
48
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
49 def _loadCNVMatrix(targets_list, cnv_regions):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
50 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
51 A function to map targets to their corresponding CNV regions
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
52 :param targets_list: a list of target exons loaded from the file provided by the user
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
53 :param cnv_regions: a list of CNV regions loaded from the CNV file provided by the user
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
54 :return: A matrix where rows represent the region index and the first column as a list of targets in this region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
55 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
56 cnv_matrix = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
57 for cnv in cnv_regions:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
58 region_chromosome = cnv[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
59 region_start = cnv[1]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
60 region_end = cnv[2]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
61
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
62 region_targets = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
63 for target in targets_list:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
64 target_chromosome = target[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
65 target_start = target[1]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
66 target_end = target[2]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
67 if target_chromosome == region_chromosome and target_start >= region_start and target_end <= region_end:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
68 region_targets.append(target)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
69
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
70 cnv_matrix.append(region_targets)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
71
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
72 return cnv_matrix
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
73
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
74
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
75 def _generateCNVMask(number_of_exons, p_amplify, p_delete, min_variation, max_variation):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
76 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
77 This function generates random Copy Number Variations mask list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
78 :param number_of_exons: the length of the target regions.
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
79 :param p_amplify: percentage of amplifications introduced
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
80 :param p_delete: percentage of deletions introduced
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
81 :return: a list of the same length as the exons list. each list item
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
82 indicates the variation to be added to the exon in the same position.
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
83 Positive number: number of amplification
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
84 Zero: no change
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
85 -1: delete this exon
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
86 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
87
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
88 number_of_amplifications = int(p_amplify * number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
89 number_of_deletions = int(p_delete * number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
90 cnv_mask = [0] * number_of_exons
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
91
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
92 # generate CNV mask (a list of amplifications and deletions to be applied to the exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
93 while number_of_amplifications > 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
94 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
95 while cnv_mask[choice] != 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
96 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
97 cnv_mask[choice] = random.randrange(min_variation, max_variation) # random amplifications in the range [min_variation, max_variation)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
98 number_of_amplifications -= 1
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
99 random.shuffle(cnv_mask)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
100 while number_of_deletions > 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
101 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
102 while cnv_mask[choice] != 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
103 choice = random.randrange(0, number_of_exons)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
104 cnv_mask[choice] = -1*random.randrange(min_variation, max_variation) # random deletions in the range [min_variation, max_variation)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
105 number_of_deletions -= 1
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
106 random.shuffle(cnv_mask)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
107
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
108 return cnv_mask
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
109
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
110
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
111 def _simulateCNV(genome, cnv_matrix, mask, read_length):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
112 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
113 This function creates two new genome references and target files for control and CNV
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
114 :param genome: text string of the genome
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
115 :param cnv_matrix: A matrix where rows represent the region index and the first column as a list of targets in this region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
116 :param mask: a list to indicate the variations introduced to each CNV region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
117 :return: control genome, control target list, modified genome and modified target list (in order)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
118 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
119
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
120 control_genome = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
121 control_targets = [] # initialize control target list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
122 cnv_genome = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
123 cnv_targets = [] # initialize cnv target list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
124
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
125 CONTROL_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
126 CNV_APPEND_INDEX = 0 # a value to re-base the (start, end) of amplified/deleted targets in a region
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
127
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
128 for k, cnv_region in enumerate(cnv_matrix):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
129
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
130 number_of_copies = mask[k]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
131
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
132 if number_of_copies > 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
133 # if amplification
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
134 for target in cnv_region:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
135 for _ in range(number_of_copies):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
136 amplification = genome[target[1]-read_length:target[2]+read_length]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
137 cnv_genome.append(amplification)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
138 chromosome = target[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
139 start = CNV_APPEND_INDEX + read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
140 end = CNV_APPEND_INDEX + len(amplification) - read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
141 cnv_targets.append((chromosome, start, end))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
142 CNV_APPEND_INDEX += len(amplification)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
143 elif number_of_copies < 0:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
144 # if deletion
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
145 for target in cnv_region:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
146 for _ in range(abs(number_of_copies)):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
147 deletion = genome[target[1]-read_length:target[2]+read_length]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
148 control_genome.append(deletion)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
149 chromosome = target[0]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
150 start = CONTROL_APPEND_INDEX + read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
151 end = CONTROL_APPEND_INDEX + len(deletion) - read_length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
152 control_targets.append((chromosome, start, end))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
153 CONTROL_APPEND_INDEX += len(deletion)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
154
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
155 return ''.join(control_genome), control_targets, ''.join(cnv_genome), cnv_targets
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
156
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
157
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
158 def _callWessim(genome_file, target_file, output_file, number_of_reads, read_length, model_file="models/ill100v4_p.gzip", number_of_threads=4):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
159 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
160 Calls Wessim to generate artificial reads for the targets on the reference genome
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
161 :param genome_file: reference genome file in FASTA format
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
162 :param target_file: target regions file in BED format
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
163 :param output_file: output file name
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
164 :param number_of_reads: the number of reads to generate for this simulation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
165 :param read_length: the read length
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
166 :param model_file: GemSim's empirical error models used by Wessim for NGS read generation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
167 :param number_of_threads: the number of threads to run this simulation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
168 :return: None
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
169 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
170 os.chdir(os.path.join(getScriptPath(), "Wessim" ))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
171 subprocess.call(["python", "Wessim1.py", \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
172 "-R" + genome_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
173 "-B" + target_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
174 "-n" + str(number_of_reads), \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
175 "-l" + str(read_length), \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
176 "-M" + model_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
177 "-o" + output_file, \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
178 "-t" + str(number_of_threads), \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
179 "-p"], stderr=None)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
180 os.chdir('..')
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
181
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
182
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
183 def simulate_exome_cnv(simulation_parameters, cnv_list_parameters=None):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
184 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
185 Simulate copy number variations on the passed reference genome based on the given simulation parameters
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
186 :param genome: path to the FASTA file containing the reference genome
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
187 :param simulation_parameters: a dictionary containing parameters for simulation
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
188 :return: simulation status {'simulation_status': bool, 'message': str}
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
189 '''
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
190 _log('simulation type: whole exome')
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
191
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
192 # create a temporary directory for intermediate files
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
193 if not os.path.exists(simulation_parameters['tmp_dir']):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
194 os.makedirs(simulation_parameters['tmp_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
195
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
196 # create output directory for final results
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
197 if not os.path.exists(simulation_parameters['output_dir']):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
198 os.makedirs(simulation_parameters['output_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
199
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
200 if simulation_parameters['target_file'] is None:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
201 _log("ERROR: target file cannot be None for whole exome simulation!")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
202 fileio.clean(simulation_parameters['output_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
203 fileio.clean(simulation_parameters['tmp_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
204 exit()
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
205 target_file = simulation_parameters['target_file']
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
206
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
207 # copy genome and target to the tmp folder
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
208 shutil.copyfile(simulation_parameters['genome_file'], os.path.join(simulation_parameters['tmp_dir'], "reference.fa"))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
209 shutil.copyfile(target_file, os.path.join(simulation_parameters['tmp_dir'], "target.bed"))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
210 genome_file = os.path.join(simulation_parameters['tmp_dir'], "reference.fa")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
211 target_file = os.path.join(simulation_parameters['tmp_dir'], "target.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
212
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
213 # initialize variables for temporary files
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
214 control_genome_file = os.path.join(simulation_parameters['tmp_dir'], "ControlGenome.fa")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
215 control_target_file = os.path.join(simulation_parameters['tmp_dir'], "ControlTarget.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
216 cnv_genome_file = os.path.join(simulation_parameters['tmp_dir'], "CNVGenome.fa")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
217 cnv_target_file = os.path.join(simulation_parameters['tmp_dir'], "CNVTarget.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
218 base_reads_file = os.path.join(simulation_parameters['tmp_dir'], "base")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
219 control_reads_file = os.path.join(simulation_parameters['tmp_dir'], "control")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
220 cnv_reads_file = os.path.join(simulation_parameters['tmp_dir'], "cnv")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
221
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
222 _log("loading genome file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
223 header, genome = fileio.readGenome(genome_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
224 _log("successfully loaded a genome of length " + `len(genome)`)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
225
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
226 _log("loading target file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
227 _log("sorting and merging targets ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
228 target_file = fileio.prepareTargetFile(target_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
229 targets = fileio.readTargets(target_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
230 _log("successfully loaded " + `len(targets)` + " targets ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
231
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
232 if simulation_parameters['cnv_list_file'] is None:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
233 # CNV list file
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
234 cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
235
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
236 _log("generating CNV list ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
237 cnv_matrix = _generateCNVMatrix(targets, cnv_list_parameters['regions_count'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
238 mask = _generateCNVMask(len(cnv_matrix), cnv_list_parameters['amplifications'], \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
239 cnv_list_parameters['deletions'], cnv_list_parameters['minimum_variations'], \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
240 cnv_list_parameters['maximum_variations'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
241
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
242 with open(cnv_list_file, 'w') as f:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
243 for i, cnv_region in enumerate(cnv_matrix):
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
244 line = cnv_region[0][0] + '\t' \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
245 + `cnv_region[0][1]` + '\t' \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
246 + `cnv_region[-1][2]` + '\t' \
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
247 + `mask[i]` + '\n'
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
248 f.write(line)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
249 _log("randomly generated CNV list saved to " + cnv_list_file)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
250 else:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
251 _log("loading CNV list ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
252 with open(simulation_parameters['cnv_list_file'], "r") as f:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
253 cnv_list = []
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
254 lines = f.readlines()
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
255 for line in lines:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
256 chromosome, region_start, region_end, variation = line.strip().split("\t")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
257 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
258
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
259 cnv_matrix = _loadCNVMatrix(targets, cnv_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
260 mask = map(lambda x: x[3], cnv_list)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
261 _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
262
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
263 # call Wessim to generate reads from the genome file and the target list
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
264 _log("generating reads for the target exons ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
265 _log("delegating job to Wessim ...")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
266 _callWessim(genome_file, target_file, base_reads_file, simulation_parameters['number_of_reads'], simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
267
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
268 # generate genome and target files for the control and the CNV
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
269 _log("simulating copy number variations (amplifications/deletions)")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
270 control_genome, control_targets, cnv_genome, cnv_targets = _simulateCNV(genome, cnv_matrix, mask, simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
271
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
272 _log("saving to the control genome file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
273 with open(control_genome_file, 'w') as fw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
274 fw.write(header + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
275 n = 50
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
276 l = [control_genome[i:i + n] for i in range(0, len(control_genome), n)]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
277 for line in l:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
278 fw.write(line + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
279
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
280 _log("saving to the control target file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
281 with open(control_target_file, 'w') as tw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
282 for target in control_targets:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
283 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
284 tw.write(line)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
285
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
286 _log("delegating job to Wessim ...")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
287 _callWessim(control_genome_file, control_target_file, control_reads_file,
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
288 int(cnv_list_parameters['deletions'] * simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
289
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
290 _log("saving to the CNV genome file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
291 with open(cnv_genome_file, 'w') as fw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
292 fw.write(header + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
293 n = 50
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
294 l = [cnv_genome[i:i + n] for i in range(0, len(cnv_genome), n)]
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
295 for line in l:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
296 fw.write(line + "\n")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
297
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
298 _log("saving to the CNV target file ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
299 with open(cnv_target_file, 'w') as tw:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
300 for target in cnv_targets:
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
301 line = target[0] + "\t" + `target[1]` + "\t" + `target[2]` + "\n"
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
302 tw.write(line)
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
303
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
304 _log("delegating job to Wessim ...")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
305 _callWessim(cnv_genome_file, cnv_target_file, cnv_reads_file,
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
306 int(cnv_list_parameters['amplifications']* simulation_parameters['number_of_reads']), simulation_parameters['read_length'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
307
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
308 _log("merging results ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
309 fileio.mergeWessimReads(simulation_parameters['tmp_dir'], simulation_parameters['output_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
310
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
311 _log("cleaning temporary files ..")
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
312 fileio.clean(simulation_parameters['tmp_dir'])
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
313
63955244b2fa Uploaded cnvsim library package
ahosny
parents:
diff changeset
314 _log("simulation completed. find results in " + simulation_parameters['output_dir'])