changeset 18:eca72016b5b3 draft default tip

Uploaded
author ahosny
date Wed, 07 Sep 2016 09:37:49 -0400
parents e4ebf3435054
children
files cnvsim/genome_simulator.py
diffstat 1 files changed, 24 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/cnvsim/genome_simulator.py	Wed Sep 07 09:36:31 2016 -0400
+++ b/cnvsim/genome_simulator.py	Wed Sep 07 09:37:49 2016 -0400
@@ -53,16 +53,18 @@
     return cnv_mask
 
 
-def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation):
+def _generateCNVList(chromosome_length, number_of_regions, p_amplify, p_delete, min_variation, max_variation, region_min_length, region_max_length):
     '''
-
-    :param chromosome_length:
-    :param number_of_regions:
-    :param p_amplify:
-    :param p_delete:
-    :param min_variation:
-    :param max_variation:
-    :return:
+    This function generates a CNV list to be used for simulating the CNVs in the next step
+    :param chromosome_length: the length of the simulated chromosome
+    :param number_of_regions: the number of regions to be affected by CNVs
+    :param p_amplify: percentage of amplification
+    :param p_delete: percentage of deletions
+    :param min_variation: minimum level of variation
+    :param max_variation: maximum level of variation
+    :param region_min_length: minimum length of a CNV region
+    :param region_max_length: maximum length of a CNV region
+    :return: CNV list
     '''
     region_length = chromosome_length / number_of_regions
     start = 0
@@ -74,7 +76,7 @@
         jump_start = start + int(random.randrange(int(0.40*region_length), int(0.45*region_length)))
 
         # jump backward end
-        jump_end = end - int(random.randrange(int(0.40*region_length), int(0.45*region_length)))
+        jump_end = jump_start + random.randint(region_min_length, region_max_length) - 1
 
         cnv_list.append([jump_start, jump_end, mask[i]])
         start = end
@@ -105,7 +107,7 @@
             cnv_genome.append(amplification)
         elif variation < 0:
             # deletion
-            deletion = genome[start-read_length: start] + sequence * variation + genome[end:end+read_length]
+            deletion = genome[start-read_length: start] + sequence * abs(variation) + genome[end:end+read_length]
             control_genome.append(deletion)
 
     return ''.join(control_genome), ''.join(cnv_genome)
@@ -140,7 +142,7 @@
     :param cnv_list_parameters: a dictionary containing parameters for CNV List creation
     :return: None
     '''
-    _log('simulation type: whole genome')
+    _log('simulation type: whole genome sequencing')
 
     # create a temporary directory for intermediate files
     if not os.path.exists(simulation_parameters['tmp_dir']):
@@ -167,20 +169,26 @@
 
     if simulation_parameters['cnv_list_file'] is None:
         # CNV list file
-        cnv_list_file = os.path.join(simulation_parameters['output_dir'], "CNVList.bed")
+        cnv_list_file = os.path.join(simulation_parameters['output_dir'], "copynumber.bed")
 
         _log("generating CNV list ..")
         cnv_list = _generateCNVList(len(genome), cnv_list_parameters['regions_count'], \
                                     cnv_list_parameters['amplifications'], cnv_list_parameters['deletions'], \
                                     cnv_list_parameters['minimum_variations'], \
-                                    cnv_list_parameters['maximum_variations'])
+                                    cnv_list_parameters['maximum_variations'], \
+                                    cnv_list_parameters['minimum_length'], \
+                                    cnv_list_parameters['maximum_length'])
         cnv_list = map(lambda l: [header.replace('>', '')] + l, cnv_list)
 
         with open(cnv_list_file, 'w') as f:
+            line = 'chrom\tchr_start\tchrom_stop\tnum_positions\tcopy_number\n'
+            f.write(line)
             for i, cnv_region in enumerate(cnv_list):
+                num_positions = cnv_region[2] - cnv_region[1] + 1
                 line = cnv_region[0] + '\t' \
                        + `cnv_region[1]` + '\t' \
                        + `cnv_region[2]` + '\t' \
+                       + `num_positions` + '\t' \
                        + `cnv_region[3]` + '\n'
                 f.write(line)
         _log("randomly generated CNV list saved to " + cnv_list_file)
@@ -190,8 +198,9 @@
         with open(simulation_parameters['cnv_list_file'], "r") as f:
             cnv_list = []
             lines = f.readlines()
+            lines.pop(0)
             for line in lines:
-                chromosome, region_start, region_end, variation = line.strip().split("\t")
+                chromosome, region_start, region_end, num_positions, variation = line.strip().split("\t")
                 cnv_list.append((chromosome, int(region_start), int(region_end), int(variation)))
 
         _log("successfully loaded CNV list that contains " + `len(lines)` + " regions ..")