diff scripts/ReMatCh/modules/rematch_module.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
line wrap: on
line diff
--- a/scripts/ReMatCh/modules/rematch_module.py	Wed Jan 22 09:10:12 2020 -0500
+++ b/scripts/ReMatCh/modules/rematch_module.py	Tue Jan 28 10:42:31 2020 -0500
@@ -1,43 +1,52 @@
 import os.path
 import multiprocessing
-import utils
 import functools
 import sys
 import pickle
 
+# https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change
+sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
+import utils
 
-def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True):
+
+def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true):
     command = ['samtools', 'faidx', fasta, '', '', '']
     shell_true = False
-    if region_None is not None:
-        command[3] = region_None
+    if region_none is not None:
+        command[3] = region_none
     if region_outfile_none is not None:
         command[4] = '>'
         command[5] = region_outfile_none
         shell_true = True
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, shell_true, None, print_comand_True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, shell_true, None, print_comand_true)
     return run_successfully, stdout
 
 
 # Indexing reference file using Bowtie2
-def indexSequenceBowtie2(referenceFile, threads):
-    if os.path.isfile(str(referenceFile + '.1.bt2')):
+def index_sequence_bowtie2(reference_file, threads):
+    if os.path.isfile(str(reference_file + '.1.bt2')):
         run_successfully = True
     else:
-        command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile]
-        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+        command = ['bowtie2-build', '--threads', str(threads), reference_file, reference_file]
+        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
     return run_successfully
 
 
 # Mapping with Bowtie2
-def mappingBowtie2(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc, bowtieOPT):
+def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc,
+                    bowtie_algorithm='--very-sensitive-local', bowtie_opt=None):
     sam_file = os.path.join(outdir, str('alignment.sam'))
 
     # Index reference file
-    run_successfully = indexSequenceBowtie2(referenceFile, threads)
+    run_successfully = index_sequence_bowtie2(reference_file, threads)
 
     if run_successfully:
-        command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', '--no-unal', '', '-S', sam_file]
+        command = ['bowtie2', '', '', '-q', bowtie_algorithm, '--threads', str(threads), '-x',
+                   reference_file, '', '--no-unal', '', '-S', sam_file]
+
+        if num_map_loc is not None and num_map_loc > 1:
+            command[1] = '-k'
+            command[2] = str(num_map_loc)
 
         if len(fastq_files) == 1:
             command[9] = '-U ' + fastq_files[0]
@@ -46,15 +55,10 @@
         else:
             return False, None
 
-        if conserved_True:
-            command[4] = '--sensitive'
-        else:
-            command[4] = '--very-sensitive-local'
+        if bowtie_opt is not None:
+            command[11] = bowtie_opt
 
-        if bowtieOPT is not None:
-            command[11] = bowtieOPT
-
-        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
 
     if not run_successfully:
         sam_file = None
@@ -77,14 +81,17 @@
     return splited_cigars
 
 
-def recode_cigar_based_on_base_quality(cigar, bases_quality, softClip_baseQuality, mapping_position, direct_strand_true, softClip_cigarFlagRecode):
+def recode_cigar_based_on_base_quality(cigar, bases_quality, soft_clip_base_quality, mapping_position,
+                                       direct_strand_true, soft_clip_cigar_flag_recode):
     cigar = split_cigar(cigar)
     soft_left = []
     soft_right = []
     cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X')
-    read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if cigar_part[1] in cigar_flags_for_reads_length]) - (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0)
+    read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if
+                                       cigar_part[1] in cigar_flags_for_reads_length]) - \
+                                  (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0)
     for x, base in enumerate(bases_quality):
-        if ord(base) - 33 >= softClip_baseQuality:
+        if ord(base) - 33 >= soft_clip_base_quality:
             if x <= cigar[0][0] - 1:
                 if cigar[0][1] == 'S':
                     soft_left.append(x)
@@ -96,10 +103,10 @@
     if len(soft_left) > 0:
         soft_left = min(soft_left) + 1
         if soft_left == 1:
-            cigar = [[cigar[0][0], softClip_cigarFlagRecode]] + cigar[1:]
+            cigar = [[cigar[0][0], soft_clip_cigar_flag_recode]] + cigar[1:]
             left_changed = (True, cigar[0][0])
         elif cigar[0][0] - soft_left > 0:
-            cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, softClip_cigarFlagRecode]] + cigar[1:]
+            cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, soft_clip_cigar_flag_recode]] + cigar[1:]
             left_changed = (True, cigar[0][0] - soft_left)
 
     right_changed = (False, 0)
@@ -107,7 +114,7 @@
         soft_right = max(soft_right) + 1
         cigar = cigar[:-1]
         if soft_right - read_length_without_right_s > 0:
-            cigar.append([soft_right - read_length_without_right_s, softClip_cigarFlagRecode])
+            cigar.append([soft_right - read_length_without_right_s, soft_clip_cigar_flag_recode])
             right_changed = (True, soft_right - read_length_without_right_s)
         if len(bases_quality) - soft_right > 0:
             cigar.append([len(bases_quality) - soft_right, 'S'])
@@ -140,7 +147,7 @@
     return direct_strand
 
 
-def verify_mapped_tip(reference_length, mapping_position, read_length, cigar):
+def verify_mapped_tip(reference_length, mapping_position, cigar):
     tip = False
     if 'S' in cigar:
         cigar = split_cigar(cigar)
@@ -174,26 +181,31 @@
 
 
 @utils.trace_unhandled_exceptions
-def parallelized_recode_soft_clipping(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode):
+def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_base_quality, sequences_length,
+                                      soft_clip_cigar_flag_recode):
     lines_sam = []
     for line in line_collection:
-        line = line.splitlines()[0]
+        line = line.rstrip('\r\n')
         if len(line) > 0:
             if line.startswith('@'):
                 lines_sam.append(line)
             else:
                 line = line.split('\t')
-                if not verify_mapped_tip(sequences_length[line[2]], int(line[3]), len(line[9]), line[5]):
-                    line[5], line[3] = recode_cigar_based_on_base_quality(line[5], line[10], softClip_baseQuality, int(line[3]), verify_mapped_direct_strand(int(line[1])), softClip_cigarFlagRecode)
+                if not verify_mapped_tip(sequences_length[line[2]], int(line[3]), line[5]):
+                    line[5], line[3] = recode_cigar_based_on_base_quality(line[5], line[10], soft_clip_base_quality,
+                                                                          int(line[3]),
+                                                                          verify_mapped_direct_strand(int(line[1])),
+                                                                          soft_clip_cigar_flag_recode)
                 lines_sam.append('\t'.join(line))
-    with open(pickleFile, 'wb') as writer:
+    with open(pickle_file, 'wb') as writer:
         pickle.dump(lines_sam, writer)
 
 
-def recode_soft_clipping_from_sam(sam_file, outdir, threads, softClip_baseQuality, reference_dict, softClip_cigarFlagRecode):
+def recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict,
+                                  soft_clip_cigar_flag_recode):
     pickle_files = []
     sequences_length = {}
-    for x, seq_info in reference_dict.items():
+    for x, seq_info in list(reference_dict.items()):
         sequences_length[seq_info['header']] = seq_info['length']
 
     with open(sam_file, 'rtU') as reader:
@@ -203,15 +215,18 @@
         for x, line in enumerate(reader):
             line_collection.append(line)
             if x % 10000 == 0:
-                pickleFile = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
-                pickle_files.append(pickleFile)
-                pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode,))
+                pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
+                pickle_files.append(pickle_file)
+                pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file,
+                                                                          soft_clip_base_quality, sequences_length,
+                                                                          soft_clip_cigar_flag_recode,))
                 line_collection = []
         if len(line_collection) > 0:
-            pickleFile = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
-            pickle_files.append(pickleFile)
-            pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode,))
-            line_collection = []
+            pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
+            pickle_files.append(pickle_file)
+            pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file,
+                                                                      soft_clip_base_quality, sequences_length,
+                                                                      soft_clip_cigar_flag_recode,))
         pool.close()
         pool.join()
 
@@ -219,110 +234,140 @@
 
     new_sam_file = os.path.join(outdir, 'alignment_with_soft_clipping_recoded.sam')
     with open(new_sam_file, 'wt') as writer:
-        for pickleFile in pickle_files:
-            if os.path.isfile(pickleFile):
+        for pickle_file in pickle_files:
+            if os.path.isfile(pickle_file):
                 lines_sam = None
-                with open(pickleFile, 'rb') as reader:
+                with open(pickle_file, 'rb') as reader:
                     lines_sam = pickle.load(reader)
                 if lines_sam is not None:
                     for line in lines_sam:
                         writer.write(line + '\n')
-                os.remove(pickleFile)
+                os.remove(pickle_file)
 
     return new_sam_file
 
 
 # Sort alignment file
-def sortAlignment(alignment_file, output_file, sortByName_True, threads):
-    outFormat_string = os.path.splitext(output_file)[1][1:].lower()
-    command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file]
-    if sortByName_True:
+def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
+    out_format_string = os.path.splitext(output_file)[1][1:].lower()
+    command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string, '', '-@', str(threads), alignment_file]
+    if sort_by_name_true:
         command[6] = '-n'
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
     if not run_successfully:
         output_file = None
     return run_successfully, output_file
 
 
 # Index alignment file
-def indexAlignment(alignment_file):
+def index_alignment(alignment_file):
     command = ['samtools', 'index', alignment_file]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
     return run_successfully
 
 
-def mapping_reads(fastq_files, reference_file, threads, outdir, conserved_True, numMapLoc, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT):
+def mapping_reads(fastq_files, reference_file, threads, outdir, num_map_loc, rematch_run,
+                  soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode,
+                  bowtie_algorithm, bowtie_opt, clean_run=True):
     # Create a symbolic link to the reference_file
-    reference_link = os.path.join(outdir, os.path.basename(reference_file))
-    os.symlink(reference_file, reference_link)
+    if clean_run:
+        reference_link = os.path.join(outdir, os.path.basename(reference_file))
+        if os.path.islink(reference_link):
+            os.unlink(reference_link)
+        os.symlink(reference_file, reference_link)
+        reference_file = reference_link
 
     bam_file = None
     # Mapping reads using Bowtie2
-    run_successfully, sam_file = mappingBowtie2(fastq_files, reference_link, threads, outdir, conserved_True, numMapLoc, bowtieOPT)
+    run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=reference_file,
+                                                 threads=threads, outdir=outdir, num_map_loc=num_map_loc,
+                                                 bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt)
 
     if run_successfully:
         # Remove soft clipping
-        if rematch_run == softClip_recodeRun or softClip_recodeRun == 'both':
-            print 'Recoding soft clipped regions'
-            sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, softClip_baseQuality, reference_dict, softClip_cigarFlagRecode)
+        if rematch_run == soft_clip_recode_run or soft_clip_recode_run == 'both':
+            print('Recoding soft clipped regions')
+            sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict,
+                                                     soft_clip_cigar_flag_recode)
 
         # Convert sam to bam and sort bam
-        run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, threads)
+        run_successfully, bam_file = sort_alignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False,
+                                                    threads)
 
         if run_successfully:
             os.remove(sam_file)
             # Index bam
-            run_successfully = indexAlignment(bam_file)
+            run_successfully = index_alignment(bam_file)
 
-    return run_successfully, bam_file, reference_link
+    return run_successfully, bam_file, reference_file
 
 
 def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):
     gene_vcf = os.path.join(outdir, 'samtools_mpileup.sequence_' + str(counter) + '.vcf')
 
-    command = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7), '--fasta-ref', reference_file, '--region', sequence_to_analyse, '--output', gene_vcf, '--VCF', '--uncompressed', '--output-tags', 'INFO/AD,AD,DP', bam_file]
+    command = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7), '--fasta-ref',
+               reference_file, '--region', sequence_to_analyse, '--output', gene_vcf, '--VCF', '--uncompressed',
+               '--output-tags', 'INFO/AD,AD,DP', bam_file]
 
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
     if not run_successfully:
         gene_vcf = None
     return run_successfully, gene_vcf
 
 
 # Read vcf file
-class Vcf():
-    def __init__(self, vcfFile):
-        self.vcf = open(vcfFile, 'rtU')
+class Vcf:
+    def __init__(self, vcf_file, encoding=None, newline=None):
+        try:
+            self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline)
+        except TypeError:
+            self.vcf = open(vcf_file, 'rt')
         self.line_read = self.vcf.readline()
+        self.contigs_info_dict = {}
         while self.line_read.startswith('#'):
+            if self.line_read.startswith('##contig=<ID='):
+                seq = self.line_read.split('=')[2].split(',')[0]
+                seq_len = self.line_read.split('=')[3].split('>')[0]
+                self.contigs_info_dict[seq] = int(seq_len)
             self.line_read = self.vcf.readline()
         self.line = self.line_read
 
     def readline(self):
-        self.line_stored = self.line
+        line_stored = self.line
         self.line = self.vcf.readline()
-        return self.line_stored
+        return line_stored
 
     def close(self):
         self.vcf.close()
 
+    def get_contig_legth(self, contig):
+        return self.contigs_info_dict[contig]
 
-def get_variants(gene_vcf):
+
+def get_variants(gene_vcf, seq_name, encoding=None, newline=None):
     variants = {}
 
-    vfc_file = Vcf(gene_vcf)
+    vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline)
     line = vfc_file.readline()
+    counter = 1
     while len(line) > 0:
-        fields = line.splitlines()[0].split('\t')
+        fields = line.rstrip('\r\n').split('\t')
         if len(fields) > 0:
             fields[1] = int(fields[1])
 
             info_field = {}
-            for i in fields[7].split(';'):
-                i = i.split('=')
-                if len(i) > 1:
-                    info_field[i[0]] = i[1]
+            try:
+                for i in fields[7].split(';'):
+                    i = i.split('=')
+                    if len(i) > 1:
+                        info_field[i[0]] = i[1]
+                    else:
+                        info_field[i[0]] = None
+            except IndexError:
+                if counter > vfc_file.get_contig_legth(contig=seq_name):
+                    break
                 else:
-                    info_field[i[0]] = None
+                    raise
 
             format_field = {}
             format_field_name = fields[8].split(':')
@@ -331,13 +376,22 @@
             for i in range(0, len(format_data)):
                 format_field[format_field_name[i]] = format_data[i].split(',')
 
-            fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field, 'format': format_field}
+            fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field,
+                               'format': format_field}
             if fields[1] in variants:
                 variants[fields[1]][len(variants[fields[1]])] = fields_to_store
             else:
                 variants[fields[1]] = {0: fields_to_store}
 
-        line = vfc_file.readline()
+        try:
+            line = vfc_file.readline()
+        except UnicodeDecodeError:
+            if counter + 1 > vfc_file.get_contig_legth(contig=seq_name):
+                break
+            else:
+                raise
+
+        counter += 1
     vfc_file.close()
 
     return variants
@@ -347,7 +401,7 @@
     entry_with_indel = []
     entry_with_snp = None
     for i in variant_position:
-        keys = variant_position[i]['info'].keys()
+        keys = list(variant_position[i]['info'].keys())
         if 'INDEL' in keys:
             entry_with_indel.append(i)
         else:
@@ -356,9 +410,11 @@
     return entry_with_indel, entry_with_snp
 
 
-def get_alt_noMatter(variant_position, indel_true):
+def get_alt_no_matter(variant_position, indel_true):
     dp = sum(map(int, variant_position['format']['AD']))
-    index_alleles_sorted_position = sorted(zip(map(int, variant_position['format']['AD']), range(0, len(variant_position['format']['AD']))), reverse=True)
+    index_alleles_sorted_position = sorted(zip(list(map(int, variant_position['format']['AD'])),
+                                               list(range(0, len(variant_position['format']['AD'])))),
+                                           reverse=True)
     index_dominant_allele = None
     if not indel_true:
         ad_idv = index_alleles_sorted_position[0][0]
@@ -377,7 +433,8 @@
 
         if float(ad_idv) / float(dp) >= 0.5:
             if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1:
-                index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]])
+                index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if
+                                                        x[0] == index_alleles_sorted_position[0][0]])
 
             index_dominant_allele = index_alleles_sorted_position[0][1]
             if index_dominant_allele == 0:
@@ -404,7 +461,8 @@
     return number_diferences
 
 
-def get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
+def get_alt_correct(variant_position, alt_no_matter, dp, ad_idv, index_dominant_allele, minimum_depth_presence,
+                    minimum_depth_call, minimum_depth_frequency_dominant_allele):
     alt = None
     low_coverage = False
     multiple_alleles = False
@@ -423,16 +481,18 @@
                 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele:
                     alt = 'N' * len(variant_position['REF'])
                     if index_dominant_allele is not None:
-                        variants_coverage = [int(variant_position['format']['AD'][i]) for i in range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele]
+                        variants_coverage = [int(variant_position['format']['AD'][i]) for i in
+                                             range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele]
                         if sum(variants_coverage) > 0:
                             if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5:
                                 multiple_alleles = True
-                            elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and len(variants_coverage) > 2:
+                            elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and \
+                                    len(variants_coverage) > 2:
                                 multiple_alleles = True
                     else:
                         multiple_alleles = True
                 else:
-                    alt = alt_noMatter
+                    alt = alt_no_matter
     else:
         low_coverage = True
 
@@ -464,17 +524,22 @@
     return indel_coverage.index(str(max(indel_coverage.values())))
 
 
-def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, indel_true):
-    alt_noMatter, dp, ad_idv, index_dominant_allele = get_alt_noMatter(variant_position, indel_true)
+def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call,
+                      minimum_depth_frequency_dominant_allele, indel_true):
+    alt_no_matter, dp, ad_idv, index_dominant_allele = get_alt_no_matter(variant_position, indel_true)
 
-    alt_correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele)
+    alt_correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, alt_no_matter, dp, ad_idv,
+                                                                  index_dominant_allele, minimum_depth_presence,
+                                                                  minimum_depth_call,
+                                                                  minimum_depth_frequency_dominant_allele)
 
     alt_alignment = get_alt_alignment(variant_position['REF'], alt_correct)
 
-    return variant_position['REF'], alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment
+    return variant_position['REF'], alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment
 
 
-def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, alignment_true):
+def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call,
+                              minimum_depth_frequency_dominant_allele, alignment_true):
     alt = list(alt)
 
     for i in range(0, len(alt) - 1):
@@ -496,9 +561,11 @@
                     break
 
             entry_with_indel, entry_with_snp = indel_entry(variants[new_position])
-            new_ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
-            if alt_noMatter != '.' and alt[alt_position] != alt_noMatter:
-                alt[alt_position] = alt_noMatter
+            new_ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
+                determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
+                                  minimum_depth_frequency_dominant_allele, False)
+            if alt_no_matter != '.' and alt[alt_position] != alt_no_matter:
+                alt[alt_position] = alt_no_matter
 
     return ''.join(alt)
 
@@ -507,64 +574,74 @@
     entry_with_indel, entry_with_snp = indel_entry(variants[position])
 
     if len(entry_with_indel) == 0:
-        ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
+        ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
+            determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
+                              minimum_depth_frequency_dominant_allele, False)
     else:
-        ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_noMatter_snp, alt_alignment_snp = determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
+        ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_no_matter_snp, alt_alignment_snp = \
+            determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
+                              minimum_depth_frequency_dominant_allele, False)
 
         indel_more_likely = entry_with_indel[0]
         if len(entry_with_indel) > 1:
             indel_more_likely = get_indel_more_likely(variants[position], entry_with_indel)
 
-        ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[position][indel_more_likely], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, True)
+        ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
+            determine_variant(variants[position][indel_more_likely], minimum_depth_presence, minimum_depth_call,
+                              minimum_depth_frequency_dominant_allele, True)
 
-        if alt_noMatter == '.':
-            ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_noMatter_snp, alt_alignment_snp
+        if alt_no_matter == '.':
+            ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
+                ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_no_matter_snp, alt_alignment_snp
         else:
             if alt_correct is None and alt_correct_snp is not None:
                 alt_correct = alt_correct_snp
             elif alt_correct is not None and alt_correct_snp is not None:
                 if alt_correct_snp != '.' and alt_correct[0] != alt_correct_snp:
                     alt_correct = alt_correct_snp + alt_correct[1:] if len(alt_correct) > 1 else alt_correct_snp
-            if alt_noMatter_snp != '.' and alt_noMatter[0] != alt_noMatter_snp:
-                alt_noMatter = alt_noMatter_snp + alt_noMatter[1:] if len(alt_noMatter) > 1 else alt_noMatter_snp
+            if alt_no_matter_snp != '.' and alt_no_matter[0] != alt_no_matter_snp:
+                alt_no_matter = alt_no_matter_snp + alt_no_matter[1:] if len(alt_no_matter) > 1 else alt_no_matter_snp
             if alt_alignment_snp != '.' and alt_alignment[0] != alt_alignment_snp:
                 alt_alignment = alt_alignment_snp + alt_alignment[1:] if len(alt_alignment) > 1 else alt_alignment_snp
 
-            # if alt_noMatter != '.':
-            #     alt_noMatter = confirm_nucleotides_indel(ref, alt_noMatter, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
+            # if alt_no_matter != '.':
+            #     alt_no_matter = confirm_nucleotides_indel(ref, alt_no_matter, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
             # if alt_correct is not None and alt_correct != '.':
             #     alt_correct = confirm_nucleotides_indel(ref, alt_correct, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
             # if alt_alignment != '.':
             #     alt_alignment = confirm_nucleotides_indel(ref, alt_alignment, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, True)
 
-    return ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment
+    return ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment
 
 
-def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence):
+def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
+                      sequence):
     variants_correct = {}
-    variants_noMatter = {}
+    variants_no_matter = {}
     variants_alignment = {}
 
     correct_absent_positions = {}
     correct_last_absent_position = ''
 
-    noMatter_absent_positions = {}
-    noMatter_last_absent_position = ''
+    no_matter_absent_positions = {}
+    no_matter_last_absent_position = ''
 
     multiple_alleles_found = []
 
     counter = 1
     while counter <= len(sequence):
         if counter in variants:
-            noMatter_last_absent_position = ''
+            no_matter_last_absent_position = ''
 
-            ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele)
+            ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
+                snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call,
+                          minimum_depth_frequency_dominant_allele)
 
             if alt_alignment != '.':
                 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment}
 
-            if alt_noMatter != '.':
-                variants_noMatter[counter] = {'REF': ref, 'ALT': alt_noMatter}
+            if alt_no_matter != '.':
+                variants_no_matter[counter] = {'REF': ref, 'ALT': alt_no_matter}
 
             if alt_correct is None:
                 if counter - len(correct_last_absent_position) in correct_absent_positions:
@@ -600,11 +677,12 @@
                 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
             correct_last_absent_position += sequence[counter - 1]
 
-            if counter - len(noMatter_last_absent_position) in noMatter_absent_positions:
-                noMatter_absent_positions[counter - len(noMatter_last_absent_position)]['REF'] += sequence[counter - 1]
+            if counter - len(no_matter_last_absent_position) in no_matter_absent_positions:
+                no_matter_absent_positions[counter - len(no_matter_last_absent_position)]['REF'] += \
+                    sequence[counter - 1]
             else:
-                noMatter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
-            noMatter_last_absent_position += sequence[counter - 1]
+                no_matter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
+            no_matter_last_absent_position += sequence[counter - 1]
 
             counter += 1
 
@@ -613,23 +691,42 @@
             variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'}
         else:
             if position - 1 not in variants_correct:
-                variants_correct[position - 1] = {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']}
+                variants_correct[position - 1] = \
+                    {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'],
+                     'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']}
             else:
-                variants_correct[position - 1] = {'REF': variants_correct[position - 1]['REF'] + correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:], 'ALT': variants_correct[position - 1]['ALT'] + correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if len(variants_correct[position - 1]['ALT']) > 0 else 0:]}
+                variants_correct[position - 1] = \
+                    {'REF': variants_correct[position - 1]['REF'] +
+                            correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:],
+                     'ALT': variants_correct[position - 1]['ALT'] +
+                            correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if
+                                                                      len(variants_correct[position - 1]['ALT']) > 0
+                                                                      else 0:]}
 
-    for position in noMatter_absent_positions:
+    for position in no_matter_absent_positions:
         if position == 1:
-            variants_noMatter[position] = {'REF': noMatter_absent_positions[position]['REF'], 'ALT': 'N'}
+            variants_no_matter[position] = {'REF': no_matter_absent_positions[position]['REF'], 'ALT': 'N'}
         else:
-            if position - 1 not in variants_noMatter:
-                variants_noMatter[position - 1] = {'REF': sequence[position - 2] + noMatter_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + noMatter_absent_positions[position]['ALT']}
+            if position - 1 not in variants_no_matter:
+                variants_no_matter[position - 1] = \
+                    {'REF': sequence[position - 2] + no_matter_absent_positions[position]['REF'],
+                     'ALT': sequence[position - 2] + no_matter_absent_positions[position]['ALT']}
             else:
-                variants_noMatter[position - 1] = {'REF': variants_noMatter[position - 1]['REF'] + noMatter_absent_positions[position]['REF'][len(variants_noMatter[position - 1]['REF']) - 1:], 'ALT': variants_noMatter[position - 1]['ALT'] + noMatter_absent_positions[position]['ALT'][len(variants_noMatter[position - 1]['ALT']) - 1 if len(variants_noMatter[position - 1]['ALT']) > 0 else 0:]}
+                variants_no_matter[position - 1] = \
+                    {'REF': variants_no_matter[position - 1]['REF'] +
+                            no_matter_absent_positions[position]['REF'][len(variants_no_matter[position - 1]['REF']) -
+                                                                        1:],
+                     'ALT': variants_no_matter[position - 1]['ALT'] +
+                            no_matter_absent_positions[position]['ALT'][len(variants_no_matter[position - 1]['ALT']) -
+                                                                        1 if
+                                                                        len(variants_no_matter[position - 1]['ALT']) > 0
+                                                                        else 0:]}
 
-    return variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found
+    return variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found
 
 
-def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found, number_multi_alleles):
+def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found,
+                                    number_multi_alleles):
     number_diferences = 0
 
     if position + len(variant_dict[position]['REF']) - 1 > length_extra_seq:
@@ -640,8 +737,11 @@
         del variant_dict[position]
         variant_dict[length_extra_seq] = {}
         variant_dict[length_extra_seq]['REF'] = temp_variant['REF'][length_extra_seq - position:]
-        variant_dict[length_extra_seq]['ALT'] = temp_variant['ALT'][length_extra_seq - position:] if len(temp_variant['ALT']) > length_extra_seq - position else temp_variant['REF'][length_extra_seq - position]
-        number_diferences = count_number_diferences(variant_dict[length_extra_seq]['REF'], variant_dict[length_extra_seq]['ALT'])
+        variant_dict[length_extra_seq]['ALT'] = temp_variant['ALT'][length_extra_seq - position:] if \
+            len(temp_variant['ALT']) > length_extra_seq - position else \
+            temp_variant['REF'][length_extra_seq - position]
+        number_diferences = count_number_diferences(variant_dict[length_extra_seq]['REF'],
+                                                    variant_dict[length_extra_seq]['ALT'])
     else:
         del variant_dict[position]
 
@@ -650,15 +750,20 @@
 
 def clean_variant_in_extra_seq_rigth(variant_dict, position, sequence_length, length_extra_seq):
     if position + len(variant_dict[position]['REF']) - 1 > sequence_length - length_extra_seq:
-        variant_dict[position]['REF'] = variant_dict[position]['REF'][: - (position - (sequence_length - length_extra_seq)) + 1]
-        variant_dict[position]['ALT'] = variant_dict[position]['ALT'][: - (position - (sequence_length - length_extra_seq)) + 1] if len(variant_dict[position]['ALT']) >= - (position - (sequence_length - length_extra_seq)) + 1 else variant_dict[position]['ALT']
+        variant_dict[position]['REF'] = \
+            variant_dict[position]['REF'][: - (position - (sequence_length - length_extra_seq)) + 1]
+        variant_dict[position]['ALT'] = \
+            variant_dict[position]['ALT'][: - (position - (sequence_length - length_extra_seq)) + 1] if \
+                len(variant_dict[position]['ALT']) >= - (position - (sequence_length - length_extra_seq)) + 1 else \
+                variant_dict[position]['ALT']
 
     number_diferences = count_number_diferences(variant_dict[position]['REF'], variant_dict[position]['ALT'])
 
     return variant_dict, number_diferences
 
 
-def cleanning_variants_extra_seq(variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found, length_extra_seq, sequence_length):
+def cleanning_variants_extra_seq(variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found,
+                                 length_extra_seq, sequence_length):
     number_multi_alleles = 0
     number_diferences = 0
 
@@ -666,32 +771,39 @@
     while counter <= sequence_length:
         if counter <= length_extra_seq:
             if counter in variants_correct:
-                variants_correct, number_multi_alleles, number_diferences = clean_variant_in_extra_seq_left(variants_correct, counter, length_extra_seq, multiple_alleles_found, number_multi_alleles)
-            if counter in variants_noMatter:
-                variants_noMatter, ignore, ignore = clean_variant_in_extra_seq_left(variants_noMatter, counter, length_extra_seq, None, None)
+                variants_correct, number_multi_alleles, number_diferences = \
+                    clean_variant_in_extra_seq_left(variants_correct, counter, length_extra_seq, multiple_alleles_found,
+                                                    number_multi_alleles)
+            if counter in variants_no_matter:
+                variants_no_matter, ignore, ignore = \
+                    clean_variant_in_extra_seq_left(variants_no_matter, counter, length_extra_seq, None, None)
             if counter in variants_alignment:
-                variants_alignment, ignore, ignore = clean_variant_in_extra_seq_left(variants_alignment, counter, length_extra_seq, None, None)
-        elif counter > length_extra_seq and counter <= sequence_length - length_extra_seq:
+                variants_alignment, ignore, ignore = \
+                    clean_variant_in_extra_seq_left(variants_alignment, counter, length_extra_seq, None, None)
+        elif sequence_length - length_extra_seq >= counter > length_extra_seq:
             if counter in variants_correct:
                 if counter in multiple_alleles_found:
                     number_multi_alleles += 1
-                variants_correct, number_diferences_found = clean_variant_in_extra_seq_rigth(variants_correct, counter, sequence_length, length_extra_seq)
+                variants_correct, number_diferences_found = \
+                    clean_variant_in_extra_seq_rigth(variants_correct, counter, sequence_length, length_extra_seq)
                 number_diferences += number_diferences_found
-            if counter in variants_noMatter:
-                variants_noMatter, ignore = clean_variant_in_extra_seq_rigth(variants_noMatter, counter, sequence_length, length_extra_seq)
+            if counter in variants_no_matter:
+                variants_no_matter, ignore = \
+                    clean_variant_in_extra_seq_rigth(variants_no_matter, counter, sequence_length, length_extra_seq)
             if counter in variants_alignment:
-                variants_alignment, ignore = clean_variant_in_extra_seq_rigth(variants_alignment, counter, sequence_length, length_extra_seq)
+                variants_alignment, ignore = \
+                    clean_variant_in_extra_seq_rigth(variants_alignment, counter, sequence_length, length_extra_seq)
         else:
             if counter in variants_correct:
                 del variants_correct[counter]
-            if counter in variants_noMatter:
-                del variants_noMatter[counter]
+            if counter in variants_no_matter:
+                del variants_no_matter[counter]
             if counter in variants_alignment:
                 del variants_alignment[counter]
 
         counter += 1
 
-    return variants_correct, variants_noMatter, variants_alignment, number_multi_alleles, number_diferences
+    return variants_correct, variants_no_matter, variants_alignment, number_multi_alleles, number_diferences
 
 
 def get_coverage(gene_coverage):
@@ -699,7 +811,7 @@
 
     with open(gene_coverage, 'rtU') as reader:
         for line in reader:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 line = line.split('\t')
                 coverage[int(line[1])] = int(line[2])
@@ -712,34 +824,36 @@
         return sequence_length - 2 * length_extra_seq, 100.0, 0.0
 
     count_absent = 0
-    count_lowCoverage = 0
+    count_low_coverage = 0
     sum_coverage = 0
 
     counter = 1
     while counter <= sequence_length:
-        if counter > length_extra_seq and counter <= sequence_length - length_extra_seq:
+        if sequence_length - length_extra_seq >= counter > length_extra_seq:
             if coverage[counter] < minimum_depth_presence:
                 count_absent += 1
             else:
                 if coverage[counter] < minimum_depth_call:
-                    count_lowCoverage += 1
+                    count_low_coverage += 1
                 sum_coverage += coverage[counter]
         counter += 1
 
     mean_coverage = 0
-    percentage_lowCoverage = 0
+    percentage_low_coverage = 0
     if sequence_length - 2 * length_extra_seq - count_absent > 0:
         mean_coverage = float(sum_coverage) / float(sequence_length - 2 * length_extra_seq - count_absent)
-        percentage_lowCoverage = float(count_lowCoverage) / float(sequence_length - 2 * length_extra_seq - count_absent) * 100
+        percentage_low_coverage = \
+            float(count_low_coverage) / float(sequence_length - 2 * length_extra_seq - count_absent) * 100
 
-    return count_absent, percentage_lowCoverage, mean_coverage
+    return count_absent, percentage_low_coverage, mean_coverage
 
 
 # Get genome coverage data
 def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter):
     genome_coverage_data_file = os.path.join(outdir, 'samtools_depth.sequence_' + str(counter) + '.tab')
-    command = ['samtools', 'depth', '-a', '-q', '0', '-r', sequence_to_analyse, alignment_file, '>', genome_coverage_data_file]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)
+    command = ['samtools', 'depth', '-a', '-q', '0', '-r', sequence_to_analyse, alignment_file, '>',
+               genome_coverage_data_file]
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, True, None, False)
     return run_successfully, genome_coverage_data_file
 
 
@@ -747,16 +861,18 @@
     vcf_file = os.path.join(outdir, str(sequence_to_analyse + '.' + sufix + '.vcf'))
     with open(vcf_file, 'wt') as writer:
         writer.write('##fileformat=VCFv4.2' + '\n')
-        writer.write('#' + '\t'.join(['SEQUENCE', 'POSITION', 'ID_unused', 'REFERENCE_sequence', 'ALTERNATIVE_sequence', 'QUALITY_unused', 'FILTER_unused', 'INFO_unused', 'FORMAT_unused']) + '\n')
+        writer.write('#' + '\t'.join(['SEQUENCE', 'POSITION', 'ID_unused', 'REFERENCE_sequence', 'ALTERNATIVE_sequence',
+                                      'QUALITY_unused', 'FILTER_unused', 'INFO_unused', 'FORMAT_unused']) + '\n')
         for i in sorted(variants.keys()):
-            writer.write('\t'.join([sequence_to_analyse, str(i), '.', variants[i]['REF'], variants[i]['ALT'], '.', '.', '.', '.']) + '\n')
+            writer.write('\t'.join([sequence_to_analyse, str(i), '.', variants[i]['REF'], variants[i]['ALT'], '.', '.',
+                                    '.', '.']) + '\n')
 
     compressed_vcf_file = vcf_file + '.gz'
     command = ['bcftools', 'convert', '-o', compressed_vcf_file, '-O', 'z', vcf_file]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
     if run_successfully:
         command = ['bcftools', 'index', compressed_vcf_file]
-        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
 
     if not run_successfully:
         compressed_vcf_file = None
@@ -764,7 +880,7 @@
     return run_successfully, compressed_vcf_file
 
 
-def parse_fasta_inMemory(fasta_memory):
+def parse_fasta_in_memory(fasta_memory):
     fasta_memory = fasta_memory.splitlines()
     sequence_dict = {}
     for line in fasta_memory:
@@ -777,7 +893,7 @@
     return sequence_dict
 
 
-def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir, sufix):
+def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir):
     sequence_dict = None
 
     gene_fasta = os.path.join(outdir, str(sequence_to_analyse + '.fasta'))
@@ -785,56 +901,97 @@
     run_successfully, stdout = index_fasta_samtools(reference_file, sequence_to_analyse, gene_fasta, False)
     if run_successfully:
         command = ['bcftools', 'norm', '-c', 's', '-f', gene_fasta, '-Ov',  compressed_vcf_file]
-        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)    
         if run_successfully:
             command = ['bcftools', 'consensus', '-f', gene_fasta, compressed_vcf_file, '-H', '1']
-            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+            run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
             if run_successfully:
-                sequence_dict = parse_fasta_inMemory(stdout)
+                sequence_dict = parse_fasta_in_memory(stdout)
 
     return run_successfully, sequence_dict
 
 
-def create_sample_consensus_sequence(outdir, sequence_to_analyse, reference_file, variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence, length_extra_seq):
-    variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found = get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence)
+def create_sample_consensus_sequence(outdir, sequence_to_analyse, reference_file, variants, minimum_depth_presence,
+                                     minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence,
+                                     length_extra_seq):
+    variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found = \
+        get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
+                          sequence)
 
-    variants_correct, variants_noMatter, variants_alignment, number_multi_alleles, number_diferences = cleanning_variants_extra_seq(variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found, length_extra_seq, len(sequence))
+    variants_correct, variants_noMatter, variants_alignment, number_multi_alleles, number_diferences = \
+        cleanning_variants_extra_seq(variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found,
+                                     length_extra_seq, len(sequence))
 
     run_successfully = False
     consensus = {'correct': {}, 'noMatter': {}, 'alignment': {}}
     for variant_type in ['variants_correct', 'variants_noMatter', 'variants_alignment']:
-        run_successfully, compressed_vcf_file = write_variants_vcf(eval(variant_type), outdir, sequence_to_analyse, variant_type.split('_', 1)[1])
+        run_successfully, compressed_vcf_file = \
+            write_variants_vcf(eval(variant_type), outdir, sequence_to_analyse, variant_type.split('_', 1)[1])
         if run_successfully:
-            run_successfully, sequence_dict = compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir, variant_type.split('_', 1)[1])
+            run_successfully, sequence_dict = \
+                compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir)
             if run_successfully:
-                consensus[variant_type.split('_', 1)[1]] = {'header': sequence_dict['header'], 'sequence': sequence_dict['sequence'][length_extra_seq:len(sequence_dict['sequence']) - length_extra_seq]}
+                consensus[variant_type.split('_', 1)[1]] = \
+                    {'header': sequence_dict['header'],
+                     'sequence': sequence_dict['sequence'][length_extra_seq:len(sequence_dict['sequence']) -
+                                                                            length_extra_seq]}
 
     return run_successfully, number_multi_alleles, consensus, number_diferences
 
 
 @utils.trace_unhandled_exceptions
-def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads):
+def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq,
+                          minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
     count_absent = None
-    percentage_lowCoverage = None
-    meanCoverage = None
+    percentage_low_coverage = None
+    mean_coverage = None
     number_diferences = 0
+    number_multi_alleles = 0
+    consensus_sequence = {'correct': {}, 'noMatter': {}, 'alignment': {}}
 
     # Create vcf file (for multiple alleles check)
     run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file)
     if run_successfully:
         # Create coverage tab file
-        run_successfully, gene_coverage = compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter)
+        run_successfully, gene_coverage = \
+            compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter)
 
         if run_successfully:
-            variants = get_variants(gene_vcf)
+            try:
+                variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                        encoding=sys.getdefaultencoding())
+            except UnicodeDecodeError:
+                try:
+                    print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
+                          ' "utf_8" encoding: {}'.format(gene_vcf))
+                    variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                            encoding='utf_8')
+                except UnicodeDecodeError:
+                    print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
+                          ' "latin_1" encoding: {}'.format(gene_vcf))
+                    variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                            encoding='latin_1')
 
             coverage = get_coverage(gene_coverage)
 
-            run_successfully, number_multi_alleles, consensus_sequence, number_diferences = create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence_information['sequence'], length_extra_seq)
+            run_successfully, number_multi_alleles, consensus_sequence, number_diferences = \
+                create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants,
+                                                 minimum_depth_presence, minimum_depth_call,
+                                                 minimum_depth_frequency_dominant_allele,
+                                                 sequence_information['sequence'], length_extra_seq)
 
-            count_absent, percentage_lowCoverage, meanCoverage = get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence, minimum_depth_call, length_extra_seq)
+            try:
+                count_absent, percentage_low_coverage, mean_coverage = \
+                    get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence,
+                                        minimum_depth_call, length_extra_seq)
+            except KeyError:
+                print('ERROR: KeyError')
+                print(sequence_information)
+                raise KeyError
 
-    utils.saveVariableToPickle([run_successfully, counter, number_multi_alleles, count_absent, percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences], outdir, str('coverage_info.' + str(counter)))
+    utils.save_variable_to_pickle([run_successfully, counter, number_multi_alleles, count_absent,
+                                   percentage_low_coverage, mean_coverage, consensus_sequence, number_diferences],
+                                  outdir, str('coverage_info.' + str(counter)))
 
 
 def clean_header(header):
@@ -856,21 +1013,23 @@
         sequence_counter = 0
         temp_sequence_dict = {}
         for line in reader:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 if not blank_line_found:
                     if line.startswith('>'):
                         if len(temp_sequence_dict) > 0:
-                            if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0:
-                                sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0]
+                            if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0:
+                                sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0]
                             else:
-                                print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header'])
-                                del headers[temp_sequence_dict.values()[0]['header']]
+                                print('{header} sequence ignored due to'
+                                      ' length = 0'.format(header=list(temp_sequence_dict.values())[0]['header']))
+                                del headers[list(temp_sequence_dict.values())[0]['header']]
                             temp_sequence_dict = {}
 
                         original_header, new_header = clean_header(line[1:])
                         if new_header in headers:
-                            sys.exit('Found duplicated sequence headers: {original_header}'.format(original_header=original_header))
+                            sys.exit('Found duplicated sequence'
+                                     ' headers: {original_header}'.format(original_header=original_header))
 
                         sequence_counter += 1
                         temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0}
@@ -878,49 +1037,47 @@
                         if new_header != original_header:
                             headers_changed = True
                     else:
-                        temp_sequence_dict[sequence_counter]['sequence'] += line
-                        temp_sequence_dict[sequence_counter]['length'] += len(line)
+                        temp_sequence_dict[sequence_counter]['sequence'] += line.replace(' ', '')
+                        temp_sequence_dict[sequence_counter]['length'] += len(line.replace(' ', ''))
                 else:
                     sys.exit('It was found a blank line between the fasta file above line ' + line)
             else:
                 blank_line_found = True
 
         if len(temp_sequence_dict) > 0:
-            if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0:
-                sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0]
+            if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0:
+                sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0]
             else:
-                print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header'])
-                del headers[temp_sequence_dict.values()[0]['header']]
+                print('{header} sequence ignored due to'
+                      ' length <= 0'.format(header=list(temp_sequence_dict.values())[0]['header']))
+                del headers[list(temp_sequence_dict.values())[0]['header']]
 
     return sequence_dict, headers, headers_changed
 
 
-def determine_threads_2_use(number_sequences, threads):
-    if number_sequences >= threads:
-        return 1
-    else:
-        return threads / number_sequences
-
-
-def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, notWriteConsensus):
+def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence,
+                  minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, not_write_consensus):
     sequence_data_outdir = os.path.join(outdir, 'sequence_data', '')
-    utils.removeDirectory(sequence_data_outdir)
+    utils.remove_directory(sequence_data_outdir)
     os.mkdir(sequence_data_outdir)
 
     sequences, headers, headers_changed = get_sequence_information(reference_file, length_extra_seq)
 
-    threads_2_use = determine_threads_2_use(len(sequences), threads)
-
     pool = multiprocessing.Pool(processes=threads)
     for sequence_counter in sequences:
         sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '')
-        utils.removeDirectory(sequence_dir)
+        utils.remove_directory(sequence_dir)
         os.makedirs(sequence_dir)
-        pool.apply_async(analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads_2_use,))
+        pool.apply_async(analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir,
+                                                      sequence_counter, reference_file, length_extra_seq,
+                                                      minimum_depth_presence, minimum_depth_call,
+                                                      minimum_depth_frequency_dominant_allele,))
     pool.close()
     pool.join()
 
-    run_successfully, sample_data, consensus_files, consensus_sequences = gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, notWriteConsensus)
+    run_successfully, sample_data, consensus_files, consensus_sequences = \
+        gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true,
+                             length_extra_seq, not_write_consensus)
 
     return run_successfully, sample_data, consensus_files, consensus_sequences
 
@@ -941,7 +1098,8 @@
     return consensus_files
 
 
-def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq, notWriteConsensus):
+def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq,
+                         not_write_consensus):
     run_successfully = True
     counter = 0
     sample_data = {}
@@ -951,21 +1109,29 @@
 
     write_consensus_first_time = True
 
-    genes_directories = [d for d in os.listdir(data_directory) if not d.startswith('.') and os.path.isdir(os.path.join(data_directory, d, ''))]
+    genes_directories = [d for d in os.listdir(data_directory) if
+                         not d.startswith('.') and
+                         os.path.isdir(os.path.join(data_directory, d, ''))]
     for gene_dir in genes_directories:
         gene_dir_path = os.path.join(data_directory, gene_dir, '')
 
-        files = [f for f in os.listdir(gene_dir_path) if not f.startswith('.') and os.path.isfile(os.path.join(gene_dir_path, f))]
+        files = [f for f in os.listdir(gene_dir_path) if
+                 not f.startswith('.') and
+                 os.path.isfile(os.path.join(gene_dir_path, f))]
         for file_found in files:
             if file_found.startswith('coverage_info.') and file_found.endswith('.pkl'):
                 file_path = os.path.join(gene_dir_path, file_found)
 
                 if run_successfully:
-                    run_successfully, sequence_counter, multiple_alleles_found, count_absent, percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences = utils.extractVariableFromPickle(file_path)
+                    run_successfully, sequence_counter, multiple_alleles_found, count_absent, percentage_low_coverage, \
+                        mean_coverage, consensus_sequence, \
+                        number_diferences = utils.extract_variable_from_pickle(file_path)
 
-                    if not notWriteConsensus:
+                    if not not_write_consensus:
                         for consensus_type in consensus_sequence:
-                            consensus_sequences_together[consensus_type][sequence_counter] = {'header': consensus_sequence[consensus_type]['header'], 'sequence': consensus_sequence[consensus_type]['sequence']}
+                            consensus_sequences_together[consensus_type][sequence_counter] = \
+                                {'header': consensus_sequence[consensus_type]['header'],
+                                 'sequence': consensus_sequence[consensus_type]['sequence']}
 
                         if write_consensus_first_time:
                             for consensus_type in ['correct', 'noMatter', 'alignment']:
@@ -977,13 +1143,24 @@
 
                     gene_identity = 0
                     if sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent > 0:
-                        gene_identity = 100 - (float(number_diferences) / (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent)) * 100
+                        gene_identity = 100 - \
+                                        (float(number_diferences) /
+                                         (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq -
+                                          count_absent)) * 100
 
-                    sample_data[sequence_counter] = {'header': sequences_information[sequence_counter]['header'], 'gene_coverage': 100 - (float(count_absent) / (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq)) * 100, 'gene_low_coverage': percentage_lowCoverage, 'gene_number_positions_multiple_alleles': multiple_alleles_found, 'gene_mean_read_coverage': meanCoverage, 'gene_identity': gene_identity}
+                    sample_data[sequence_counter] = \
+                        {'header': sequences_information[sequence_counter]['header'],
+                         'gene_coverage': 100 - (float(count_absent) /
+                                                 (sequences_information[sequence_counter]['length'] - 2 *
+                                                  length_extra_seq)) * 100,
+                         'gene_low_coverage': percentage_low_coverage,
+                         'gene_number_positions_multiple_alleles': multiple_alleles_found,
+                         'gene_mean_read_coverage': mean_coverage,
+                         'gene_identity': gene_identity}
                     counter += 1
 
         if not debug_mode_true:
-            utils.removeDirectory(gene_dir_path)
+            utils.remove_directory(gene_dir_path)
 
     if counter != len(sequences_information):
         run_successfully = False
@@ -995,32 +1172,56 @@
 
 
 @rematch_timer
-def runRematchModule(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, conserved_True, debug_mode_true, numMapLoc, minimum_gene_identity, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT, gene_list_reference, notWriteConsensus):
+def run_rematch_module(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence,
+                       minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
+                       debug_mode_true, num_map_loc, minimum_gene_identity, rematch_run,
+                       soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode,
+                       bowtie_algorithm, bowtie_opt, gene_list_reference, not_write_consensus, clean_run=True):
     rematch_folder = os.path.join(outdir, 'rematch_module', '')
-    utils.removeDirectory(rematch_folder)
+
+    utils.remove_directory(rematch_folder)
     os.mkdir(rematch_folder)
 
     # Map reads
-    run_successfully, bam_file, reference_file = mapping_reads(fastq_files, reference_file, threads, rematch_folder, conserved_True, numMapLoc, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT)
-
+    run_successfully, bam_file, reference_file = mapping_reads(fastq_files=fastq_files, reference_file=reference_file,
+                                                               threads=threads, outdir=rematch_folder,
+                                                               num_map_loc=num_map_loc, rematch_run=rematch_run,
+                                                               soft_clip_base_quality=soft_clip_base_quality,
+                                                               soft_clip_recode_run=soft_clip_recode_run,
+                                                               reference_dict=reference_dict,
+                                                               soft_clip_cigar_flag_recode=soft_clip_cigar_flag_recode,
+                                                               bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt,
+                                                               clean_run=clean_run)
     if run_successfully:
         # Index reference file
         run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True)
         if run_successfully:
-            print 'Analysing alignment data'
-            run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, notWriteConsensus)
+            print('Analysing alignment data')
+            run_successfully, sample_data, consensus_files, consensus_sequences = \
+                sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq,
+                              minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
+                              debug_mode_true, not_write_consensus)
 
             if run_successfully:
-                print 'Writing report file'
+                print('Writing report file')
                 number_absent_genes = 0
                 number_genes_multiple_alleles = 0
                 mean_sample_coverage = 0
                 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer:
-                    writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n')
+                    print(os.path.join(outdir, 'rematchModule_report.txt'))
+                    writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage',
+                                            'percentage_gene_low_coverage', 'number_positions_multiple_alleles',
+                                            'percentage_gene_identity']) + '\n')
                     for i in range(1, len(sample_data) + 1):
-                        writer.write('\t'.join([gene_list_reference[sample_data[i]['header']], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n')
+                        writer.write('\t'.join([gene_list_reference[sample_data[i]['header']],
+                                                str(round(sample_data[i]['gene_coverage'], 2)),
+                                                str(round(sample_data[i]['gene_mean_read_coverage'], 2)),
+                                                str(round(sample_data[i]['gene_low_coverage'], 2)),
+                                                str(sample_data[i]['gene_number_positions_multiple_alleles']),
+                                                str(round(sample_data[i]['gene_identity'], 2))]) + '\n')
 
-                        if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity:
+                        if sample_data[i]['gene_coverage'] < minimum_gene_coverage or \
+                                sample_data[i]['gene_identity'] < minimum_gene_identity:
                             number_absent_genes += 1
                         else:
                             mean_sample_coverage += sample_data[i]['gene_mean_read_coverage']
@@ -1028,15 +1229,27 @@
                                 number_genes_multiple_alleles += 1
 
                     if len(sample_data) - number_absent_genes > 0:
-                        mean_sample_coverage = float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes)
+                        mean_sample_coverage = \
+                            float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes)
                     else:
                         mean_sample_coverage = 0
 
-                    writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n')
+                    writer.write('\n'.join(['#general',
+                                            '>number_absent_genes', str(number_absent_genes),
+                                            '>number_genes_multiple_alleles', str(number_genes_multiple_alleles),
+                                            '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n')
 
-                    print '\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))])
+                    print('\n'.join([str('number_absent_genes: ' + str(number_absent_genes)),
+                                     str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)),
+                                     str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]))
 
     if not debug_mode_true:
-        utils.removeDirectory(rematch_folder)
+        utils.remove_directory(rematch_folder)
 
-    return run_successfully, sample_data if 'sample_data' in locals() else None, {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None, 'number_genes_multiple_alleles': number_genes_multiple_alleles if 'number_genes_multiple_alleles' in locals() else None, 'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, consensus_files if 'consensus_files' in locals() else None, consensus_sequences if 'consensus_sequences' in locals() else None
+    return run_successfully, sample_data if 'sample_data' in locals() else None, \
+           {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None,
+            'number_genes_multiple_alleles': number_genes_multiple_alleles if
+            'number_genes_multiple_alleles' in locals() else None,
+            'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, \
+           consensus_files if 'consensus_files' in locals() else None,\
+           consensus_sequences if 'consensus_sequences' in locals() else None