Mercurial > repos > artbio > repenrich
diff RepEnrich.py @ 12:89e05f831259 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
| author | artbio |
|---|---|
| date | Mon, 18 Mar 2024 09:39:44 +0000 |
| parents | 6f4143893463 |
| children | 530626b0757c |
line wrap: on
line diff
--- a/RepEnrich.py Sat Mar 09 22:32:46 2024 +0000 +++ b/RepEnrich.py Mon Mar 18 09:39:44 2024 +0000 @@ -2,7 +2,6 @@ import csv import os import shlex -import shutil import subprocess import sys @@ -10,86 +9,41 @@ parser = argparse.ArgumentParser(description=''' - Part II: Conducting the alignments to the psuedogenomes. Before\ - doing this step you will require 1) a bamfile of the unique\ - alignments with index 2) a fastq file of the reads mapping to\ - more than one location. These files can be obtained using the\ - following bowtie options [EXAMPLE: bowtie -S -m 1\ - --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\ - unique alignment bamfile and the reads mapping to more than one\ - location in a fastq file you can run this step. EXAMPLE: python\ - master_output.py\ - /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\ - /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/ - HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\ - /users/nneretti/data/annotation/hg19/setup_folder\ - HeLa_InputChIPseq_Rep1_multimap.fastq\ - HeLa_InputChIPseq_Rep1.bam''') -parser.add_argument('--version', action='version', version='%(prog)s 0.1') -parser.add_argument('annotation_file', action='store', + Repenrich aligns reads to Repeat Elements pseudogenomes\ + and counts aligned reads. RepEnrich_setup must be run\ + before its use''') +parser.add_argument('--annotation_file', action='store', metavar='annotation_file', - help='List RepeatMasker.org annotation file for your\ - organism. The file may be downloaded from the\ - RepeatMasker.org website. 
Example:\ - /data/annotation/hg19/hg19_repeatmasker.txt') -parser.add_argument('outputfolder', action='store', metavar='outputfolder', - help='List folder to contain results.\ - Example: /outputfolder') -parser.add_argument('outputprefix', action='store', metavar='outputprefix', - help='Enter prefix name for data.\ - Example: HeLa_InputChIPseq_Rep1') -parser.add_argument('setup_folder', action='store', metavar='setup_folder', - help='List folder that contains the repeat element\ - pseudogenomes.\ - Example: /data/annotation/hg19/setup_folder') -parser.add_argument('fastqfile', action='store', metavar='fastqfile', - help='Enter file for the fastq reads that map to multiple\ + help='RepeatMasker.org annotation file for your\ + organism. The file may be downloaded from\ + RepeatMasker.org. E.g. hg19_repeatmasker.txt') +parser.add_argument('--outputfolder', action='store', metavar='outputfolder', + help='Folder that will contain results. Should be the\ + same as the one used for RepEnrich_setup.\ + Example: ./outputfolder') +parser.add_argument('--outputprefix', action='store', metavar='outputprefix', + help='Prefix name for Repenrich output files.') +parser.add_argument('--setup_folder', action='store', metavar='setup_folder', + help='Folder produced by RepEnrich_setup which contains\ + repeat element pseudogenomes.') +parser.add_argument('--fastqfile', action='store', metavar='fastqfile', + help='File of fastq reads mapping to multiple\ locations. 
Example: /data/multimap.fastq') -parser.add_argument('alignment_bam', action='store', metavar='alignment_bam', - help='Enter bamfile output for reads that map uniquely.\ - Example /bamfiles/old.bam') +parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam', + help='Bam alignments of unique mapper reads.') parser.add_argument('--pairedend', action='store', dest='pairedend', default='FALSE', - help='Designate this option for paired-end sequencing.\ - Default FALSE change to TRUE') -parser.add_argument('--collapserepeat', action='store', dest='collapserepeat', - metavar='collapserepeat', default='Simple_repeat', - help='Designate this option to generate a collapsed repeat\ - type. Uncollapsed output is generated in addition to\ - collapsed repeat type. Simple_repeat is default to\ - simplify downstream analysis. You can change the\ - default to another repeat name to collapse a\ - seperate specific repeat instead or if the name of\ - Simple_repeat is different for your organism.\ - Default Simple_repeat') + help='Change to TRUE for paired-end fastq files.\ + Default FALSE') parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', metavar='fastqfile2', default='none', - help='Enter fastqfile2 when using paired-end option.\ + help='fastqfile #2 when using paired-end option.\ Default none') parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', default="1", type=int, - help='Enter available cpus per node. The more cpus the\ - faster RepEnrich performs. RepEnrich is designed to\ - only work on one node. Default: "1"') -parser.add_argument('--allcountmethod', action='store', dest='allcountmethod', - metavar='allcountmethod', default="FALSE", - help='By default the pipeline only outputs the fraction\ - count method. Consdidered to be the best way to\ - count multimapped reads. 
Changing this option will\ - include the unique count method, a conservative\ - count, and the total count method, a liberal\ - counting strategy. Our evaluation of simulated data\ - indicated fraction counting is best.\ - Default = FALSE, change to TRUE') -parser.add_argument('--is_bed', action='store', dest='is_bed', - metavar='is_bed', default='FALSE', - help='Is the annotation file a bed file.\ - This is also a compatible format. The file needs to\ - be a tab seperated bed with optional fields.\ - Ex. format: chr\tstart\tend\tName_element\tclass\ - \tfamily. The class and family should identical to\ - name_element if not applicable.\ - Default FALSE change to TRUE') + help='Number of CPUs. The more cpus the\ + faster RepEnrich performs. Default: "1"') + args = parser.parse_args() # parameters @@ -97,18 +51,16 @@ outputfolder = args.outputfolder outputfile_prefix = args.outputprefix setup_folder = args.setup_folder -repeat_bed = setup_folder + os.path.sep + 'repnames.bed' +repeat_bed = os.path.join(setup_folder, 'repnames.bed') unique_mapper_bam = args.alignment_bam fastqfile_1 = args.fastqfile fastqfile_2 = args.fastqfile2 cpus = args.cpus -b_opt = "-k1 -p " + str(1) + " --quiet" -simple_repeat = args.collapserepeat +b_opt = "-k1 -p 1 --quiet" +# Change if simple repeats are differently annotated in your organism +simple_repeat = "Simple_repeat" paired_end = args.pairedend -allcountmethod = args.allcountmethod -is_bed = args.is_bed -############################################################################## # check that the programs we need are available try: subprocess.call(shlex.split("coverageBed -h"), @@ -118,10 +70,9 @@ stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: - print("Error: Bowtie or BEDTools not loaded") + print("Error: Bowtie or bedtools not loaded") raise -############################################################################## # define a csv reader that reads space deliminated files print('Preparing for 
analysis using RepEnrich...') csv.field_size_limit(sys.maxsize) @@ -134,142 +85,144 @@ yield line -############################################################################## -# build dictionaries to convert repclass and rep families' -if is_bed == "FALSE": - repeatclass = {} - repeatfamily = {} - fin = import_text(annotation_file, ' ') - x = 0 - for line in fin: - if x > 2: - classfamily = [] - classfamily = line[10].split(os.path.sep) - line9 = line[9].replace("(", "_").replace( - ")", "_").replace("/", "_") - repeatclass[line9] = classfamily[0] - if len(classfamily) == 2: - repeatfamily[line9] = classfamily[1] - else: - repeatfamily[line9] = classfamily[0] - x += 1 -if is_bed == "TRUE": - repeatclass = {} - repeatfamily = {} - fin = open(annotation_file, 'r') - for line in fin: - line = line.strip('\n') - line = line.split('\t') - theclass = line[4] - thefamily = line[5] - line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") - repeatclass[line3] = theclass - repeatfamily[line3] = thefamily -fin.close() +# build dictionaries to convert repclass and rep families +repeatclass, repeatfamily = {}, {} +repeats = import_text(annotation_file, ' ') +# skip three first lines of the iterator +for line in range(3): + next(repeats) +for repeat in repeats: + classfamily = [] + classfamily = repeat[10].split('/') + matching_repeat = repeat[9].translate(str.maketrans('()/', '___')) + repeatclass[matching_repeat] = classfamily[0] + if len(classfamily) == 2: + repeatfamily[matching_repeat] = classfamily[1] + else: + repeatfamily[matching_repeat] = classfamily[0] -############################################################################## # build list of repeats initializing dictionaries for downstream analysis' -fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') -repeat_key = {} -rev_repeat_key = {} -repeat_list = [] -reptotalcounts = {} -classfractionalcounts = {} -familyfractionalcounts = {} -classtotalcounts = {} 
-familytotalcounts = {} -reptotalcounts_simple = {} -fractionalcounts = {} -i = 0 -for line in fin: - reptotalcounts[line[0]] = 0 - fractionalcounts[line[0]] = 0 - if line[0] in repeatclass: - classtotalcounts[repeatclass[line[0]]] = 0 - classfractionalcounts[repeatclass[line[0]]] = 0 - if line[0] in repeatfamily: - familytotalcounts[repeatfamily[line[0]]] = 0 - familyfractionalcounts[repeatfamily[line[0]]] = 0 - if line[0] in repeatfamily: - if repeatfamily[line[0]] == simple_repeat: - reptotalcounts_simple[simple_repeat] = 0 - else: - reptotalcounts_simple[line[0]] = 0 - repeat_list.append(line[0]) - repeat_key[line[0]] = int(line[1]) - rev_repeat_key[int(line[1])] = line[0] -fin.close() -############################################################################## +repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt') +reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} +fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} +classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatclass} +classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatclass} +familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatfamily} +familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatfamily} +reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and + repeatfamily[line[0]] == simple_repeat else + line[0]): 0 for line in import_text( + repgenome_path, '\t')} +repeat_key = {line[0]: int(line[1]) for line in import_text( + repgenome_path, '\t')} +rev_repeat_key = {int(line[1]): line[0] for line in import_text( + repgenome_path, '\t')} +repeat_list = [line[0] for line in import_text(repgenome_path, '\t')] + # map the repeats to the psuedogenomes: if not 
os.path.exists(outputfolder): os.mkdir(outputfolder) -############################################################################## + # Conduct the regions sorting -print('Conducting region sorting on unique mapping reads....') -fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' +fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt") +command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \ + {os.path.join(setup_folder, 'repnames.bed')}") with open(fileout, 'w') as stdout: - command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " + - setup_folder + os.path.sep + 'repnames.bed') - p = subprocess.Popen(command, stdout=stdout) -p.communicate() -stdout.close() -filein = open(outputfolder + os.path.sep + outputfile_prefix - + '_regionsorter.txt', 'r') + subprocess.run(command, stdout=stdout, check=True) + counts = {} sumofrepeatreads = 0 -for line in filein: - line = line.split('\t') - if not str(repeat_key[line[3]]) in counts: - counts[str(repeat_key[line[3]])] = 0 - counts[str(repeat_key[line[3]])] += int(line[4]) - sumofrepeatreads += int(line[4]) -print('Identified ' + str(sumofrepeatreads) + - 'unique reads that mapped to repeats.') -############################################################################## -if paired_end == 'TRUE': - if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') - if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') - folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' - folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' -############################################################################## - print("Processing repeat psuedogenomes...") - ps = [] - psb = [] +with open(fileout) as filein: + for line in filein: + line = line.split('\t') + if not str(repeat_key[line[3]]) in counts: + 
counts[str(repeat_key[line[3]])] = 0 + counts[str(repeat_key[line[3]])] += int(line[4]) + sumofrepeatreads += int(line[4]) + print(f"Identified {sumofrepeatreads} unique reads that \ + mapped to repeats.") + + +def run_bowtie(metagenome, fastqfile, folder): + metagenomepath = os.path.join(setup_folder, metagenome) + output_file = os.path.join(folder, f"{metagenome}.bowtie") + command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}") + with open(output_file, 'w') as stdout: + return subprocess.Popen(command, stdout=stdout) + + +if paired_end == 'FALSE': + folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') + os.makedirs(folder_pair1, exist_ok=True) + + print("Processing repeat pseudogenomes...") + processes = [] ticker = 0 + for metagenome in repeat_list: - metagenomepath = setup_folder + os.path.sep + metagenome - file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' - file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' - with open(file1, 'w') as stdout: - command = shlex.split("bowtie " + b_opt + " " + - metagenomepath + " " + fastqfile_1) - p = subprocess.Popen(command, stdout=stdout) - with open(file2, 'w') as stdout: - command = shlex.split("bowtie " + b_opt + " " + - metagenomepath + " " + fastqfile_2) - pp = subprocess.Popen(command, stdout=stdout) - ps.append(p) - ticker += 1 - psb.append(pp) + processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) ticker += 1 if ticker == cpus: + for p in processes: + p.communicate() + ticker = 0 + processes = [] + + for p in processes: + p.communicate() + # Combine the output from both read pairs: + print('Sorting and combining the output for both read pairs....') + sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie') + os.makedirs(sorted_bowtie, exist_ok=True) + for metagenome in repeat_list: + file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie") + fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie") + with open(fileout, 'w') as stdout: + p1 = 
subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) + p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, + stdout=subprocess.PIPE) + p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, + stdout=subprocess.PIPE) + p4 = subprocess.Popen(['sort'], stdin=p3.stdout, + stdout=subprocess.PIPE) + p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) + p5.communicate() + stdout.close() + print('completed ...') +else: + folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') + folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie') + os.makedirs(folder_pair1, exist_ok=True) + os.makedirs(folder_pair2, exist_ok=True) + + print("Processing repeat pseudogenomes...") + ps, psb, ticker = [], [], 0 + + for metagenome in repeat_list: + ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) + ticker += 1 + if fastqfile_2 != 'none': + psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2)) + ticker += 1 + if ticker >= cpus: for p in ps: p.communicate() for p in psb: p.communicate() ticker = 0 + ps = [] psb = [] - ps = [] - if len(ps) > 0: - for p in ps: - p.communicate() - stdout.close() -############################################################################## -# combine the output from both read pairs: - print('sorting and combining the output for both read pairs...') + for p in ps: + p.communicate() + for p in psb: + p.communicate() + # combine the output from both read pairs: + print('Sorting and combining the output for both read pairs...') if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' @@ -290,108 +243,43 @@ p5.communicate() stdout.close() print('completed ...') -############################################################################## -if paired_end == 'FALSE': - if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') - 
folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' -############################################################################## - ps = [] - ticker = 0 - print("Processing repeat psuedogenomes...") - for metagenome in repeat_list: - metagenomepath = setup_folder + os.path.sep + metagenome - file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' - with open(file1, 'w') as stdout: - command = shlex.split("bowtie " + b_opt + " " + - metagenomepath + " " + fastqfile_1) - p = subprocess.Popen(command, stdout=stdout) - ps.append(p) - ticker += 1 - if ticker == cpus: - for p in ps: - p.communicate() - ticker = 0 - ps = [] - if len(ps) > 0: - for p in ps: - p.communicate() - stdout.close() -############################################################################## -# combine the output from both read pairs: - print('Sorting and combining the output for both read pairs....') - if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') - sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' - for metagenome in repeat_list: - file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' - fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' - with open(fileout, 'w') as stdout: - p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, - stdout=subprocess.PIPE) - p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, - stdout=subprocess.PIPE) - p4 = subprocess.Popen(['sort'], stdin=p3.stdout, - stdout=subprocess.PIPE) - p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) - p5.communicate() - stdout.close() - print('completed ...') - -############################################################################## # build a file of repeat keys for all reads print('Writing and processing intermediate files...') -sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' +sorted_bowtie = 
os.path.join(outputfolder, 'sorted_bowtie') +sumofrepeatreads = 0 readid = {} -sumofrepeatreads = 0 + for rep in repeat_list: - for data in import_text(sorted_bowtie + os.path.sep + - rep + '.bowtie', '\t'): + for data in import_text( + f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): readid[data[0]] = '' + for rep in repeat_list: - for data in import_text(sorted_bowtie + os.path.sep - + rep + '.bowtie', '\t'): - readid[data[0]] += str(repeat_key[rep]) + str(',') + for data in import_text( + f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): + readid[data[0]] += f"{repeat_key[rep]}," + for subfamilies in readid.values(): if subfamilies not in counts: counts[subfamilies] = 0 counts[subfamilies] += 1 sumofrepeatreads += 1 -del readid -print('Identified ' + str(sumofrepeatreads) + - ' reads that mapped to repeats for unique and multimappers.') -############################################################################## +print(f'Identified {sumofrepeatreads} reads that mapped to \ + repeats for unique and multimappers.') print("Conducting final calculations...") - -def convert(x): - ''' - build a converter to numeric label for repeat and yield a combined list - of repnames seperated by backslash - ''' - x = x.strip(',') - x = x.split(',') - global repname - repname = "" - for i in x: - repname = repname + os.path.sep + rev_repeat_key[int(i)] - - # building the total counts for repeat element enrichment... for x in counts.keys(): count = counts[x] - x = x.strip(',') - x = x.split(',') + x = x.strip(',').split(',') for i in x: reptotalcounts[rev_repeat_key[int(i)]] += int(count) # building the fractional counts for repeat element enrichment... 
for x in counts.keys(): count = counts[x] - x = x.strip(',') - x = x.split(',') + x = x.strip(',') .split(',') splits = len(x) for i in x: fractionalcounts[rev_repeat_key[int(i)]] += float( @@ -400,7 +288,10 @@ repcounts = {} repcounts['other'] = 0 for key in counts.keys(): - convert(key) + key_list = key.strip(',').split(',') + repname = '' + for i in key_list: + repname = os.path.join(repname, rev_repeat_key[int(i)]) repcounts[repname] = counts[key] # building the total counts for class enrichment... for key in reptotalcounts.keys(): @@ -408,7 +299,7 @@ # building total counts for family enrichment... for key in reptotalcounts.keys(): familytotalcounts[repeatfamily[key]] += reptotalcounts[key] -# building unique counts table' +# building unique counts table repcounts2 = {} for rep in repeat_list: if "/" + rep in repcounts: @@ -422,78 +313,20 @@ for key in fractionalcounts.keys(): familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] -############################################################################## -print('Writing final output and removing intermediate files...') # print output to file of the categorized counts and total overlapping counts: -if allcountmethod == "TRUE": - fout1 = open(outputfolder + os.path.sep + outputfile_prefix - + '_total_counts.txt', 'w') - for key in sorted(reptotalcounts.keys()): - fout1.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(reptotalcounts[key]) - + '\n') - fout2 = open(outputfolder + os.path.sep + outputfile_prefix - + '_class_total_counts.txt', 'w') - for key in sorted(classtotalcounts.keys()): - fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n') - fout3 = open(outputfolder + os.path.sep + outputfile_prefix - + '_family_total_counts.txt', 'w') - for key in sorted(familytotalcounts.keys()): - fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n') - fout4 = open(outputfolder + os.path.sep + outputfile_prefix + - '_unique_counts.txt', 'w') - for 
key in sorted(repcounts2.keys()): - fout4.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n') - fout5 = open(outputfolder + os.path.sep + outputfile_prefix - + '_class_fraction_counts.txt', 'w') +print('Writing final output...') +with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" + f"class_fraction_counts.txt", 'w') as fout: for key in sorted(classfractionalcounts.keys()): - fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') - fout6 = open(outputfolder + os.path.sep + outputfile_prefix + - '_family_fraction_counts.txt', 'w') + fout.write(f"{key}\t{classfractionalcounts[key]}\n") + +with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" + f"family_fraction_counts.txt", 'w') as fout: for key in sorted(familyfractionalcounts.keys()): - fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') - fout7 = open(outputfolder + os.path.sep + outputfile_prefix - + '_fraction_counts.txt', 'w') + fout.write(f"{key}\t{familyfractionalcounts[key]}\n") + +with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" + f"fraction_counts.txt", 'w') as fout: for key in sorted(fractionalcounts.keys()): - fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) - + '\n') - fout1.close() - fout2.close() - fout3.close() - fout4.close() - fout5.close() - fout6.close() - fout7.close() -else: - fout1 = open(outputfolder + os.path.sep + outputfile_prefix + - '_class_fraction_counts.txt', 'w') - for key in sorted(classfractionalcounts.keys()): - fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') - fout2 = open(outputfolder + os.path.sep + outputfile_prefix + - '_family_fraction_counts.txt', 'w') - for key in sorted(familyfractionalcounts.keys()): - fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') - fout3 = open(outputfolder + os.path.sep + outputfile_prefix + - '_fraction_counts.txt', 
'w') - for key in sorted(fractionalcounts.keys()): - fout3.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) - + '\n') - fout1.close() - fout2.close() - fout3.close() -############################################################################## -# Remove Large intermediate files -if os.path.exists(outputfolder + os.path.sep + outputfile_prefix + - '_regionsorter.txt'): - os.remove(outputfolder + os.path.sep + outputfile_prefix + - '_regionsorter.txt') -if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): - shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie') -if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): - shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie') -if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): - shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie') -print("... Done") + fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t" + f"{int(fractionalcounts[key])}\n")