Mercurial > repos > artbio > repenrich
diff RepEnrich.py @ 12:89e05f831259 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
| author | artbio |
|---|---|
| date | Mon, 18 Mar 2024 09:39:44 +0000 |
| parents | 6f4143893463 |
| children | 530626b0757c |
line wrap: on
line diff
--- a/RepEnrich.py Sat Mar 09 22:32:46 2024 +0000 +++ b/RepEnrich.py Mon Mar 18 09:39:44 2024 +0000 @@ -2,7 +2,6 @@ import csv import os import shlex -import shutil import subprocess import sys @@ -10,86 +9,41 @@ parser = argparse.ArgumentParser(description=''' - Part II: Conducting the alignments to the psuedogenomes. Before\ - doing this step you will require 1) a bamfile of the unique\ - alignments with index 2) a fastq file of the reads mapping to\ - more than one location. These files can be obtained using the\ - following bowtie options [EXAMPLE: bowtie -S -m 1\ - --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\ - unique alignment bamfile and the reads mapping to more than one\ - location in a fastq file you can run this step. EXAMPLE: python\ - master_output.py\ - /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\ - /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/ - HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\ - /users/nneretti/data/annotation/hg19/setup_folder\ - HeLa_InputChIPseq_Rep1_multimap.fastq\ - HeLa_InputChIPseq_Rep1.bam''') -parser.add_argument('--version', action='version', version='%(prog)s 0.1') -parser.add_argument('annotation_file', action='store', + Repenrich aligns reads to Repeat Elements pseudogenomes\ + and counts aligned reads. RepEnrich_setup must be run\ + before its use''') +parser.add_argument('--annotation_file', action='store', metavar='annotation_file', - help='List RepeatMasker.org annotation file for your\ - organism. The file may be downloaded from the\ - RepeatMasker.org website. 
Example:\ - /data/annotation/hg19/hg19_repeatmasker.txt') -parser.add_argument('outputfolder', action='store', metavar='outputfolder', - help='List folder to contain results.\ - Example: /outputfolder') -parser.add_argument('outputprefix', action='store', metavar='outputprefix', - help='Enter prefix name for data.\ - Example: HeLa_InputChIPseq_Rep1') -parser.add_argument('setup_folder', action='store', metavar='setup_folder', - help='List folder that contains the repeat element\ - pseudogenomes.\ - Example: /data/annotation/hg19/setup_folder') -parser.add_argument('fastqfile', action='store', metavar='fastqfile', - help='Enter file for the fastq reads that map to multiple\ + help='RepeatMasker.org annotation file for your\ + organism. The file may be downloaded from\ + RepeatMasker.org. E.g. hg19_repeatmasker.txt') +parser.add_argument('--outputfolder', action='store', metavar='outputfolder', + help='Folder that will contain results. Should be the\ + same as the one used for RepEnrich_setup.\ + Example: ./outputfolder') +parser.add_argument('--outputprefix', action='store', metavar='outputprefix', + help='Prefix name for Repenrich output files.') +parser.add_argument('--setup_folder', action='store', metavar='setup_folder', + help='Folder produced by RepEnrich_setup which contains\ + repeat element pseudogenomes.') +parser.add_argument('--fastqfile', action='store', metavar='fastqfile', + help='File of fastq reads mapping to multiple\ locations. 
Example: /data/multimap.fastq') -parser.add_argument('alignment_bam', action='store', metavar='alignment_bam', - help='Enter bamfile output for reads that map uniquely.\ - Example /bamfiles/old.bam') +parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam', + help='Bam alignments of unique mapper reads.') parser.add_argument('--pairedend', action='store', dest='pairedend', default='FALSE', - help='Designate this option for paired-end sequencing.\ - Default FALSE change to TRUE') -parser.add_argument('--collapserepeat', action='store', dest='collapserepeat', - metavar='collapserepeat', default='Simple_repeat', - help='Designate this option to generate a collapsed repeat\ - type. Uncollapsed output is generated in addition to\ - collapsed repeat type. Simple_repeat is default to\ - simplify downstream analysis. You can change the\ - default to another repeat name to collapse a\ - seperate specific repeat instead or if the name of\ - Simple_repeat is different for your organism.\ - Default Simple_repeat') + help='Change to TRUE for paired-end fastq files.\ + Default FALSE') parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', metavar='fastqfile2', default='none', - help='Enter fastqfile2 when using paired-end option.\ + help='fastqfile #2 when using paired-end option.\ Default none') parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', default="1", type=int, - help='Enter available cpus per node. The more cpus the\ - faster RepEnrich performs. RepEnrich is designed to\ - only work on one node. Default: "1"') -parser.add_argument('--allcountmethod', action='store', dest='allcountmethod', - metavar='allcountmethod', default="FALSE", - help='By default the pipeline only outputs the fraction\ - count method. Consdidered to be the best way to\ - count multimapped reads. 
Changing this option will\ - include the unique count method, a conservative\ - count, and the total count method, a liberal\ - counting strategy. Our evaluation of simulated data\ - indicated fraction counting is best.\ - Default = FALSE, change to TRUE') -parser.add_argument('--is_bed', action='store', dest='is_bed', - metavar='is_bed', default='FALSE', - help='Is the annotation file a bed file.\ - This is also a compatible format. The file needs to\ - be a tab seperated bed with optional fields.\ - Ex. format: chr\tstart\tend\tName_element\tclass\ - \tfamily. The class and family should identical to\ - name_element if not applicable.\ - Default FALSE change to TRUE') + help='Number of CPUs. The more cpus the\ + faster RepEnrich performs. Default: "1"') + args = parser.parse_args() # parameters @@ -97,18 +51,16 @@ outputfolder = args.outputfolder outputfile_prefix = args.outputprefix setup_folder = args.setup_folder -repeat_bed = setup_folder + os.path.sep + 'repnames.bed' +repeat_bed = os.path.join(setup_folder, 'repnames.bed') unique_mapper_bam = args.alignment_bam fastqfile_1 = args.fastqfile fastqfile_2 = args.fastqfile2 cpus = args.cpus -b_opt = "-k1 -p " + str(1) + " --quiet" -simple_repeat = args.collapserepeat +b_opt = "-k1 -p 1 --quiet" +# Change if simple repeats are differently annotated in your organism +simple_repeat = "Simple_repeat" paired_end = args.pairedend -allcountmethod = args.allcountmethod -is_bed = args.is_bed -############################################################################## # check that the programs we need are available try: subprocess.call(shlex.split("coverageBed -h"), @@ -118,10 +70,9 @@ stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: - print("Error: Bowtie or BEDTools not loaded") + print("Error: Bowtie or bedtools not loaded") raise -############################################################################## # define a csv reader that reads space deliminated files print('Preparing for 
analysis using RepEnrich...') csv.field_size_limit(sys.maxsize) @@ -134,142 +85,144 @@ yield line -############################################################################## -# build dictionaries to convert repclass and rep families' -if is_bed == "FALSE": - repeatclass = {} - repeatfamily = {} - fin = import_text(annotation_file, ' ') - x = 0 - for line in fin: - if x > 2: - classfamily = [] - classfamily = line[10].split(os.path.sep) - line9 = line[9].replace("(", "_").replace( - ")", "_").replace("/", "_") - repeatclass[line9] = classfamily[0] - if len(classfamily) == 2: - repeatfamily[line9] = classfamily[1] - else: - repeatfamily[line9] = classfamily[0] - x += 1 -if is_bed == "TRUE": - repeatclass = {} - repeatfamily = {} - fin = open(annotation_file, 'r') - for line in fin: - line = line.strip('\n') - line = line.split('\t') - theclass = line[4] - thefamily = line[5] - line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") - repeatclass[line3] = theclass - repeatfamily[line3] = thefamily -fin.close() +# build dictionaries to convert repclass and rep families +repeatclass, repeatfamily = {}, {} +repeats = import_text(annotation_file, ' ') +# skip three first lines of the iterator +for line in range(3): + next(repeats) +for repeat in repeats: + classfamily = [] + classfamily = repeat[10].split('/') + matching_repeat = repeat[9].translate(str.maketrans('()/', '___')) + repeatclass[matching_repeat] = classfamily[0] + if len(classfamily) == 2: + repeatfamily[matching_repeat] = classfamily[1] + else: + repeatfamily[matching_repeat] = classfamily[0] -############################################################################## # build list of repeats initializing dictionaries for downstream analysis' -fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') -repeat_key = {} -rev_repeat_key = {} -repeat_list = [] -reptotalcounts = {} -classfractionalcounts = {} -familyfractionalcounts = {} -classtotalcounts = {} 
-familytotalcounts = {} -reptotalcounts_simple = {} -fractionalcounts = {} -i = 0 -for line in fin: - reptotalcounts[line[0]] = 0 - fractionalcounts[line[0]] = 0 - if line[0] in repeatclass: - classtotalcounts[repeatclass[line[0]]] = 0 - classfractionalcounts[repeatclass[line[0]]] = 0 - if line[0] in repeatfamily: - familytotalcounts[repeatfamily[line[0]]] = 0 - familyfractionalcounts[repeatfamily[line[0]]] = 0 - if line[0] in repeatfamily: - if repeatfamily[line[0]] == simple_repeat: - reptotalcounts_simple[simple_repeat] = 0 - else: - reptotalcounts_simple[line[0]] = 0 - repeat_list.append(line[0]) - repeat_key[line[0]] = int(line[1]) - rev_repeat_key[int(line[1])] = line[0] -fin.close() -############################################################################## +repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt') +reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} +fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} +classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatclass} +classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatclass} +familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatfamily} +familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text( + repgenome_path, '\t') if line[0] in repeatfamily} +reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and + repeatfamily[line[0]] == simple_repeat else + line[0]): 0 for line in import_text( + repgenome_path, '\t')} +repeat_key = {line[0]: int(line[1]) for line in import_text( + repgenome_path, '\t')} +rev_repeat_key = {int(line[1]): line[0] for line in import_text( + repgenome_path, '\t')} +repeat_list = [line[0] for line in import_text(repgenome_path, '\t')] + # map the repeats to the psuedogenomes: if not 
os.path.exists(outputfolder): os.mkdir(outputfolder) -############################################################################## + # Conduct the regions sorting -print('Conducting region sorting on unique mapping reads....') -fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' +fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt") +command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \ + {os.path.join(setup_folder, 'repnames.bed')}") with open(fileout, 'w') as stdout: - command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " + - setup_folder + os.path.sep + 'repnames.bed') - p = subprocess.Popen(command, stdout=stdout) -p.communicate() -stdout.close() -filein = open(outputfolder + os.path.sep + outputfile_prefix - + '_regionsorter.txt', 'r') + subprocess.run(command, stdout=stdout, check=True) + counts = {} sumofrepeatreads = 0 -for line in filein: - line = line.split('\t') - if not str(repeat_key[line[3]]) in counts: - counts[str(repeat_key[line[3]])] = 0 - counts[str(repeat_key[line[3]])] += int(line[4]) - sumofrepeatreads += int(line[4]) -print('Identified ' + str(sumofrepeatreads) + - 'unique reads that mapped to repeats.') -############################################################################## -if paired_end == 'TRUE': - if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') - if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') - folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' - folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' -############################################################################## - print("Processing repeat psuedogenomes...") - ps = [] - psb = [] +with open(fileout) as filein: + for line in filein: + line = line.split('\t') + if not str(repeat_key[line[3]]) in counts: + 
counts[str(repeat_key[line[3]])] = 0 + counts[str(repeat_key[line[3]])] += int(line[4]) + sumofrepeatreads += int(line[4]) + print(f"Identified {sumofrepeatreads} unique reads that \ + mapped to repeats.") + + +def run_bowtie(metagenome, fastqfile, folder): + metagenomepath = os.path.join(setup_folder, metagenome) + output_file = os.path.join(folder, f"{metagenome}.bowtie") + command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}") + with open(output_file, 'w') as stdout: + return subprocess.Popen(command, stdout=stdout) + + +if paired_end == 'FALSE': + folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') + os.makedirs(folder_pair1, exist_ok=True) + + print("Processing repeat pseudogenomes...") + processes = [] ticker = 0 + for metagenome in repeat_list: - metagenomepath = setup_folder + os.path.sep + metagenome - file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' - file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' - with open(file1, 'w') as stdout: - command = shlex.split("bowtie " + b_opt + " " + - metagenomepath + " " + fastqfile_1) - p = subprocess.Popen(command, stdout=stdout) - with open(file2, 'w') as stdout: - command = shlex.split("bowtie " + b_opt + " " + - metagenomepath + " " + fastqfile_2) - pp = subprocess.Popen(command, stdout=stdout) - ps.append(p) - ticker += 1 - psb.append(pp) + processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) ticker += 1 if ticker == cpus: + for p in processes: + p.communicate() + ticker = 0 + processes = [] + + for p in processes: + p.communicate() + # Combine the output from both read pairs: + print('Sorting and combining the output for both read pairs....') + sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie') + os.makedirs(sorted_bowtie, exist_ok=True) + for metagenome in repeat_list: + file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie") + fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie") + with open(fileout, 'w') as stdout: + p1 = 
subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) + p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, + stdout=subprocess.PIPE) + p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, + stdout=subprocess.PIPE) + p4 = subprocess.Popen(['sort'], stdin=p3.stdout, + stdout=subprocess.PIPE) + p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) + p5.communicate() + stdout.close() + print('completed ...') +else: + folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') + folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie') + os.makedirs(folder_pair1, exist_ok=True) + os.makedirs(folder_pair2, exist_ok=True) + + print("Processing repeat pseudogenomes...") + ps, psb, ticker = [], [], 0 + + for metagenome in repeat_list: + ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) + ticker += 1 + if fastqfile_2 != 'none': + psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2)) + ticker += 1 + if ticker >= cpus: for p in ps: p.communicate() for p in psb: p.communicate() ticker = 0 + ps = [] psb = [] - ps = [] - if len(ps) > 0: - for p in ps: - p.communicate() - stdout.close() -############################################################################## -# combine the output from both read pairs: - print('sorting and combining the output for both read pairs...') + for p in ps: + p.communicate() + for p in psb: + p.communicate() + # combine the output from both read pairs: + print('Sorting and combining the output for both read pairs...') if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' @@ -290,108 +243,43 @@ p5.communicate() stdout.close() print('completed ...') -############################################################################## -if paired_end == 'FALSE': - if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') - 
folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' -############################################################################## - ps = [] - ticker = 0 - print("Processing repeat psuedogenomes...") - for metagenome in repeat_list: - metagenomepath = setup_folder + os.path.sep + metagenome - file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' - with open(file1, 'w') as stdout: - command = shlex.split("bowtie " + b_opt + " " + - metagenomepath + " " + fastqfile_1) - p = subprocess.Popen(command, stdout=stdout) - ps.append(p) - ticker += 1 - if ticker == cpus: - for p in ps: - p.communicate() - ticker = 0 - ps = [] - if len(ps) > 0: - for p in ps: - p.communicate() - stdout.close() -############################################################################## -# combine the output from both read pairs: - print('Sorting and combining the output for both read pairs....') - if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): - os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') - sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' - for metagenome in repeat_list: - file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' - fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' - with open(fileout, 'w') as stdout: - p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, - stdout=subprocess.PIPE) - p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, - stdout=subprocess.PIPE) - p4 = subprocess.Popen(['sort'], stdin=p3.stdout, - stdout=subprocess.PIPE) - p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) - p5.communicate() - stdout.close() - print('completed ...') - -############################################################################## # build a file of repeat keys for all reads print('Writing and processing intermediate files...') -sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' +sorted_bowtie = 
os.path.join(outputfolder, 'sorted_bowtie') +sumofrepeatreads = 0 readid = {} -sumofrepeatreads = 0 + for rep in repeat_list: - for data in import_text(sorted_bowtie + os.path.sep + - rep + '.bowtie', '\t'): + for data in import_text( + f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): readid[data[0]] = '' + for rep in repeat_list: - for data in import_text(sorted_bowtie + os.path.sep - + rep + '.bowtie', '\t'): - readid[data[0]] += str(repeat_key[rep]) + str(',') + for data in import_text( + f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): + readid[data[0]] += f"{repeat_key[rep]}," + for subfamilies in readid.values(): if subfamilies not in counts: counts[subfamilies] = 0 counts[subfamilies] += 1 sumofrepeatreads += 1 -del readid -print('Identified ' + str(sumofrepeatreads) + - ' reads that mapped to repeats for unique and multimappers.') -############################################################################## +print(f'Identified {sumofrepeatreads} reads that mapped to \ + repeats for unique and multimappers.') print("Conducting final calculations...") - -def convert(x): - ''' - build a converter to numeric label for repeat and yield a combined list - of repnames seperated by backslash - ''' - x = x.strip(',') - x = x.split(',') - global repname - repname = "" - for i in x: - repname = repname + os.path.sep + rev_repeat_key[int(i)] - - # building the total counts for repeat element enrichment... for x in counts.keys(): count = counts[x] - x = x.strip(',') - x = x.split(',') + x = x.strip(',').split(',') for i in x: reptotalcounts[rev_repeat_key[int(i)]] += int(count) # building the fractional counts for repeat element enrichment... 
for x in counts.keys(): count = counts[x] - x = x.strip(',') - x = x.split(',') + x = x.strip(',') .split(',') splits = len(x) for i in x: fractionalcounts[rev_repeat_key[int(i)]] += float( @@ -400,7 +288,10 @@ repcounts = {} repcounts['other'] = 0 for key in counts.keys(): - convert(key) + key_list = key.strip(',').split(',') + repname = '' + for i in key_list: + repname = os.path.join(repname, rev_repeat_key[int(i)]) repcounts[repname] = counts[key] # building the total counts for class enrichment... for key in reptotalcounts.keys(): @@ -408,7 +299,7 @@ # building total counts for family enrichment... for key in reptotalcounts.keys(): familytotalcounts[repeatfamily[key]] += reptotalcounts[key] -# building unique counts table' +# building unique counts table repcounts2 = {} for rep in repeat_list: if "/" + rep in repcounts: @@ -422,78 +313,20 @@ for key in fractionalcounts.keys(): familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] -############################################################################## -print('Writing final output and removing intermediate files...') # print output to file of the categorized counts and total overlapping counts: -if allcountmethod == "TRUE": - fout1 = open(outputfolder + os.path.sep + outputfile_prefix - + '_total_counts.txt', 'w') - for key in sorted(reptotalcounts.keys()): - fout1.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(reptotalcounts[key]) - + '\n') - fout2 = open(outputfolder + os.path.sep + outputfile_prefix - + '_class_total_counts.txt', 'w') - for key in sorted(classtotalcounts.keys()): - fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n') - fout3 = open(outputfolder + os.path.sep + outputfile_prefix - + '_family_total_counts.txt', 'w') - for key in sorted(familytotalcounts.keys()): - fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n') - fout4 = open(outputfolder + os.path.sep + outputfile_prefix + - '_unique_counts.txt', 'w') - for 
key in sorted(repcounts2.keys()): - fout4.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n') - fout5 = open(outputfolder + os.path.sep + outputfile_prefix - + '_class_fraction_counts.txt', 'w') +print('Writing final output...') +with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" + f"class_fraction_counts.txt", 'w') as fout: for key in sorted(classfractionalcounts.keys()): - fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') - fout6 = open(outputfolder + os.path.sep + outputfile_prefix + - '_family_fraction_counts.txt', 'w') + fout.write(f"{key}\t{classfractionalcounts[key]}\n") + +with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" + f"family_fraction_counts.txt", 'w') as fout: for key in sorted(familyfractionalcounts.keys()): - fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') - fout7 = open(outputfolder + os.path.sep + outputfile_prefix - + '_fraction_counts.txt', 'w') + fout.write(f"{key}\t{familyfractionalcounts[key]}\n") + +with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" + f"fraction_counts.txt", 'w') as fout: for key in sorted(fractionalcounts.keys()): - fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) - + '\n') - fout1.close() - fout2.close() - fout3.close() - fout4.close() - fout5.close() - fout6.close() - fout7.close() -else: - fout1 = open(outputfolder + os.path.sep + outputfile_prefix + - '_class_fraction_counts.txt', 'w') - for key in sorted(classfractionalcounts.keys()): - fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') - fout2 = open(outputfolder + os.path.sep + outputfile_prefix + - '_family_fraction_counts.txt', 'w') - for key in sorted(familyfractionalcounts.keys()): - fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') - fout3 = open(outputfolder + os.path.sep + outputfile_prefix + - '_fraction_counts.txt', 
'w') - for key in sorted(fractionalcounts.keys()): - fout3.write(str(key) + '\t' + repeatclass[key] + '\t' + - repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) - + '\n') - fout1.close() - fout2.close() - fout3.close() -############################################################################## -# Remove Large intermediate files -if os.path.exists(outputfolder + os.path.sep + outputfile_prefix + - '_regionsorter.txt'): - os.remove(outputfolder + os.path.sep + outputfile_prefix + - '_regionsorter.txt') -if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): - shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie') -if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): - shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie') -if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): - shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie') -print("... Done") + fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t" + f"{int(fractionalcounts[key])}\n")