repenrich: RepEnrich.py comparison

comparison RepEnrich.py @ 12:89e05f831259 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df

author	artbio
date	Mon, 18 Mar 2024 09:39:44 +0000
parents	6f4143893463
children	530626b0757c

comparison

equal deleted inserted replaced

-:6bba3e33c2e7
+:89e05f831259
 import argparse
 import csv
 import os
 import shlex
-import shutil
 import subprocess
 import sys
 import numpy
 parser = argparse.ArgumentParser(description='''
-Part II: Conducting the alignments to the psuedogenomes. Before\
+Repenrich aligns reads to Repeat Elements pseudogenomes\
-doing this step you will require 1) a bamfile of the unique\
+and counts aligned reads. RepEnrich_setup must be run\
-alignments with index 2) a fastq file of the reads mapping to\
+before its use''')
-more than one location. These files can be obtained using the\
+parser.add_argument('--annotation_file', action='store',
-following bowtie options [EXAMPLE: bowtie -S -m 1\
---max multimap.fastq mm9 mate1_reads.fastq]  Once you have the\
-unique alignment bamfile and the reads mapping to more than one\
-location in a fastq file you can run this step.  EXAMPLE: python\
-master_output.py\
-/users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
-/users/nneretti/datasets/repeatmapping/POL3/Pol3_human/
-HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
-/users/nneretti/data/annotation/hg19/setup_folder\
-HeLa_InputChIPseq_Rep1_multimap.fastq\
-HeLa_InputChIPseq_Rep1.bam''')
-parser.add_argument('--version', action='version', version='%(prog)s 0.1')
-parser.add_argument('annotation_file', action='store',
 metavar='annotation_file',
-help='List RepeatMasker.org annotation file for your\
+help='RepeatMasker.org annotation file for your\
-organism.  The file may be downloaded from the\
+organism. The file may be downloaded from\
-RepeatMasker.org website. Example:\
+RepeatMasker.org. E.g. hg19_repeatmasker.txt')
-/data/annotation/hg19/hg19_repeatmasker.txt')
+parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
-parser.add_argument('outputfolder', action='store', metavar='outputfolder',
+help='Folder that will contain results. Should be the\
-help='List folder to contain results.\
+same as the one used for RepEnrich_setup.\
-Example: /outputfolder')
+Example: ./outputfolder')
-parser.add_argument('outputprefix', action='store', metavar='outputprefix',
+parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
-help='Enter prefix name for data.\
+help='Prefix name for Repenrich output files.')
-Example: HeLa_InputChIPseq_Rep1')
+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
-parser.add_argument('setup_folder', action='store', metavar='setup_folder',
+help='Folder produced by RepEnrich_setup which contains\
-help='List folder that contains the repeat element\
+repeat element pseudogenomes.')
-pseudogenomes.\
+parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
-Example: /data/annotation/hg19/setup_folder')
+help='File of fastq reads mapping to multiple\
-parser.add_argument('fastqfile', action='store', metavar='fastqfile',
-help='Enter file for the fastq reads that map to multiple\
 locations. Example: /data/multimap.fastq')
-parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
-help='Enter bamfile output for reads that map uniquely.\
+help='Bam alignments of unique mapper reads.')
-Example /bamfiles/old.bam')
 parser.add_argument('--pairedend', action='store', dest='pairedend',
 default='FALSE',
-help='Designate this option for paired-end sequencing.\
+help='Change to TRUE for paired-end fastq files.\
-Default FALSE change to TRUE')
+Default FALSE')
-parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
-metavar='collapserepeat', default='Simple_repeat',
-help='Designate this option to generate a collapsed repeat\
-type. Uncollapsed output is generated in addition to\
-collapsed repeat type. Simple_repeat is default to\
-simplify downstream analysis. You can change the\
-default to another repeat name to collapse a\
-seperate specific repeat instead or if the name of\
-Simple_repeat is different for your organism.\
-Default Simple_repeat')
 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
 metavar='fastqfile2', default='none',
-help='Enter fastqfile2 when using paired-end option.\
+help='fastqfile #2 when using paired-end option.\
 Default none')
 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
 default="1", type=int,
-help='Enter available cpus per node.  The more cpus the\
+help='Number of CPUs. The more cpus the\
-faster RepEnrich performs. RepEnrich is designed to\
+faster RepEnrich performs. Default: "1"')
-only work on one node. Default: "1"')
-parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
-metavar='allcountmethod', default="FALSE",
-help='By default the pipeline only outputs the fraction\
-count method.  Consdidered to be the best way to\
-count multimapped reads. Changing this option will\
-include the unique count method, a conservative\
-count, and the total count method, a liberal\
-counting strategy. Our evaluation of simulated data\
-indicated fraction counting is best.\
-Default = FALSE, change to TRUE')
-parser.add_argument('--is_bed', action='store', dest='is_bed',
-metavar='is_bed', default='FALSE',
-help='Is the annotation file a bed file.\
-This is also a compatible format. The file needs to\
-be a tab seperated bed with optional fields.\
-Ex. format: chr\tstart\tend\tName_element\tclass\
-\tfamily. The class and family should identical to\
-name_element if not applicable.\
-Default FALSE change to TRUE')
 args = parser.parse_args()
 # parameters
 annotation_file = args.annotation_file
 outputfolder = args.outputfolder
 outputfile_prefix = args.outputprefix
 setup_folder = args.setup_folder
-repeat_bed = setup_folder + os.path.sep + 'repnames.bed'
+repeat_bed = os.path.join(setup_folder, 'repnames.bed')
 unique_mapper_bam = args.alignment_bam
 fastqfile_1 = args.fastqfile
 fastqfile_2 = args.fastqfile2
 cpus = args.cpus
-b_opt = "-k1 -p " + str(1) + " --quiet"
+b_opt = "-k1 -p 1 --quiet"
-simple_repeat = args.collapserepeat
+# Change if simple repeats are differently annotated in your organism
+simple_repeat = "Simple_repeat"
 paired_end = args.pairedend
-allcountmethod = args.allcountmethod
-is_bed = args.is_bed
-##############################################################################
 # check that the programs we need are available
 try:
 subprocess.call(shlex.split("coverageBed -h"),
 stdout=open(os.devnull, 'wb'),
 stderr=open(os.devnull, 'wb'))
 subprocess.call(shlex.split("bowtie --version"),
 stdout=open(os.devnull, 'wb'),
 stderr=open(os.devnull, 'wb'))
 except OSError:
-print("Error: Bowtie or BEDTools not loaded")
+print("Error: Bowtie or bedtools not loaded")
 raise
-##############################################################################
 # define a csv reader that reads space deliminated files
 print('Preparing for analysis using RepEnrich...')
 csv.field_size_limit(sys.maxsize)
 skipinitialspace=True):
 if line:
 yield line
-##############################################################################
+# build dictionaries to convert repclass and rep families
-# build dictionaries to convert repclass and rep families'
+repeatclass, repeatfamily = {}, {}
-if is_bed == "FALSE":
+repeats = import_text(annotation_file, ' ')
-repeatclass = {}
+# skip three first lines of the iterator
-repeatfamily = {}
+for line in range(3):
-fin = import_text(annotation_file, ' ')
+next(repeats)
-x = 0
+for repeat in repeats:
-for line in fin:
+classfamily = []
-if x > 2:
+classfamily = repeat[10].split('/')
-classfamily = []
+matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
-classfamily = line[10].split(os.path.sep)
+repeatclass[matching_repeat] = classfamily[0]
-line9 = line[9].replace("(", "_").replace(
+if len(classfamily) == 2:
-")", "_").replace("/", "_")
+repeatfamily[matching_repeat] = classfamily[1]
-repeatclass[line9] = classfamily[0]
+else:
-if len(classfamily) == 2:
+repeatfamily[matching_repeat] = classfamily[0]
-repeatfamily[line9] = classfamily[1]
-else:
-repeatfamily[line9] = classfamily[0]
-x += 1
-if is_bed == "TRUE":
-repeatclass = {}
-repeatfamily = {}
-fin = open(annotation_file, 'r')
-for line in fin:
-line = line.strip('\n')
-line = line.split('\t')
-theclass = line[4]
-thefamily = line[5]
-line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
-repeatclass[line3] = theclass
-repeatfamily[line3] = thefamily
-fin.close()
-##############################################################################
 # build list of repeats initializing dictionaries for downstream analysis'
-fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t')
+repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
-repeat_key = {}
+reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-rev_repeat_key = {}
+fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-repeat_list = []
+classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-reptotalcounts = {}
+repgenome_path, '\t') if line[0] in repeatclass}
-classfractionalcounts = {}
+classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-familyfractionalcounts = {}
+repgenome_path, '\t') if line[0] in repeatclass}
-classtotalcounts = {}
+familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-familytotalcounts = {}
+repgenome_path, '\t') if line[0] in repeatfamily}
-reptotalcounts_simple = {}
+familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-fractionalcounts = {}
+repgenome_path, '\t') if line[0] in repeatfamily}
-i = 0
+reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
-for line in fin:
+repeatfamily[line[0]] == simple_repeat else
-reptotalcounts[line[0]] = 0
+line[0]): 0 for line in import_text(
-fractionalcounts[line[0]] = 0
+repgenome_path, '\t')}
-if line[0] in repeatclass:
+repeat_key = {line[0]: int(line[1]) for line in import_text(
-classtotalcounts[repeatclass[line[0]]] = 0
+repgenome_path, '\t')}
-classfractionalcounts[repeatclass[line[0]]] = 0
+rev_repeat_key = {int(line[1]): line[0] for line in import_text(
-if line[0] in repeatfamily:
+repgenome_path, '\t')}
-familytotalcounts[repeatfamily[line[0]]] = 0
+repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
-familyfractionalcounts[repeatfamily[line[0]]] = 0
-if line[0] in repeatfamily:
-if repeatfamily[line[0]] == simple_repeat:
-reptotalcounts_simple[simple_repeat] = 0
-else:
-reptotalcounts_simple[line[0]] = 0
-repeat_list.append(line[0])
-repeat_key[line[0]] = int(line[1])
-rev_repeat_key[int(line[1])] = line[0]
-fin.close()
-##############################################################################
 # map the repeats to the psuedogenomes:
 if not os.path.exists(outputfolder):
 os.mkdir(outputfolder)
-##############################################################################
 # Conduct the regions sorting
-print('Conducting region sorting on unique mapping reads....')
+fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
-fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'
+command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
+{os.path.join(setup_folder, 'repnames.bed')}")
 with open(fileout, 'w') as stdout:
-command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " +
+subprocess.run(command, stdout=stdout, check=True)
-setup_folder + os.path.sep + 'repnames.bed')
-p = subprocess.Popen(command, stdout=stdout)
-p.communicate()
-stdout.close()
-filein = open(outputfolder + os.path.sep + outputfile_prefix
-+ '_regionsorter.txt', 'r')
 counts = {}
 sumofrepeatreads = 0
-for line in filein:
+with open(fileout) as filein:
-line = line.split('\t')
+for line in filein:
-if not str(repeat_key[line[3]]) in counts:
+line = line.split('\t')
-counts[str(repeat_key[line[3]])] = 0
+if not str(repeat_key[line[3]]) in counts:
-counts[str(repeat_key[line[3]])] += int(line[4])
+counts[str(repeat_key[line[3]])] = 0
-sumofrepeatreads += int(line[4])
+counts[str(repeat_key[line[3]])] += int(line[4])
-print('Identified ' + str(sumofrepeatreads) +
+sumofrepeatreads += int(line[4])
-'unique reads that mapped to repeats.')
+print(f"Identified {sumofrepeatreads} unique reads that \
-##############################################################################
+mapped to repeats.")
-if paired_end == 'TRUE':
-if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
-os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
+def run_bowtie(metagenome, fastqfile, folder):
-if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
+metagenomepath = os.path.join(setup_folder, metagenome)
-os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie')
+output_file = os.path.join(folder, f"{metagenome}.bowtie")
-folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
+command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
-folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie'
+with open(output_file, 'w') as stdout:
-##############################################################################
+return subprocess.Popen(command, stdout=stdout)
-print("Processing repeat psuedogenomes...")
-ps = []
-psb = []
+if paired_end == 'FALSE':
+folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
+os.makedirs(folder_pair1, exist_ok=True)
+print("Processing repeat pseudogenomes...")
+processes = []
 ticker = 0
 for metagenome in repeat_list:
-metagenomepath = setup_folder + os.path.sep + metagenome
+processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
-with open(file1, 'w') as stdout:
-command = shlex.split("bowtie " + b_opt + " " +
-metagenomepath + " " + fastqfile_1)
-p = subprocess.Popen(command, stdout=stdout)
-with open(file2, 'w') as stdout:
-command = shlex.split("bowtie " + b_opt + " " +
-metagenomepath + " " + fastqfile_2)
-pp = subprocess.Popen(command, stdout=stdout)
-ps.append(p)
-ticker += 1
-psb.append(pp)
 ticker += 1
 if ticker == cpus:
+for p in processes:
+p.communicate()
+ticker = 0
+processes = []
+for p in processes:
+p.communicate()
+# Combine the output from both read pairs:
+print('Sorting and combining the output for both read pairs....')
+sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
+os.makedirs(sorted_bowtie, exist_ok=True)
+for metagenome in repeat_list:
+file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
+fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
+with open(fileout, 'w') as stdout:
+p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
+p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
+stdout=subprocess.PIPE)
+p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
+stdout=subprocess.PIPE)
+p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
+stdout=subprocess.PIPE)
+p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
+p5.communicate()
+stdout.close()
+print('completed ...')
+else:
+folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
+folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
+os.makedirs(folder_pair1, exist_ok=True)
+os.makedirs(folder_pair2, exist_ok=True)
+print("Processing repeat pseudogenomes...")
+ps, psb, ticker = [], [], 0
+for metagenome in repeat_list:
+ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
+ticker += 1
+if fastqfile_2 != 'none':
+psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
+ticker += 1
+if ticker >= cpus:
 for p in ps:
 p.communicate()
 for p in psb:
 p.communicate()
 ticker = 0
+ps = []
 psb = []
-ps = []
-if len(ps) > 0:
+for p in ps:
-for p in ps:
+p.communicate()
-p.communicate()
+for p in psb:
-stdout.close()
+p.communicate()
+# combine the output from both read pairs:
-##############################################################################
+print('Sorting and combining the output for both read pairs...')
-# combine the output from both read pairs:
-print('sorting and combining the output for both read pairs...')
 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
 for metagenome in repeat_list:
 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
 stdout=subprocess.PIPE)
 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
 p5.communicate()
 stdout.close()
 print('completed ...')
-##############################################################################
-if paired_end == 'FALSE':
-if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
-os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
-folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
-##############################################################################
-ps = []
-ticker = 0
-print("Processing repeat psuedogenomes...")
-for metagenome in repeat_list:
-metagenomepath = setup_folder + os.path.sep + metagenome
-file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-with open(file1, 'w') as stdout:
-command = shlex.split("bowtie " + b_opt + " " +
-metagenomepath + " " + fastqfile_1)
-p = subprocess.Popen(command, stdout=stdout)
-ps.append(p)
-ticker += 1
-if ticker == cpus:
-for p in ps:
-p.communicate()
-ticker = 0
-ps = []
-if len(ps) > 0:
-for p in ps:
-p.communicate()
-stdout.close()
-##############################################################################
-# combine the output from both read pairs:
-print('Sorting and combining the output for both read pairs....')
-if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
-sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
-for metagenome in repeat_list:
-file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
-with open(fileout, 'w') as stdout:
-p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
-p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
-stdout=subprocess.PIPE)
-p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-stdout=subprocess.PIPE)
-p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-stdout=subprocess.PIPE)
-p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-p5.communicate()
-stdout.close()
-print('completed ...')
-##############################################################################
 # build a file of repeat keys for all reads
 print('Writing and processing intermediate files...')
-sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
+sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
+sumofrepeatreads = 0
 readid = {}
-sumofrepeatreads = 0
 for rep in repeat_list:
-for data in import_text(sorted_bowtie + os.path.sep +
+for data in import_text(
-rep + '.bowtie', '\t'):
+f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
 readid[data[0]] = ''
 for rep in repeat_list:
-for data in import_text(sorted_bowtie + os.path.sep
+for data in import_text(
-+ rep + '.bowtie', '\t'):
+f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-readid[data[0]] += str(repeat_key[rep]) + str(',')
+readid[data[0]] += f"{repeat_key[rep]},"
 for subfamilies in readid.values():
 if subfamilies not in counts:
 counts[subfamilies] = 0
 counts[subfamilies] += 1
 sumofrepeatreads += 1
-del readid
-print('Identified ' + str(sumofrepeatreads) +
+print(f'Identified {sumofrepeatreads} reads that mapped to \
-' reads that mapped to repeats for unique and multimappers.')
+repeats for unique and multimappers.')
-##############################################################################
 print("Conducting final calculations...")
-def convert(x):
-'''
-build a converter to numeric label for repeat and yield a combined list
-of repnames seperated by backslash
-'''
-x = x.strip(',')
-x = x.split(',')
-global repname
-repname = ""
-for i in x:
-repname = repname + os.path.sep + rev_repeat_key[int(i)]
 # building the total counts for repeat element enrichment...
 for x in counts.keys():
 count = counts[x]
-x = x.strip(',')
+x = x.strip(',').split(',')
-x = x.split(',')
 for i in x:
 reptotalcounts[rev_repeat_key[int(i)]] += int(count)
 # building the fractional counts for repeat element enrichment...
 for x in counts.keys():
 count = counts[x]
-x = x.strip(',')
+x = x.strip(',')    .split(',')
-x = x.split(',')
 splits = len(x)
 for i in x:
 fractionalcounts[rev_repeat_key[int(i)]] += float(
 numpy.divide(float(count), float(splits)))
 # building categorized table of repeat element enrichment...
 repcounts = {}
 repcounts['other'] = 0
 for key in counts.keys():
-convert(key)
+key_list = key.strip(',').split(',')
+repname = ''
+for i in key_list:
+repname = os.path.join(repname, rev_repeat_key[int(i)])
 repcounts[repname] = counts[key]
 # building the total counts for class enrichment...
 for key in reptotalcounts.keys():
 classtotalcounts[repeatclass[key]] += reptotalcounts[key]
 # building total counts for family enrichment...
 for key in reptotalcounts.keys():
 familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
-# building unique counts table'
+# building unique counts table
 repcounts2 = {}
 for rep in repeat_list:
 if "/" + rep in repcounts:
 repcounts2[rep] = repcounts["/" + rep]
 else:
 classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
 # building fractional counts for family enrichment...
 for key in fractionalcounts.keys():
 familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
-##############################################################################
-print('Writing final output and removing intermediate files...')
 # print output to file of the categorized counts and total overlapping counts:
-if allcountmethod == "TRUE":
+print('Writing final output...')
-fout1 = open(outputfolder + os.path.sep + outputfile_prefix
+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-+ '_total_counts.txt', 'w')
+f"class_fraction_counts.txt", 'w') as fout:
-for key in sorted(reptotalcounts.keys()):
-fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
-repeatfamily[key] + '\t' + str(reptotalcounts[key])
-+ '\n')
-fout2 = open(outputfolder + os.path.sep + outputfile_prefix
-+ '_class_total_counts.txt', 'w')
-for key in sorted(classtotalcounts.keys()):
-fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
-fout3 = open(outputfolder + os.path.sep + outputfile_prefix
-+ '_family_total_counts.txt', 'w')
-for key in sorted(familytotalcounts.keys()):
-fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
-fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
-'_unique_counts.txt', 'w')
-for key in sorted(repcounts2.keys()):
-fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
-repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
-fout5 = open(outputfolder + os.path.sep + outputfile_prefix
-+ '_class_fraction_counts.txt', 'w')
 for key in sorted(classfractionalcounts.keys()):
-fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
+fout.write(f"{key}\t{classfractionalcounts[key]}\n")
-fout6 = open(outputfolder + os.path.sep + outputfile_prefix +
-'_family_fraction_counts.txt', 'w')
+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
+f"family_fraction_counts.txt", 'w') as fout:
 for key in sorted(familyfractionalcounts.keys()):
-fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
+fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
-fout7 = open(outputfolder + os.path.sep + outputfile_prefix
-+ '_fraction_counts.txt', 'w')
+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
+f"fraction_counts.txt", 'w') as fout:
 for key in sorted(fractionalcounts.keys()):
-fout7.write(str(key) + '\t' + repeatclass[key] + '\t' +
+fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
-repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
+f"{int(fractionalcounts[key])}\n")
-+ '\n')
-fout1.close()
-fout2.close()
-fout3.close()
-fout4.close()
-fout5.close()
-fout6.close()
-fout7.close()
-else:
-fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
-'_class_fraction_counts.txt', 'w')
-for key in sorted(classfractionalcounts.keys()):
-fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
-fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
-'_family_fraction_counts.txt', 'w')
-for key in sorted(familyfractionalcounts.keys()):
-fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
-fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
-'_fraction_counts.txt', 'w')
-for key in sorted(fractionalcounts.keys()):
-fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
-repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
-+ '\n')
-fout1.close()
-fout2.close()
-fout3.close()
-##############################################################################
-#  Remove Large intermediate files
-if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
-'_regionsorter.txt'):
-os.remove(outputfolder + os.path.sep + outputfile_prefix +
-'_regionsorter.txt')
-if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
-shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
-if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
-shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
-if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
-print("... Done")

Mercurial > repos > artbio > repenrich

comparison RepEnrich.py @ 12:89e05f831259 draft