diff RepEnrich.py @ 13:530626b0757c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
| author | artbio |
|---|---|
| date | Tue, 02 Apr 2024 21:16:37 +0000 |
| parents | 89e05f831259 |
| children | bf866bedd4b4 |
```diff
--- a/RepEnrich.py	Mon Mar 18 09:39:44 2024 +0000
+++ b/RepEnrich.py	Tue Apr 02 21:16:37 2024 +0000
@@ -4,8 +4,8 @@
 import shlex
 import subprocess
 import sys
-
-import numpy
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
 
 
 parser = argparse.ArgumentParser(description='''
@@ -17,26 +17,13 @@
                     help='RepeatMasker.org annotation file for your\
                           organism. The file may be downloaded from\
                           RepeatMasker.org. E.g. hg19_repeatmasker.txt')
-parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
-                    help='Folder that will contain results. Should be the\
-                          same as the one used for RepEnrich_setup.\
-                          Example: ./outputfolder')
-parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
-                    help='Prefix name for Repenrich output files.')
-parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
-                    help='Folder produced by RepEnrich_setup which contains\
-                          repeat element pseudogenomes.')
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
+                    help='Bam alignments of unique mapper reads.')
 parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
                     help='File of fastq reads mapping to multiple\
                           locations. Example: /data/multimap.fastq')
-parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
-                    help='Bam alignments of unique mapper reads.')
-parser.add_argument('--pairedend', action='store', dest='pairedend',
-                    default='FALSE',
-                    help='Change to TRUE for paired-end fastq files.\
-                          Default FALSE')
 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
-                    metavar='fastqfile2', default='none',
+                    metavar='fastqfile2', default='',
                     help='fastqfile #2 when using paired-end option.\
                           Default none')
 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
@@ -48,18 +35,16 @@
 
 # parameters
 annotation_file = args.annotation_file
-outputfolder = args.outputfolder
-outputfile_prefix = args.outputprefix
-setup_folder = args.setup_folder
-repeat_bed = os.path.join(setup_folder, 'repnames.bed')
 unique_mapper_bam = args.alignment_bam
 fastqfile_1 = args.fastqfile
 fastqfile_2 = args.fastqfile2
 cpus = args.cpus
-b_opt = "-k1 -p 1 --quiet"
 # Change if simple repeats are differently annotated in your organism
 simple_repeat = "Simple_repeat"
-paired_end = args.pairedend
+if args.fastqfile2:
+    paired_end = True
+else:
+    paired_end = False
 
 # check that the programs we need are available
 try:
@@ -73,260 +58,147 @@
     print("Error: Bowtie or bedtools not loaded")
     raise
 
-# define a csv reader that reads space deliminated files
-print('Preparing for analysis using RepEnrich...')
-csv.field_size_limit(sys.maxsize)
-
 
-def import_text(filename, separator):
-    for line in csv.reader(open(filename), delimiter=separator,
-                           skipinitialspace=True):
-        if line:
-            yield line
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
 
 
-# build dictionaries to convert repclass and rep families
-repeatclass, repeatfamily = {}, {}
-repeats = import_text(annotation_file, ' ')
-# skip three first lines of the iterator
-for line in range(3):
-    next(repeats)
-for repeat in repeats:
-    classfamily = []
-    classfamily = repeat[10].split('/')
-    matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
-    repeatclass[matching_repeat] = classfamily[0]
-    if len(classfamily) == 2:
-        repeatfamily[matching_repeat] = classfamily[1]
-    else:
-        repeatfamily[matching_repeat] = classfamily[0]
-
-# build list of repeats initializing dictionaries for downstream analysis'
-repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
-reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
-classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
-familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
-                          repeatfamily[line[0]] == simple_repeat else
-                          line[0]): 0 for line in import_text(
-                              repgenome_path, '\t')}
-repeat_key = {line[0]: int(line[1]) for line in import_text(
-    repgenome_path, '\t')}
-rev_repeat_key = {int(line[1]): line[0] for line in import_text(
-    repgenome_path, '\t')}
-repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
-
-# map the repeats to the psuedogenomes:
-if not os.path.exists(outputfolder):
-    os.mkdir(outputfolder)
-
-# Conduct the regions sorting
-fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
-command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
-    {os.path.join(setup_folder, 'repnames.bed')}")
-with open(fileout, 'w') as stdout:
-    subprocess.run(command, stdout=stdout, check=True)
-
-counts = {}
-sumofrepeatreads = 0
-with open(fileout) as filein:
-    for line in filein:
-        line = line.split('\t')
-        if not str(repeat_key[line[3]]) in counts:
-            counts[str(repeat_key[line[3]])] = 0
-        counts[str(repeat_key[line[3]])] += int(line[4])
-        sumofrepeatreads += int(line[4])
-    print(f"Identified {sumofrepeatreads} unique reads that \
-          mapped to repeats.")
+# define a text importer for .out/.txt format of repbase
+def import_text(filename, separator):
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
 
 
-def run_bowtie(metagenome, fastqfile, folder):
-    metagenomepath = os.path.join(setup_folder, metagenome)
-    output_file = os.path.join(folder, f"{metagenome}.bowtie")
-    command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
-    with open(output_file, 'w') as stdout:
-        return subprocess.Popen(command, stdout=stdout)
+def run_bowtie(args):
+    metagenome, fastqfile = args
+    b_opt = "-k 1 -p 1 --quiet"
+    command = shlex.split(f"bowtie {b_opt} -x {metagenome} {fastqfile}")
+    bowtie_align = subprocess.run(command, check=True,
+                                  capture_output=True, text=True).stdout
+    bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
+    readlist = [metagenome]
+    if paired_end:
+        for line in bowtie_align:
+            readlist.append(line.split("/")[0])
+    else:
+        for line in bowtie_align:
+            readlist.append(line.split("\t")[0])
+    return readlist
 
 
-if paired_end == 'FALSE':
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-
-    print("Processing repeat pseudogenomes...")
-    processes = []
-    ticker = 0
-
-    for metagenome in repeat_list:
-        processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if ticker == cpus:
-            for p in processes:
-                p.communicate()
-            ticker = 0
-            processes = []
+# set a reference repeat list for the script
+repeat_list = [listline[9].translate(
+    str.maketrans(
+        '()/', '___')) for listline in import_text(annotation_file, ' ')]
+repeat_list = sorted(list(set(repeat_list)))
 
-    for p in processes:
-        p.communicate()
-    # Combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs....')
-    sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-    os.makedirs(sorted_bowtie, exist_ok=True)
-    for metagenome in repeat_list:
-        file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
-        fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
-else:
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-    os.makedirs(folder_pair2, exist_ok=True)
+# unique mapper counting
+cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
+    bedtools coverage -b stdin -a repnames.bed"
+p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')
 
-    print("Processing repeat pseudogenomes...")
-    ps, psb, ticker = [], [], 0
+# parse bedtools output
+counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
+sumofrepeatreads = 0
+for line in bedtools_counts:
+    line = line.split('\t')
+    counts[line[3]] += int(line[4])
+    sumofrepeatreads += int(line[4])
+print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")
 
-    for metagenome in repeat_list:
-        ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if fastqfile_2 != 'none':
-            psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
-            ticker += 1
-        if ticker >= cpus:
-            for p in ps:
-                p.communicate()
-            for p in psb:
-                p.communicate()
-            ticker = 0
-            ps = []
-            psb = []
+# multimapper parsing
+if not paired_end:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+else:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    args_list.extend([(metagenome, fastqfile_2) for
+                      metagenome in repeat_list])
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+
+# Aggregate results (avoiding race conditions)
+metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
+for result in results:
+    metagenome_reads[result[0]] += result[1:]
 
-    for p in ps:
-        p.communicate()
-    for p in psb:
-        p.communicate()
-    # combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs...')
-    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
-    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
-    for metagenome in repeat_list:
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
-        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1, file2],
-                                  stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
+for name in metagenome_reads:
+    # read are only once in list
+    metagenome_reads[name] = list(set(metagenome_reads[name]))
+    # remove "no read" instances
+    metagenome_reads[name] = [read for read in metagenome_reads[name]
+                              if read != ""]
 
-# build a file of repeat keys for all reads
-print('Writing and processing intermediate files...')
-sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-sumofrepeatreads = 0
-readid = {}
+# implement repeats_by_reads from the inverse dictionnary metagenome_reads
+repeats_by_reads = defaultdict(list)  # readids: list of repeats names
+for repname in metagenome_reads:
+    for read in metagenome_reads[repname]:
+        repeats_by_reads[read].append(repname)
+for repname in repeats_by_reads:
+    repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))
 
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] = ''
+# 3 dictionnaries and 1 pointer variable to be populated
+fractionalcounts = defaultdict(float)
+familyfractionalcounts = defaultdict(float)
+classfractionalcounts = defaultdict(float)
+sumofrepeatreads = 0
 
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] += f"{repeat_key[rep]},"
-
-for subfamilies in readid.values():
-    if subfamilies not in counts:
-        counts[subfamilies] = 0
-    counts[subfamilies] += 1
+# Update counts dictionnary with sets of repeats (was "subfamilies")
+# matched by multimappers
+for repeat_set in repeats_by_reads.values():
+    repeat_set_string = ','.join(repeat_set)
+    counts[repeat_set_string] += 1
     sumofrepeatreads += 1
 
-print(f'Identified {sumofrepeatreads} reads that mapped to \
-      repeats for unique and multimappers.')
-print("Conducting final calculations...")
+print(f'Identified more {sumofrepeatreads} mutimapper repeat reads')
 
-# building the total counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',').split(',')
-    for i in x:
-        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
-# building the fractional counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',') .split(',')
-    splits = len(x)
-    for i in x:
-        fractionalcounts[rev_repeat_key[int(i)]] += float(
-            numpy.divide(float(count), float(splits)))
-# building categorized table of repeat element enrichment...
-repcounts = {}
-repcounts['other'] = 0
-for key in counts.keys():
-    key_list = key.strip(',').split(',')
-    repname = ''
+# Populate fractionalcounts
+for key, count in counts.items():
+    key_list = key.split(',')
     for i in key_list:
-        repname = os.path.join(repname, rev_repeat_key[int(i)])
-    repcounts[repname] = counts[key]
-# building the total counts for class enrichment...
-for key in reptotalcounts.keys():
-    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
-# building total counts for family enrichment...
-for key in reptotalcounts.keys():
-    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
-# building unique counts table
-repcounts2 = {}
-for rep in repeat_list:
-    if "/" + rep in repcounts:
-        repcounts2[rep] = repcounts["/" + rep]
-    else:
-        repcounts2[rep] = 0
-# building the fractionalcounts counts for class enrichment...
-for key in fractionalcounts.keys():
-    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
-# building fractional counts for family enrichment...
-for key in fractionalcounts.keys():
-    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
+        fractionalcounts[i] += count / len(key_list)
 
-# print output to file of the categorized counts and total overlapping counts:
-print('Writing final output...')
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"class_fraction_counts.txt", 'w') as fout:
-    for key in sorted(classfractionalcounts.keys()):
+# build repeat_ref for easy access to rep class and rep families
+repeat_ref = defaultdict(dict)
+repeats = import_text(annotation_file, ' ')
+for repeat in repeats:
+    repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
+    try:
+        repclass = repeat[10].split('/')[0]
+        repfamily = repeat[10].split('/')[1]
+    except IndexError:
+        repclass, repfamily = repeat[10], repeat[10]
+    repeat_ref[repeat_name]['class'] = repclass
+    repeat_ref[repeat_name]['family'] = repfamily
+
+# Populate classfractionalcounts and familyfractionalcounts
+for key, value in fractionalcounts.items():
+    classfractionalcounts[repeat_ref[key]['class']] += value
+    familyfractionalcounts[repeat_ref[key]['family']] += value
+
+# print class-, family- and fraction-repeats counts to files
+with open("class_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(classfractionalcounts):
         fout.write(f"{key}\t{classfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"family_fraction_counts.txt", 'w') as fout:
-    for key in sorted(familyfractionalcounts.keys()):
+with open("family_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(familyfractionalcounts):
         fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"fraction_counts.txt", 'w') as fout:
-    for key in sorted(fractionalcounts.keys()):
-        fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
-                   f"{int(fractionalcounts[key])}\n")
+with open("fraction_counts.tsv", 'w') as fout:
+    for key in sorted(fractionalcounts):
+        fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
+                   f"{repeat_ref[key]['family']}\t"
+                   f"{fractionalcounts[key]}\n")
```