diff RepEnrich.py @ 13:530626b0757c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author artbio
date Tue, 02 Apr 2024 21:16:37 +0000
parents 89e05f831259
children bf866bedd4b4
line wrap: on
line diff
--- a/RepEnrich.py	Mon Mar 18 09:39:44 2024 +0000
+++ b/RepEnrich.py	Tue Apr 02 21:16:37 2024 +0000
@@ -4,8 +4,8 @@
 import shlex
 import subprocess
 import sys
-
-import numpy
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
 
 
 parser = argparse.ArgumentParser(description='''
@@ -17,26 +17,13 @@
                     help='RepeatMasker.org annotation file for your\
                           organism. The file may be downloaded from\
                           RepeatMasker.org. E.g. hg19_repeatmasker.txt')
-parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
-                    help='Folder that will contain results. Should be the\
-                          same as the one used for RepEnrich_setup.\
-                          Example: ./outputfolder')
-parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
-                    help='Prefix name for Repenrich output files.')
-parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
-                    help='Folder produced by RepEnrich_setup which contains\
-                    repeat element pseudogenomes.')
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
+                    help='Bam alignments of unique mapper reads.')
 parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
                     help='File of fastq reads mapping to multiple\
                           locations. Example: /data/multimap.fastq')
-parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
-                    help='Bam alignments of unique mapper reads.')
-parser.add_argument('--pairedend', action='store', dest='pairedend',
-                    default='FALSE',
-                    help='Change to TRUE for paired-end fastq files.\
-                          Default FALSE')
 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
-                    metavar='fastqfile2', default='none',
+                    metavar='fastqfile2', default='',
                     help='fastqfile #2 when using paired-end option.\
                           Default none')
 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
@@ -48,18 +35,16 @@
 
 # parameters
 annotation_file = args.annotation_file
-outputfolder = args.outputfolder
-outputfile_prefix = args.outputprefix
-setup_folder = args.setup_folder
-repeat_bed = os.path.join(setup_folder, 'repnames.bed')
 unique_mapper_bam = args.alignment_bam
 fastqfile_1 = args.fastqfile
 fastqfile_2 = args.fastqfile2
 cpus = args.cpus
-b_opt = "-k1 -p 1 --quiet"
 # Change if simple repeats are differently annotated in your organism
 simple_repeat = "Simple_repeat"
-paired_end = args.pairedend
+if args.fastqfile2:
+    paired_end = True
+else:
+    paired_end = False
 
 # check that the programs we need are available
 try:
@@ -73,260 +58,147 @@
     print("Error: Bowtie or bedtools not loaded")
     raise
 
-# define a csv reader that reads space deliminated files
-print('Preparing for analysis using RepEnrich...')
-csv.field_size_limit(sys.maxsize)
 
-
-def import_text(filename, separator):
-    for line in csv.reader(open(filename), delimiter=separator,
-                           skipinitialspace=True):
-        if line:
-            yield line
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
 
 
-# build dictionaries to convert repclass and rep families
-repeatclass, repeatfamily = {}, {}
-repeats = import_text(annotation_file, ' ')
-# skip three first lines of the iterator
-for line in range(3):
-    next(repeats)
-for repeat in repeats:
-    classfamily = []
-    classfamily = repeat[10].split('/')
-    matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
-    repeatclass[matching_repeat] = classfamily[0]
-    if len(classfamily) == 2:
-        repeatfamily[matching_repeat] = classfamily[1]
-    else:
-        repeatfamily[matching_repeat] = classfamily[0]
-
-# build list of repeats initializing dictionaries for downstream analysis'
-repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
-reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
-classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
-familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
-                          repeatfamily[line[0]] == simple_repeat else
-                          line[0]): 0 for line in import_text(
-                              repgenome_path, '\t')}
-repeat_key = {line[0]: int(line[1]) for line in import_text(
-    repgenome_path, '\t')}
-rev_repeat_key = {int(line[1]): line[0] for line in import_text(
-    repgenome_path, '\t')}
-repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
-
-# map the repeats to the psuedogenomes:
-if not os.path.exists(outputfolder):
-    os.mkdir(outputfolder)
-
-# Conduct the regions sorting
-fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
-command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
-                        {os.path.join(setup_folder, 'repnames.bed')}")
-with open(fileout, 'w') as stdout:
-    subprocess.run(command, stdout=stdout, check=True)
-
-counts = {}
-sumofrepeatreads = 0
-with open(fileout) as filein:
-    for line in filein:
-        line = line.split('\t')
-        if not str(repeat_key[line[3]]) in counts:
-            counts[str(repeat_key[line[3]])] = 0
-        counts[str(repeat_key[line[3]])] += int(line[4])
-        sumofrepeatreads += int(line[4])
-    print(f"Identified {sumofrepeatreads} unique reads that \
-            mapped to repeats.")
+# define a text importer for .out/.txt format of repbase
+def import_text(filename, separator):
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
 
 
-def run_bowtie(metagenome, fastqfile, folder):
-    metagenomepath = os.path.join(setup_folder, metagenome)
-    output_file = os.path.join(folder, f"{metagenome}.bowtie")
-    command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
-    with open(output_file, 'w') as stdout:
-        return subprocess.Popen(command, stdout=stdout)
+def run_bowtie(args):
+    metagenome, fastqfile = args
+    b_opt = "-k 1 -p 1 --quiet"
+    command = shlex.split(f"bowtie {b_opt} -x {metagenome} {fastqfile}")
+    bowtie_align = subprocess.run(command, check=True,
+                                  capture_output=True, text=True).stdout
+    bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
+    readlist = [metagenome]
+    if paired_end:
+        for line in bowtie_align:
+            readlist.append(line.split("/")[0])
+    else:
+        for line in bowtie_align:
+            readlist.append(line.split("\t")[0])
+    return readlist
 
 
-if paired_end == 'FALSE':
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-
-    print("Processing repeat pseudogenomes...")
-    processes = []
-    ticker = 0
-
-    for metagenome in repeat_list:
-        processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if ticker == cpus:
-            for p in processes:
-                p.communicate()
-            ticker = 0
-            processes = []
+# set a reference repeat list for the script
+repeat_list = [listline[9].translate(
+    str.maketrans(
+        '()/', '___')) for listline in import_text(annotation_file, ' ')]
+repeat_list = sorted(list(set(repeat_list)))
 
-    for p in processes:
-        p.communicate()
-    # Combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs....')
-    sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-    os.makedirs(sorted_bowtie, exist_ok=True)
-    for metagenome in repeat_list:
-        file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
-        fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
-else:
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-    os.makedirs(folder_pair2, exist_ok=True)
+# unique mapper counting
+cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
+        bedtools coverage -b stdin -a  repnames.bed"
+p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')
 
-    print("Processing repeat pseudogenomes...")
-    ps, psb, ticker = [], [], 0
+# parse bedtools output
+counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
+sumofrepeatreads = 0
+for line in bedtools_counts:
+    line = line.split('\t')
+    counts[line[3]] += int(line[4])
+    sumofrepeatreads += int(line[4])
+print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")
 
-    for metagenome in repeat_list:
-        ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if fastqfile_2 != 'none':
-            psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
-            ticker += 1
-        if ticker >= cpus:
-            for p in ps:
-                p.communicate()
-            for p in psb:
-                p.communicate()
-            ticker = 0
-            ps = []
-            psb = []
+# multimapper parsing
+if not paired_end:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+else:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    args_list.extend([(metagenome, fastqfile_2) for
+                     metagenome in repeat_list])
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+
+# Aggregate results (avoiding race conditions)
+metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
+for result in results:
+    metagenome_reads[result[0]] += result[1:]
 
-    for p in ps:
-        p.communicate()
-    for p in psb:
-        p.communicate()
-    # combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs...')
-    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
-    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
-    for metagenome in repeat_list:
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
-        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1, file2],
-                                  stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
+for name in metagenome_reads:
+    #  read are only once in list
+    metagenome_reads[name] = list(set(metagenome_reads[name]))
+    #  remove "no read" instances
+    metagenome_reads[name] = [read for read in metagenome_reads[name]
+                              if read != ""]
 
-# build a file of repeat keys for all reads
-print('Writing and processing intermediate files...')
-sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-sumofrepeatreads = 0
-readid = {}
+# implement repeats_by_reads from the inverse dictionnary metagenome_reads
+repeats_by_reads = defaultdict(list)  # readids: list of repeats names
+for repname in metagenome_reads:
+    for read in metagenome_reads[repname]:
+        repeats_by_reads[read].append(repname)
+for repname in repeats_by_reads:
+    repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))
 
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] = ''
+# 3 dictionnaries and 1 pointer variable to be populated
+fractionalcounts = defaultdict(float)
+familyfractionalcounts = defaultdict(float)
+classfractionalcounts = defaultdict(float)
+sumofrepeatreads = 0
 
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] += f"{repeat_key[rep]},"
-
-for subfamilies in readid.values():
-    if subfamilies not in counts:
-        counts[subfamilies] = 0
-    counts[subfamilies] += 1
+# Update counts dictionnary with sets of repeats (was "subfamilies")
+# matched by multimappers
+for repeat_set in repeats_by_reads.values():
+    repeat_set_string = ','.join(repeat_set)
+    counts[repeat_set_string] += 1
     sumofrepeatreads += 1
 
-print(f'Identified {sumofrepeatreads} reads that mapped to \
-        repeats for unique and multimappers.')
-print("Conducting final calculations...")
+print(f'Identified more {sumofrepeatreads} mutimapper repeat reads')
 
-# building the total counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',').split(',')
-    for i in x:
-        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
-# building the fractional counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',')    .split(',')
-    splits = len(x)
-    for i in x:
-        fractionalcounts[rev_repeat_key[int(i)]] += float(
-            numpy.divide(float(count), float(splits)))
-# building categorized table of repeat element enrichment...
-repcounts = {}
-repcounts['other'] = 0
-for key in counts.keys():
-    key_list = key.strip(',').split(',')
-    repname = ''
+# Populate fractionalcounts
+for key, count in counts.items():
+    key_list = key.split(',')
     for i in key_list:
-        repname = os.path.join(repname, rev_repeat_key[int(i)])
-    repcounts[repname] = counts[key]
-# building the total counts for class enrichment...
-for key in reptotalcounts.keys():
-    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
-# building total counts for family enrichment...
-for key in reptotalcounts.keys():
-    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
-# building unique counts table
-repcounts2 = {}
-for rep in repeat_list:
-    if "/" + rep in repcounts:
-        repcounts2[rep] = repcounts["/" + rep]
-    else:
-        repcounts2[rep] = 0
-# building the fractionalcounts counts for class enrichment...
-for key in fractionalcounts.keys():
-    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
-# building fractional counts for family enrichment...
-for key in fractionalcounts.keys():
-    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
+        fractionalcounts[i] += count / len(key_list)
 
-# print output to file of the categorized counts and total overlapping counts:
-print('Writing final output...')
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"class_fraction_counts.txt", 'w') as fout:
-    for key in sorted(classfractionalcounts.keys()):
+# build repeat_ref for easy access to rep class and rep families
+repeat_ref = defaultdict(dict)
+repeats = import_text(annotation_file, ' ')
+for repeat in repeats:
+    repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
+    try:
+        repclass = repeat[10].split('/')[0]
+        repfamily = repeat[10].split('/')[1]
+    except IndexError:
+        repclass, repfamily = repeat[10], repeat[10]
+    repeat_ref[repeat_name]['class'] = repclass
+    repeat_ref[repeat_name]['family'] = repfamily
+
+# Populate classfractionalcounts and familyfractionalcounts
+for key, value in fractionalcounts.items():
+    classfractionalcounts[repeat_ref[key]['class']] += value
+    familyfractionalcounts[repeat_ref[key]['family']] += value
+
+# print class-, family- and fraction-repeats counts to files
+with open("class_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(classfractionalcounts):
         fout.write(f"{key}\t{classfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"family_fraction_counts.txt", 'w') as fout:
-    for key in sorted(familyfractionalcounts.keys()):
+with open("family_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(familyfractionalcounts):
         fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"fraction_counts.txt", 'w') as fout:
-    for key in sorted(fractionalcounts.keys()):
-        fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
-                   f"{int(fractionalcounts[key])}\n")
+with open("fraction_counts.tsv", 'w') as fout:
+    for key in sorted(fractionalcounts):
+        fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
+                   f"{repeat_ref[key]['family']}\t"
+                   f"{fractionalcounts[key]}\n")