diff RepEnrich_setup.py @ 13:530626b0757c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author artbio
date Tue, 02 Apr 2024 21:16:37 +0000
parents 89e05f831259
children bf866bedd4b4
line wrap: on
line diff
--- a/RepEnrich_setup.py	Mon Mar 18 09:39:44 2024 +0000
+++ b/RepEnrich_setup.py	Tue Apr 02 21:16:37 2024 +0000
@@ -5,6 +5,9 @@
 import shlex
 import subprocess
 import sys
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
+
 
 from Bio import SeqIO
 from Bio.Seq import Seq
@@ -22,10 +25,6 @@
 parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
                     help='''Genome of interest in fasta format.\
                          Example: mm9.fa''')
-parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
-                    help='''Folder that contains bowtie indexes of repeats and\
-                         repeat element psuedogenomes.\
-                         Example working/setup''')
 parser.add_argument('--gaplength', action='store', dest='gaplength',
                     metavar='gaplength', default='200', type=int,
                     help='''Length of the N-spacer in the\
@@ -36,6 +35,10 @@
                          repeat pseudogenomes. Flanking length should be set\
                          according to the length of your reads.\
                          Default 25, for 50 nt reads''')
+parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
+                    default="1", type=int,
+                    help='Number of CPUs. The more cpus the\
+                          faster RepEnrich performs. Default: "1"')
 args = parser.parse_args()
 
 # parameters from argsparse
@@ -43,7 +46,7 @@
 flankingl = args.flankinglength
 annotation_file = args.annotation_file
 genomefasta = args.genomefasta
-setup_folder = args.setup_folder
+cpus = args.cpus
 
 # check that the programs we need are available
 try:
@@ -54,56 +57,51 @@
     print("Error: Bowtie not available in the path")
     raise
 
-# Define a text importer
-csv.field_size_limit(sys.maxsize)
 
-
-def import_text(filename, separator):
-    for line in csv.reader(open(os.path.realpath(filename)),
-                           delimiter=separator, skipinitialspace=True):
-        if line:
-            yield line
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
 
 
-# Make a setup folder
-if not os.path.exists(setup_folder):
-    os.makedirs(setup_folder)
+# define a text importer for .out/.txt format of repbase
+def import_text(filename, separator):
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
+
+
 # load genome into dictionary and compute length
 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
-idxgenome, lgenome, genome = {}, {}, {}
+genome = defaultdict(dict)
 
-for k, chr in enumerate(g.keys()):
-    genome[chr] = g[chr].seq
-    lgenome[chr] = len(genome[chr])
-    idxgenome[chr] = k
+for chr in g.keys():
+    genome[chr]['sequence'] = g[chr].seq
+    genome[chr]['length'] = len(g[chr].seq)
 
 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
-repeat_elements = []
-# these dictionaries will contain lists
-rep_chr, rep_start, rep_end = {}, {}, {}
-fin = import_text(annotation_file, ' ')
-with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout:
-    for i in range(3):
-        next(fin)
-    for line in fin:
+repeat_elements = set()
+rep_coords = defaultdict(list)  # Merged dictionary for coordinates
+
+with open('repnames.bed', 'w') as fout:
+    f_in = import_text(annotation_file, ' ')
+    for line in f_in:
         repname = line[9].translate(str.maketrans('()/', '___'))
-        if repname not in repeat_elements:
-            repeat_elements.append(repname)
-        repchr = line[4]
-        repstart = line[5]
-        repend = line[6]
-        fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n')
-        if repname in rep_chr:
-            rep_chr[repname].append(repchr)
-            rep_start[repname].append(repstart)
-            rep_end[repname].append(repend)
-        else:
-            rep_chr[repname] = [repchr]
-            rep_start[repname] = [repstart]
-            rep_end[repname] = [repend]
+        repeat_elements.add(repname)
+        repchr, repstart, repend = line[4], line[5], line[6]
+        fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n")
+        rep_coords[repname].extend([repchr, repstart, repend])
+# repeat_elements now contains the unique repeat names
+# rep_coords is a dictionary where keys are repeat names and values are lists
+# containing chromosome, start, and end coordinates for each repeat instance
 
-# sort repeat_elements and print them in repgenomes_key.txt
-with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout:
+# sort repeat_elements and print them in repeatIDs.txt
+with open('repeatIDs.txt', 'w') as fout:
     for i, repeat in enumerate(sorted(repeat_elements)):
         fout.write('\t'.join([repeat, str(i)]) + '\n')
 
@@ -111,24 +109,41 @@
 spacer = ''.join(['N' for i in range(gapl)])
 
 # generate metagenomes and save them to FASTA files for bowtie build
-for repname in rep_chr:
+for repname in rep_coords:
     metagenome = ''
-    for i, repeat in enumerate(rep_chr[repname]):
-        try:
-            chromosome = rep_chr[repname][i]
-            start = max(int(rep_start[repname][i]) - flankingl, 0)
-            end = min(int(rep_end[repname][i]) + flankingl,
-                      int(lgenome[chr])-1) + 1
-            metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}"
-        except KeyError:
-            print("Unrecognised Chromosome: " + rep_chr[repname][i])
+    # iterating coordinate list by block of 3 (chr, start, end)
+    block = 3
+    for i in range(0, len(rep_coords[repname]) - block + 1, block):
+        batch = rep_coords[repname][i:i+block]
+        print(batch)
+        chromosome = batch[0]
+        start = max(int(batch[1]) - flankingl, 0)
+        end = min(int(batch[2]) + flankingl,
+                  int(genome[chromosome]['length'])-1) + 1
+        metagenome = (
+            f"{metagenome}{spacer}"
+            f"{genome[chromosome]['sequence'][start:end]}"
+            )
 
     # Create Fasta of repeat pseudogenome
-    fastafilename = f"{os.path.join(setup_folder, repname)}.fa"
+    fastafilename = f"{repname}.fa"
     record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
     SeqIO.write(record, fastafilename, "fasta")
 
-    # Generate repeat pseudogenome bowtie index
-    bowtie_build_cmd = ["bowtie-build", "-f", fastafilename,
-                        os.path.join(setup_folder, repname)]
-    subprocess.run(bowtie_build_cmd, check=True)
+
+def bowtie_build(args):
+    """
+    Function to be executed in parallel by ProcessPoolExecutor.
+    """
+    try:
+        bowtie_base, fasta = args
+        command = shlex.split(f"bowtie-build -f {fasta} {bowtie_base}")
+        squash = subprocess.run(command, capture_output=True, text=True)
+        return squash.stdout
+    except Exception as e:
+        return str(e)
+
+
+args_list = [(name, f"{name}.fa") for name in rep_coords]
+with ProcessPoolExecutor(max_workers=cpus) as executor:
+    executor.map(bowtie_build, args_list)