Mercurial > repos > artbio > repenrich
diff RepEnrich_setup.py @ 13:530626b0757c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author | artbio |
---|---|
date | Tue, 02 Apr 2024 21:16:37 +0000 |
parents | 89e05f831259 |
children | bf866bedd4b4 |
line wrap: on
line diff
--- a/RepEnrich_setup.py Mon Mar 18 09:39:44 2024 +0000 +++ b/RepEnrich_setup.py Tue Apr 02 21:16:37 2024 +0000 @@ -5,6 +5,9 @@ import shlex import subprocess import sys +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor + from Bio import SeqIO from Bio.Seq import Seq @@ -22,10 +25,6 @@ parser.add_argument('--genomefasta', action='store', metavar='genomefasta', help='''Genome of interest in fasta format.\ Example: mm9.fa''') -parser.add_argument('--setup_folder', action='store', metavar='setup_folder', - help='''Folder that contains bowtie indexes of repeats and\ - repeat element psuedogenomes.\ - Example working/setup''') parser.add_argument('--gaplength', action='store', dest='gaplength', metavar='gaplength', default='200', type=int, help='''Length of the N-spacer in the\ @@ -36,6 +35,10 @@ repeat pseudogenomes. Flanking length should be set\ according to the length of your reads.\ Default 25, for 50 nt reads''') +parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', + default="1", type=int, + help='Number of CPUs. The more cpus the\ + faster RepEnrich performs. Default: "1"') args = parser.parse_args() # parameters from argsparse @@ -43,7 +46,7 @@ flankingl = args.flankinglength annotation_file = args.annotation_file genomefasta = args.genomefasta -setup_folder = args.setup_folder +cpus = args.cpus # check that the programs we need are available try: @@ -54,56 +57,51 @@ print("Error: Bowtie not available in the path") raise -# Define a text importer -csv.field_size_limit(sys.maxsize) - -def import_text(filename, separator): - for line in csv.reader(open(os.path.realpath(filename)), - delimiter=separator, skipinitialspace=True): - if line: - yield line +def starts_with_numerical(list): + try: + if len(list) == 0: + return False + int(list[0]) + return True + except ValueError: + return False -# Make a setup folder -if not os.path.exists(setup_folder): - os.makedirs(setup_folder) +# define a text importer for .out/.txt format of repbase +def import_text(filename, separator): + csv.field_size_limit(sys.maxsize) + file = csv.reader(open(filename), delimiter=separator, + skipinitialspace=True) + return [line for line in file if starts_with_numerical(line)] + + # load genome into dictionary and compute length g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) -idxgenome, lgenome, genome = {}, {}, {} +genome = defaultdict(dict) -for k, chr in enumerate(g.keys()): - genome[chr] = g[chr].seq - lgenome[chr] = len(genome[chr]) - idxgenome[chr] = k +for chr in g.keys(): + genome[chr]['sequence'] = g[chr].seq + genome[chr]['length'] = len(g[chr].seq) # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter -repeat_elements = [] -# these dictionaries will contain lists -rep_chr, rep_start, rep_end = {}, {}, {} -fin = import_text(annotation_file, ' ') -with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout: - for i in range(3): - next(fin) - for line in fin: +repeat_elements = set() +rep_coords = defaultdict(list) # Merged dictionary for coordinates + +with open('repnames.bed', 'w') as fout: + f_in = import_text(annotation_file, ' ') + for line in f_in: repname = line[9].translate(str.maketrans('()/', '___')) - if repname not in repeat_elements: - repeat_elements.append(repname) - repchr = line[4] - repstart = line[5] - repend = line[6] - fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n') - if repname in rep_chr: - rep_chr[repname].append(repchr) - rep_start[repname].append(repstart) - rep_end[repname].append(repend) - else: - rep_chr[repname] = [repchr] - rep_start[repname] = [repstart] - rep_end[repname] = [repend] + repeat_elements.add(repname) + repchr, repstart, repend = line[4], line[5], line[6] + fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n") + rep_coords[repname].extend([repchr, repstart, repend]) +# repeat_elements now contains the unique repeat names +# rep_coords is a dictionary where keys are repeat names and values are lists +# containing chromosome, start, and end coordinates for each repeat instance -# sort repeat_elements and print them in repgenomes_key.txt -with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout: +# sort repeat_elements and print them in repeatIDs.txt +with open('repeatIDs.txt', 'w') as fout: for i, repeat in enumerate(sorted(repeat_elements)): fout.write('\t'.join([repeat, str(i)]) + '\n') @@ -111,24 +109,41 @@ spacer = ''.join(['N' for i in range(gapl)]) # generate metagenomes and save them to FASTA files for bowtie build -for repname in rep_chr: +for repname in rep_coords: metagenome = '' - for i, repeat in enumerate(rep_chr[repname]): - try: - chromosome = rep_chr[repname][i] - start = max(int(rep_start[repname][i]) - flankingl, 0) - end = min(int(rep_end[repname][i]) + flankingl, - int(lgenome[chr])-1) + 1 - metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}" - except KeyError: - print("Unrecognised Chromosome: " + rep_chr[repname][i]) + # iterating coordinate list by block of 3 (chr, start, end) + block = 3 + for i in range(0, len(rep_coords[repname]) - block + 1, block): + batch = rep_coords[repname][i:i+block] + print(batch) + chromosome = batch[0] + start = max(int(batch[1]) - flankingl, 0) + end = min(int(batch[2]) + flankingl, + int(genome[chromosome]['length'])-1) + 1 + metagenome = ( + f"{metagenome}{spacer}" + f"{genome[chromosome]['sequence'][start:end]}" + ) # Create Fasta of repeat pseudogenome - fastafilename = f"{os.path.join(setup_folder, repname)}.fa" + fastafilename = f"{repname}.fa" record = SeqRecord(Seq(metagenome), id=repname, name='', description='') SeqIO.write(record, fastafilename, "fasta") - # Generate repeat pseudogenome bowtie index - bowtie_build_cmd = ["bowtie-build", "-f", fastafilename, - os.path.join(setup_folder, repname)] - subprocess.run(bowtie_build_cmd, check=True) + +def bowtie_build(args): + """ + Function to be executed in parallel by ProcessPoolExecutor. + """ + try: + bowtie_base, fasta = args + command = shlex.split(f"bowtie-build -f {fasta} {bowtie_base}") + squash = subprocess.run(command, capture_output=True, text=True) + return squash.stdout + except Exception as e: + return str(e) + + +args_list = [(name, f"{name}.fa") for name in rep_coords] +with ProcessPoolExecutor(max_workers=cpus) as executor: + executor.map(bowtie_build, args_list)