Mercurial > repos > artbio > repenrich2
view RepEnrich2.py @ 9:2b61c6407efb draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 961a14cd73dd8e40b2f30d5e4df136b98cd8f07a
author | artbio |
---|---|
date | Thu, 25 Apr 2024 16:22:34 +0000 |
parents | 567549a49eb2 |
children |
line wrap: on
line source
import argparse import csv import os import shlex import subprocess import sys from collections import defaultdict from concurrent.futures import ProcessPoolExecutor parser = argparse.ArgumentParser(description=''' Repenrich aligns reads to Repeat Elements pseudogenomes\ and counts aligned reads. RepEnrich_setup must be run\ before its use''') parser.add_argument('--annotation_file', action='store', metavar='annotation_file', help='RepeatMasker.org annotation file for your\ organism. The file may be downloaded from\ RepeatMasker.org. E.g. hg19_repeatmasker.txt') parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam', help='Bam alignments of unique mapper reads.') parser.add_argument('--fastqfile', action='store', metavar='fastqfile', help='File of fastq reads mapping to multiple\ locations. Example: /data/multimap.fastq') parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', metavar='fastqfile2', default='', help='fastqfile #2 when using paired-end option.\ Default none') parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', default="1", type=int, help='Number of CPUs. The more cpus the\ faster RepEnrich performs. Default: "1"') args = parser.parse_args() # parameters annotation_file = args.annotation_file unique_mapper_bam = args.alignment_bam fastqfile_1 = args.fastqfile fastqfile_2 = args.fastqfile2 cpus = args.cpus # Change if simple repeats are differently annotated in your organism simple_repeat = "Simple_repeat" if args.fastqfile2: paired_end = True else: paired_end = False # check that the programs we need are available try: subprocess.call(shlex.split("coverageBed -h"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) subprocess.call(shlex.split("bowtie2 --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: print("Error: Bowtie2 or bedtools not loaded") raise def starts_with_numerical(list): try: if len(list) == 0: return False int(list[0]) return True except ValueError: return False # define a text importer for .out/.txt format of repbase def import_text(filename, separator): csv.field_size_limit(sys.maxsize) file = csv.reader(open(filename), delimiter=separator, skipinitialspace=True) return [line for line in file if starts_with_numerical(line)] # set a reference repeat list for the script repeat_list = [listline[9].translate( str.maketrans( '()/', '___')) for listline in import_text(annotation_file, ' ')] repeat_list = sorted(list(set(repeat_list))) # unique mapper counting cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \ bedtools coverage -b stdin -a repnames.bed" p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n') # parse bedtools output counts = defaultdict(int) # key: repeat names, value: unique mapper counts sumofrepeatreads = 0 for line in bedtools_counts: line = line.split('\t') counts[line[3]] += int(line[4]) sumofrepeatreads += int(line[4]) print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.") # print unique mapper counts with open("unique_mapper_counts.tsv", 'w') as fout: fout.write("#element\tcount\n") for count in sorted(counts): fout.write(f"{count}\t{counts[count]}\n") def run_bowtie(args): ''' write to files to save memory ''' metagenome = args b_opt = "-k 1 -p 2 --quiet --no-hd --no-unal" if paired_end is True: command = shlex.split(f"bowtie2 {b_opt} -x {metagenome}" f" -1 {fastqfile_1} -2 {fastqfile_1}") else: command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile_1}") bowtie_align = subprocess.run(command, check=True, capture_output=True, text=True).stdout bowtie_align = bowtie_align.rstrip('\r\n').split('\n') with open(f"{metagenome}.reads", "a+") as readfile: for line in bowtie_align: read = line.split()[0].split("/")[0] if read: readfile.write(f"{read}\n") # multimapper parsing args_list = [metagenome for metagenome in repeat_list] with ProcessPoolExecutor(max_workers=cpus) as executor: results = executor.map(run_bowtie, args_list) # Aggregate results (avoiding race conditions) metagenome_reads = defaultdict(list) # metagenome: list of multimap reads # Now we read .reads files to populate metagnomes_reads for metagenome in repeat_list: with open(f"{metagenome}.reads") as readfile: for read in readfile: metagenome_reads[metagenome].append(read) # read are only once in list metagenome_reads[metagenome] = list(set(metagenome_reads[metagenome])) # implement repeats_by_reads from the inverse dictionnary metagenome_reads repeats_by_reads = defaultdict(list) # readids: list of repeats names for repname in metagenome_reads: for read in metagenome_reads[repname]: repeats_by_reads[read].append(repname) for repname in repeats_by_reads: repeats_by_reads[repname] = list(set(repeats_by_reads[repname])) # this repeats_by_reads dictionary is far too big # 3 dictionnaries and 1 pointer variable to be populated fractionalcounts = defaultdict(float) familyfractionalcounts = defaultdict(float) classfractionalcounts = defaultdict(float) sumofrepeatreads = 0 # Update counts dictionnary with sets of repeats (was "subfamilies") # matched by multimappers for repeat_set in repeats_by_reads.values(): repeat_set_string = ','.join(repeat_set) counts[repeat_set_string] += 1 sumofrepeatreads += 1 print(f'Identified more {sumofrepeatreads} mutimapper repeat reads') # Populate fractionalcounts for key, count in counts.items(): key_list = key.split(',') for i in key_list: fractionalcounts[i] += count / len(key_list) # build repeat_ref for easy access to rep class and rep families repeat_ref = defaultdict(dict) repeats = import_text(annotation_file, ' ') for repeat in repeats: repeat_name = repeat[9].translate(str.maketrans('()/', '___')) try: repclass = repeat[10].split('/')[0] repfamily = repeat[10].split('/')[1] except IndexError: repclass, repfamily = repeat[10], repeat[10] repeat_ref[repeat_name]['class'] = repclass repeat_ref[repeat_name]['family'] = repfamily # Populate classfractionalcounts and familyfractionalcounts for key, value in fractionalcounts.items(): classfractionalcounts[repeat_ref[key]['class']] += value familyfractionalcounts[repeat_ref[key]['family']] += value # print class-, family- and fraction-repeats counts to files with open("class_fraction_counts.tsv", 'w') as fout: for key in sorted(classfractionalcounts): fout.write(f"{key}\t{round(classfractionalcounts[key], 2)}\n") with open("family_fraction_counts.tsv", 'w') as fout: for key in sorted(familyfractionalcounts): fout.write(f"{key}\t{round(familyfractionalcounts[key], 2)}\n") with open("fraction_counts.tsv", 'w') as fout: for key in sorted(fractionalcounts): fout.write(f"{key}\t{repeat_ref[key]['class']}\t" f"{repeat_ref[key]['family']}\t" f"{round(fractionalcounts[key], 2)}\n")