view RepEnrich_setup.py @ 11:6bba3e33c2e7 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 11df9f0b68d35d3a9424a17e4cefee6cfb9d4c19
author artbio
date Sat, 09 Mar 2024 22:32:46 +0000
parents f6f0f1e5e940
children 89e05f831259
line wrap: on
line source

#!/usr/bin/env python
import argparse
import csv
import os
import shlex
import subprocess
import sys

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

parser = argparse.ArgumentParser(description='''
             Part I: Prepartion of repetive element psuedogenomes and repetive\
             element bamfiles.  This script prepares the annotation used by\
             downstream applications to analyze for repetitive element\
             enrichment. For this script to run properly bowtie must be\
             loaded.  The repeat element psuedogenomes are prepared in order\
             to analyze reads that map to multiple locations of the genome.\
             The repeat element bamfiles are prepared in order to use a\
             region sorter to analyze reads that map to a single location\
             of the genome. You will 1) annotation_file:\
             The repetitive element annotation file downloaded from\
             RepeatMasker.org database for your organism of interest.\
             2) genomefasta: Your genome of interest in fasta format,\
             3)setup_folder: a folder to contain repeat element setup files\
             command-line usage
             EXAMPLE: python master_setup.py\
             /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\
             /users/nneretti/data/annotation/mm9/mm9.fa\
             /users/nneretti/data/annotation/mm9/setup_folder''',
                                 prog='getargs_genome_maker.py')
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
parser.add_argument('annotation_file', action='store',
                    metavar='annotation_file',
                    help='''List annotation file. The annotation file contains\
                         the repeat masker annotation for the genome of\
                         interest and may be downloaded at RepeatMasker.org\
                         Example /data/annotation/mm9/mm9.fa.out''')
parser.add_argument('genomefasta', action='store', metavar='genomefasta',
                    help='''File name and path for genome of interest in fasta\
                         format.  Example /data/annotation/mm9/mm9.fa''')
parser.add_argument('setup_folder', action='store', metavar='setup_folder',
                    help='''List folder to contain bamfiles for repeats and\
                         repeat element psuedogenomes.\
                         Example /data/annotation/mm9/setup''')
parser.add_argument('--nfragmentsfile1', action='store',
                    dest='nfragmentsfile1', metavar='nfragmentsfile1',
                    default='./repnames_nfragments.txt',
                    help='''Output location of a description file that saves\
                         the number of fragments processed per repname.
                         Default ./repnames_nfragments.txt''')
parser.add_argument('--gaplength', action='store', dest='gaplength',
                    metavar='gaplength', default='200', type=int,
                    help='Length of the spacer used to build\
                         repeat psuedogeneomes.  Default 200')
parser.add_argument('--flankinglength', action='store', dest='flankinglength',
                    metavar='flankinglength', default='25', type=int,
                    help='Length of the flanking region adjacent to the repeat\
                         element that is used to build repeat psuedogeneomes.\
                         The flanking length should be set according to the\
                         length of your reads.  Default 25')
parser.add_argument('--is_bed', action='store', dest='is_bed',
                    metavar='is_bed', default='FALSE',
                    help='''Is the annotation file a bed file. This is also a\
                         compatible format. The file needs to be a tab\
                         separated bed with optional fields.
                         Ex. format:
                         chr\tstart\tend\tName_element\tclass\tfamily.
                         The class and family should identical to name_element\
                         if not applicable.  Default FALSE change to TRUE''')
args = parser.parse_args()

# parameters and paths specified in args_parse
gapl = args.gaplength
flankingl = args.flankinglength
annotation_file = args.annotation_file
genomefasta = args.genomefasta
setup_folder = args.setup_folder
nfragmentsfile1 = args.nfragmentsfile1
is_bed = args.is_bed

##############################################################################
# check that the programs we need are available
try:
    subprocess.call(shlex.split("bowtie --version"),
                    stdout=open(os.devnull, 'wb'),
                    stderr=open(os.devnull, 'wb'))
except OSError:
    print("Error: Bowtie or BEDTools not loaded")
    raise

##############################################################################
# Define a text importer
csv.field_size_limit(sys.maxsize)


def import_text(filename, separator):
    for line in csv.reader(open(os.path.realpath(filename)),
                           delimiter=separator, skipinitialspace=True):
        if line:
            yield line


# Make a setup folder
if not os.path.exists(setup_folder):
    os.makedirs(setup_folder)
##############################################################################
# load genome into dictionary
print("loading genome...")
g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))

print("Precomputing length of all chromosomes...")
idxgenome = {}
lgenome = {}
genome = {}
allchrs = g.keys()
k = 0
for chr in allchrs:
    genome[chr] = str(g[chr].seq)
#    del g[chr]
    lgenome[chr] = len(genome[chr])
    idxgenome[chr] = k
    k = k + 1
del g

##############################################################################
# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
if is_bed == "FALSE":
    repeat_elements = []
    fout = open(os.path.realpath(setup_folder + os.path.sep
                                 + 'repnames.bed'), 'w')
    fin = import_text(annotation_file, ' ')
    x = 0
    rep_chr = {}
    rep_start = {}
    rep_end = {}
    x = 0
    for line in fin:
        if x > 2:
            line9 = line[9].replace("(", "_").replace(")",
                                                      "_").replace("/", "_")
            repname = line9
            if repname not in repeat_elements:
                repeat_elements.append(repname)
            repchr = line[4]
            repstart = int(line[5])
            repend = int(line[6])
            fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend)
                       + '\t' + str(repname) + '\n')
            if repname in rep_chr:
                rep_chr[repname].append(repchr)
                rep_start[repname].append(int(repstart))
                rep_end[repname].append(int(repend))
            else:
                rep_chr[repname] = [repchr]
                rep_start[repname] = [int(repstart)]
                rep_end[repname] = [int(repend)]
        x += 1
if is_bed == "TRUE":
    repeat_elements = []
    fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'),
                'w')
    fin = open(os.path.realpath(annotation_file), 'r')
    x = 0
    rep_chr = {}
    rep_start = {}
    rep_end = {}
    x = 0
    for line in fin:
        line = line.strip('\n')
        line = line.split('\t')
        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
        repname = line3
        if repname not in repeat_elements:
            repeat_elements.append(repname)
        repchr = line[0]
        repstart = int(line[1])
        repend = int(line[2])
        fout.write(str(repchr) + '\t' + str(repstart) + '\t' +
                   str(repend) + '\t' + str(repname) + '\n')
        # if rep_chr.has_key(repname):
        if repname in rep_chr:
            rep_chr[repname].append(repchr)
            rep_start[repname].append(int(repstart))
            rep_end[repname].append(int(repend))
        else:
            rep_chr[repname] = [repchr]
            rep_start[repname] = [int(repstart)]
            rep_end[repname] = [int(repend)]

fin.close()
fout.close()
repeat_elements = sorted(repeat_elements)
print("Writing a key for all repeats...")
# print to fout the binary key that contains each repeat type with the
# associated binary number; sort the binary key:
fout = open(os.path.realpath(setup_folder + os.path.sep +
                             'repgenomes_key.txt'), 'w')
x = 0
for repeat in repeat_elements:
    # print >> fout, str(repeat) + '\t' + str(x)
    fout.write(str(repeat) + '\t' + str(x) + '\n')
    x += 1
fout.close()
##############################################################################
# generate spacer for psuedogenomes
spacer = ""
for i in range(gapl):
    spacer = spacer + "N"

# save file with number of fragments processed per repname
print("Saving number of fragments processed per repname to "
      + nfragmentsfile1)
fout1 = open(os.path.realpath(nfragmentsfile1), "w")
for repname in rep_chr.keys():
    rep_chr_current = rep_chr[repname]
#    print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
    fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
fout1.close()

# generate metagenomes and save them to FASTA files
k = 1
nrepgenomes = len(rep_chr.keys())
for repname in rep_chr.keys():
    metagenome = ""
    newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")
    print("processing repgenome " + newname + ".fa" + " (" + str(k)
          + " of " + str(nrepgenomes) + ")")
    rep_chr_current = rep_chr[repname]
    rep_start_current = rep_start[repname]
    rep_end_current = rep_end[repname]
    print("-------> " + str(len(rep_chr[repname])) + " fragments")
    for i in range(len(rep_chr[repname])):
        try:
            chr = rep_chr_current[i]
            rstart = max(rep_start_current[i] - flankingl, 0)
            rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)
            metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]
        except KeyError:
            print("Unrecognised Chromosome: "+chr)
            pass
    # Convert metagenome to SeqRecord object (required by SeqIO.write)
    record = SeqRecord(Seq(metagenome), id="repname",
                       name="", description="")
    print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
          + str(nrepgenomes) + ")")
    fastafilename = os.path.realpath(setup_folder + os.path.sep
                                     + newname + ".fa")
    SeqIO.write(record, fastafilename, "fasta")
    print("indexing repgenome " + newname + ".fa" + " (" +
          str(k) + " of " + str(nrepgenomes) + ")")
    command = shlex.split('bowtie-build -f ' + fastafilename + ' ' +
                          setup_folder + os.path.sep + newname)
    p = subprocess.Popen(command).communicate()
    k += 1

print("... Done")