Mercurial > repos > artbio > repenrich
diff RepEnrich_setup.py @ 0:f6f0f1e5e940 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author | artbio |
---|---|
date | Wed, 02 Aug 2017 05:17:29 -0400 |
parents | |
children | 6bba3e33c2e7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RepEnrich_setup.py Wed Aug 02 05:17:29 2017 -0400 @@ -0,0 +1,259 @@ +#!/usr/bin/env python +import argparse +import csv +import os +import shlex +import subprocess +import sys + +from Bio import SeqIO +from Bio.Alphabet import IUPAC +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +parser = argparse.ArgumentParser(description=''' + Part I: Prepartion of repetive element psuedogenomes and repetive\ + element bamfiles. This script prepares the annotation used by\ + downstream applications to analyze for repetitive element\ + enrichment. For this script to run properly bowtie must be\ + loaded. The repeat element psuedogenomes are prepared in order\ + to analyze reads that map to multiple locations of the genome.\ + The repeat element bamfiles are prepared in order to use a\ + region sorter to analyze reads that map to a single location\ + of the genome. You will 1) annotation_file:\ + The repetitive element annotation file downloaded from\ + RepeatMasker.org database for your organism of interest.\ + 2) genomefasta: Your genome of interest in fasta format,\ + 3)setup_folder: a folder to contain repeat element setup files\ + command-line usage + EXAMPLE: python master_setup.py\ + /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\ + /users/nneretti/data/annotation/mm9/mm9.fa\ + /users/nneretti/data/annotation/mm9/setup_folder''', + prog='getargs_genome_maker.py') +parser.add_argument('--version', action='version', version='%(prog)s 0.1') +parser.add_argument('annotation_file', action='store', + metavar='annotation_file', + help='''List annotation file. The annotation file contains\ + the repeat masker annotation for the genome of\ + interest and may be downloaded at RepeatMasker.org\ + Example /data/annotation/mm9/mm9.fa.out''') +parser.add_argument('genomefasta', action='store', metavar='genomefasta', + help='''File name and path for genome of interest in fasta\ + format. Example /data/annotation/mm9/mm9.fa''') +parser.add_argument('setup_folder', action='store', metavar='setup_folder', + help='''List folder to contain bamfiles for repeats and\ + repeat element psuedogenomes.\ + Example /data/annotation/mm9/setup''') +parser.add_argument('--nfragmentsfile1', action='store', + dest='nfragmentsfile1', metavar='nfragmentsfile1', + default='./repnames_nfragments.txt', + help='''Output location of a description file that saves\ + the number of fragments processed per repname. + Default ./repnames_nfragments.txt''') +parser.add_argument('--gaplength', action='store', dest='gaplength', + metavar='gaplength', default='200', type=int, + help='Length of the spacer used to build\ + repeat psuedogeneomes. Default 200') +parser.add_argument('--flankinglength', action='store', dest='flankinglength', + metavar='flankinglength', default='25', type=int, + help='Length of the flanking region adjacent to the repeat\ + element that is used to build repeat psuedogeneomes.\ + The flanking length should be set according to the\ + length of your reads. Default 25') +parser.add_argument('--is_bed', action='store', dest='is_bed', + metavar='is_bed', default='FALSE', + help='''Is the annotation file a bed file. This is also a\ + compatible format. The file needs to be a tab\ + separated bed with optional fields. + Ex. format: + chr\tstart\tend\tName_element\tclass\tfamily. + The class and family should identical to name_element\ + if not applicable. Default FALSE change to TRUE''') +args = parser.parse_args() + +# parameters and paths specified in args_parse +gapl = args.gaplength +flankingl = args.flankinglength +annotation_file = args.annotation_file +genomefasta = args.genomefasta +setup_folder = args.setup_folder +nfragmentsfile1 = args.nfragmentsfile1 +is_bed = args.is_bed + +############################################################################## +# check that the programs we need are available +try: + subprocess.call(shlex.split("bowtie --version"), + stdout=open(os.devnull, 'wb'), + stderr=open(os.devnull, 'wb')) +except OSError: + print("Error: Bowtie or BEDTools not loaded") + raise + +############################################################################## +# Define a text importer +csv.field_size_limit(sys.maxsize) + + +def import_text(filename, separator): + for line in csv.reader(open(os.path.realpath(filename)), + delimiter=separator, skipinitialspace=True): + if line: + yield line + + +# Make a setup folder +if not os.path.exists(setup_folder): + os.makedirs(setup_folder) +############################################################################## +# load genome into dictionary +print("loading genome...") +g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) + +print("Precomputing length of all chromosomes...") +idxgenome = {} +lgenome = {} +genome = {} +allchrs = g.keys() +k = 0 +for chr in allchrs: + genome[chr] = str(g[chr].seq) +# del g[chr] + lgenome[chr] = len(genome[chr]) + idxgenome[chr] = k + k = k + 1 +del g + +############################################################################## +# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter +if is_bed == "FALSE": + repeat_elements = [] + fout = open(os.path.realpath(setup_folder + os.path.sep + + 'repnames.bed'), 'w') + fin = import_text(annotation_file, ' ') + x = 0 + rep_chr = {} + rep_start = {} + rep_end = {} + x = 0 + for line in fin: + if x > 2: + line9 = line[9].replace("(", "_").replace(")", + "_").replace("/", "_") + repname = line9 + if repname not in repeat_elements: + repeat_elements.append(repname) + repchr = line[4] + repstart = int(line[5]) + repend = int(line[6]) + fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend) + + '\t' + str(repname) + '\n') + if repname in rep_chr: + rep_chr[repname].append(repchr) + rep_start[repname].append(int(repstart)) + rep_end[repname].append(int(repend)) + else: + rep_chr[repname] = [repchr] + rep_start[repname] = [int(repstart)] + rep_end[repname] = [int(repend)] + x += 1 +if is_bed == "TRUE": + repeat_elements = [] + fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), + 'w') + fin = open(os.path.realpath(annotation_file), 'r') + x = 0 + rep_chr = {} + rep_start = {} + rep_end = {} + x = 0 + for line in fin: + line = line.strip('\n') + line = line.split('\t') + line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") + repname = line3 + if repname not in repeat_elements: + repeat_elements.append(repname) + repchr = line[0] + repstart = int(line[1]) + repend = int(line[2]) + fout.write(str(repchr) + '\t' + str(repstart) + '\t' + + str(repend) + '\t' + str(repname) + '\n') + # if rep_chr.has_key(repname): + if repname in rep_chr: + rep_chr[repname].append(repchr) + rep_start[repname].append(int(repstart)) + rep_end[repname].append(int(repend)) + else: + rep_chr[repname] = [repchr] + rep_start[repname] = [int(repstart)] + rep_end[repname] = [int(repend)] + +fin.close() +fout.close() +repeat_elements = sorted(repeat_elements) +print("Writing a key for all repeats...") +# print to fout the binary key that contains each repeat type with the +# associated binary number; sort the binary key: +fout = open(os.path.realpath(setup_folder + os.path.sep + + 'repgenomes_key.txt'), 'w') +x = 0 +for repeat in repeat_elements: + # print >> fout, str(repeat) + '\t' + str(x) + fout.write(str(repeat) + '\t' + str(x) + '\n') + x += 1 +fout.close() +############################################################################## +# generate spacer for psuedogenomes +spacer = "" +for i in range(gapl): + spacer = spacer + "N" + +# save file with number of fragments processed per repname +print("Saving number of fragments processed per repname to " + + nfragmentsfile1) +fout1 = open(os.path.realpath(nfragmentsfile1), "w") +for repname in rep_chr.keys(): + rep_chr_current = rep_chr[repname] +# print >>fout1, str(len(rep_chr[repname])) + "\t" + repname + fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') +fout1.close() + +# generate metagenomes and save them to FASTA files +k = 1 +nrepgenomes = len(rep_chr.keys()) +for repname in rep_chr.keys(): + metagenome = "" + newname = repname.replace("(", "_").replace(")", "_").replace("/", "_") + print("processing repgenome " + newname + ".fa" + " (" + str(k) + + " of " + str(nrepgenomes) + ")") + rep_chr_current = rep_chr[repname] + rep_start_current = rep_start[repname] + rep_end_current = rep_end[repname] + print("-------> " + str(len(rep_chr[repname])) + " fragments") + for i in range(len(rep_chr[repname])): + try: + chr = rep_chr_current[i] + rstart = max(rep_start_current[i] - flankingl, 0) + rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) + metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] + except KeyError: + print("Unrecognised Chromosome: "+chr) + pass + # Convert metagenome to SeqRecord object (required by SeqIO.write) + record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id="repname", + name="", description="") + print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " + + str(nrepgenomes) + ")") + fastafilename = os.path.realpath(setup_folder + os.path.sep + + newname + ".fa") + SeqIO.write(record, fastafilename, "fasta") + print("indexing repgenome " + newname + ".fa" + " (" + + str(k) + " of " + str(nrepgenomes) + ")") + command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + + setup_folder + os.path.sep + newname) + p = subprocess.Popen(command).communicate() + k += 1 + +print("... Done")