Mercurial > repos > artbio > repenrich
view RepEnrich_setup.py @ 3:1c9810ba0638 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 50a80e047ef74664d616a332f93c84f27cb6b7a0
author | artbio |
---|---|
date | Fri, 22 Sep 2017 03:19:23 -0400 |
parents | f6f0f1e5e940 |
children | 6bba3e33c2e7 |
line wrap: on
line source
#!/usr/bin/env python import argparse import csv import os import shlex import subprocess import sys from Bio import SeqIO from Bio.Alphabet import IUPAC from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord parser = argparse.ArgumentParser(description=''' Part I: Prepartion of repetive element psuedogenomes and repetive\ element bamfiles. This script prepares the annotation used by\ downstream applications to analyze for repetitive element\ enrichment. For this script to run properly bowtie must be\ loaded. The repeat element psuedogenomes are prepared in order\ to analyze reads that map to multiple locations of the genome.\ The repeat element bamfiles are prepared in order to use a\ region sorter to analyze reads that map to a single location\ of the genome. You will 1) annotation_file:\ The repetitive element annotation file downloaded from\ RepeatMasker.org database for your organism of interest.\ 2) genomefasta: Your genome of interest in fasta format,\ 3)setup_folder: a folder to contain repeat element setup files\ command-line usage EXAMPLE: python master_setup.py\ /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\ /users/nneretti/data/annotation/mm9/mm9.fa\ /users/nneretti/data/annotation/mm9/setup_folder''', prog='getargs_genome_maker.py') parser.add_argument('--version', action='version', version='%(prog)s 0.1') parser.add_argument('annotation_file', action='store', metavar='annotation_file', help='''List annotation file. The annotation file contains\ the repeat masker annotation for the genome of\ interest and may be downloaded at RepeatMasker.org\ Example /data/annotation/mm9/mm9.fa.out''') parser.add_argument('genomefasta', action='store', metavar='genomefasta', help='''File name and path for genome of interest in fasta\ format. Example /data/annotation/mm9/mm9.fa''') parser.add_argument('setup_folder', action='store', metavar='setup_folder', help='''List folder to contain bamfiles for repeats and\ repeat element psuedogenomes.\ Example /data/annotation/mm9/setup''') parser.add_argument('--nfragmentsfile1', action='store', dest='nfragmentsfile1', metavar='nfragmentsfile1', default='./repnames_nfragments.txt', help='''Output location of a description file that saves\ the number of fragments processed per repname. Default ./repnames_nfragments.txt''') parser.add_argument('--gaplength', action='store', dest='gaplength', metavar='gaplength', default='200', type=int, help='Length of the spacer used to build\ repeat psuedogeneomes. Default 200') parser.add_argument('--flankinglength', action='store', dest='flankinglength', metavar='flankinglength', default='25', type=int, help='Length of the flanking region adjacent to the repeat\ element that is used to build repeat psuedogeneomes.\ The flanking length should be set according to the\ length of your reads. Default 25') parser.add_argument('--is_bed', action='store', dest='is_bed', metavar='is_bed', default='FALSE', help='''Is the annotation file a bed file. This is also a\ compatible format. The file needs to be a tab\ separated bed with optional fields. Ex. format: chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element\ if not applicable. Default FALSE change to TRUE''') args = parser.parse_args() # parameters and paths specified in args_parse gapl = args.gaplength flankingl = args.flankinglength annotation_file = args.annotation_file genomefasta = args.genomefasta setup_folder = args.setup_folder nfragmentsfile1 = args.nfragmentsfile1 is_bed = args.is_bed ############################################################################## # check that the programs we need are available try: subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: print("Error: Bowtie or BEDTools not loaded") raise ############################################################################## # Define a text importer csv.field_size_limit(sys.maxsize) def import_text(filename, separator): for line in csv.reader(open(os.path.realpath(filename)), delimiter=separator, skipinitialspace=True): if line: yield line # Make a setup folder if not os.path.exists(setup_folder): os.makedirs(setup_folder) ############################################################################## # load genome into dictionary print("loading genome...") g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) print("Precomputing length of all chromosomes...") idxgenome = {} lgenome = {} genome = {} allchrs = g.keys() k = 0 for chr in allchrs: genome[chr] = str(g[chr].seq) # del g[chr] lgenome[chr] = len(genome[chr]) idxgenome[chr] = k k = k + 1 del g ############################################################################## # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter if is_bed == "FALSE": repeat_elements = [] fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') fin = import_text(annotation_file, ' ') x = 0 rep_chr = {} rep_start = {} rep_end = {} x = 0 for line in fin: if x > 2: line9 = line[9].replace("(", "_").replace(")", "_").replace("/", "_") repname = line9 if repname not in repeat_elements: repeat_elements.append(repname) repchr = line[4] repstart = int(line[5]) repend = int(line[6]) fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend) + '\t' + str(repname) + '\n') if repname in rep_chr: rep_chr[repname].append(repchr) rep_start[repname].append(int(repstart)) rep_end[repname].append(int(repend)) else: rep_chr[repname] = [repchr] rep_start[repname] = [int(repstart)] rep_end[repname] = [int(repend)] x += 1 if is_bed == "TRUE": repeat_elements = [] fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') fin = open(os.path.realpath(annotation_file), 'r') x = 0 rep_chr = {} rep_start = {} rep_end = {} x = 0 for line in fin: line = line.strip('\n') line = line.split('\t') line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") repname = line3 if repname not in repeat_elements: repeat_elements.append(repname) repchr = line[0] repstart = int(line[1]) repend = int(line[2]) fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend) + '\t' + str(repname) + '\n') # if rep_chr.has_key(repname): if repname in rep_chr: rep_chr[repname].append(repchr) rep_start[repname].append(int(repstart)) rep_end[repname].append(int(repend)) else: rep_chr[repname] = [repchr] rep_start[repname] = [int(repstart)] rep_end[repname] = [int(repend)] fin.close() fout.close() repeat_elements = sorted(repeat_elements) print("Writing a key for all repeats...") # print to fout the binary key that contains each repeat type with the # associated binary number; sort the binary key: fout = open(os.path.realpath(setup_folder + os.path.sep + 'repgenomes_key.txt'), 'w') x = 0 for repeat in repeat_elements: # print >> fout, str(repeat) + '\t' + str(x) fout.write(str(repeat) + '\t' + str(x) + '\n') x += 1 fout.close() ############################################################################## # generate spacer for psuedogenomes spacer = "" for i in range(gapl): spacer = spacer + "N" # save file with number of fragments processed per repname print("Saving number of fragments processed per repname to " + nfragmentsfile1) fout1 = open(os.path.realpath(nfragmentsfile1), "w") for repname in rep_chr.keys(): rep_chr_current = rep_chr[repname] # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') fout1.close() # generate metagenomes and save them to FASTA files k = 1 nrepgenomes = len(rep_chr.keys()) for repname in rep_chr.keys(): metagenome = "" newname = repname.replace("(", "_").replace(")", "_").replace("/", "_") print("processing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") rep_chr_current = rep_chr[repname] rep_start_current = rep_start[repname] rep_end_current = rep_end[repname] print("-------> " + str(len(rep_chr[repname])) + " fragments") for i in range(len(rep_chr[repname])): try: chr = rep_chr_current[i] rstart = max(rep_start_current[i] - flankingl, 0) rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] except KeyError: print("Unrecognised Chromosome: "+chr) pass # Convert metagenome to SeqRecord object (required by SeqIO.write) record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id="repname", name="", description="") print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") fastafilename = os.path.realpath(setup_folder + os.path.sep + newname + ".fa") SeqIO.write(record, fastafilename, "fasta") print("indexing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + setup_folder + os.path.sep + newname) p = subprocess.Popen(command).communicate() k += 1 print("... Done")