Mercurial > repos > drosofff > repenrich
diff RepEnrich_setup.py @ 0:1435d142041b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit d5ebd581fa3a22ca61ce07a31c01bb70610fbcf5
author | drosofff |
---|---|
date | Tue, 23 May 2017 18:37:22 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RepEnrich_setup.py Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,194 @@ +#!/usr/bin/env python +import argparse +import csv +import os +import shlex +import subprocess +import sys +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC + +parser = argparse.ArgumentParser(description='Part I: Prepartion of repetive element psuedogenomes and repetive element bamfiles. This script prepares the annotation used by downstream applications to analyze for repetitive element enrichment. For this script to run properly bowtie must be loaded. The repeat element psuedogenomes are prepared in order to analyze reads that map to multiple locations of the genome. The repeat element bamfiles are prepared in order to use a region sorter to analyze reads that map to a single location of the genome.You will 1) annotation_file: The repetitive element annotation file downloaded from RepeatMasker.org database for your organism of interest. 2) genomefasta: Your genome of interest in fasta format, 3)setup_folder: a folder to contain repeat element setup files command-line usage EXAMPLE: python master_setup.py /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt /users/nneretti/data/annotation/mm9/mm9.fa /users/nneretti/data/annotation/mm9/setup_folder', prog='getargs_genome_maker.py') +parser.add_argument('--version', action='version', version='%(prog)s 0.1') +parser.add_argument('annotation_file', action= 'store', metavar='annotation_file', help='List annotation file. The annotation file contains the repeat masker annotation for the genome of interest and may be downloaded at RepeatMasker.org Example /data/annotation/mm9/mm9.fa.out') +parser.add_argument('genomefasta', action= 'store', metavar='genomefasta', help='File name and path for genome of interest in fasta format. Example /data/annotation/mm9/mm9.fa') +parser.add_argument('setup_folder', action= 'store', metavar='setup_folder', help='List folder to contain bamfiles for repeats and repeat element psuedogenomes. Example /data/annotation/mm9/setup') +parser.add_argument('--nfragmentsfile1', action= 'store', dest='nfragmentsfile1', metavar='nfragmentsfile1', default='./repnames_nfragments.txt', help='Output location of a description file that saves the number of fragments processed per repname. Default ./repnames_nfragments.txt') +parser.add_argument('--gaplength', action= 'store', dest='gaplength', metavar='gaplength', default= '200', type=int, help='Length of the spacer used to build repeat psuedogeneomes. Default 200') +parser.add_argument('--flankinglength', action= 'store', dest='flankinglength', metavar='flankinglength', default= '25', type=int, help='Length of the flanking region adjacent to the repeat element that is used to build repeat psuedogeneomes. The flanking length should be set according to the length of your reads. Default 25') +parser.add_argument('--is_bed', action= 'store', dest='is_bed', metavar='is_bed', default= 'FALSE', help='Is the annotation file a bed file. This is also a compatible format. The file needs to be a tab seperated bed with optional fields. Ex. format chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element if not applicable. Default FALSE change to TRUE') +args = parser.parse_args() + +# parameters and paths specified in args_parse +gapl = args.gaplength +flankingl = args.flankinglength +annotation_file = args.annotation_file +genomefasta = args.genomefasta +setup_folder = args.setup_folder +nfragmentsfile1 = args.nfragmentsfile1 +is_bed = args.is_bed + +################################################################################ +# check that the programs we need are available +try: + subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) +except OSError: + print ("Error: Bowtie or BEDTools not loaded") + raise + +################################################################################ +# Define a text importer +csv.field_size_limit(sys.maxsize) +def import_text(filename, separator): + for line in csv.reader(open(os.path.realpath(filename)), delimiter=separator, + skipinitialspace=True): + if line: + yield line +# Make a setup folder +if not os.path.exists(setup_folder): + os.makedirs(setup_folder) + +################################################################################ +# load genome into dictionary +print ("loading genome...") +g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) + +print ("Precomputing length of all chromosomes...") +idxgenome = {} +lgenome = {} +genome = {} +allchrs = g.keys() +k = 0 +for chr in allchrs: + genome[chr] = str(g[chr].seq) +# del g[chr] + lgenome[chr] = len(genome[chr]) + idxgenome[chr] = k + k = k + 1 +del g + +################################################################################ +# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter +if is_bed == "FALSE": + repeat_elements= [] + fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') + fin = import_text(annotation_file, ' ') + x = 0 + rep_chr = {} + rep_start = {} + rep_end = {} + x = 0 + for line in fin: + if x>2: + line9 = line[9].replace("(","_").replace(")","_").replace("/","_") + repname = line9 + if not repname in repeat_elements: + repeat_elements.append(repname) + repchr = line[4] + repstart = int(line[5]) + repend = int(line[6]) +# print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname)+ '\n') +# if rep_chr.has_key(repname): + if repname in rep_chr: + rep_chr[repname].append(repchr) + rep_start[repname].append(int(repstart)) + rep_end[repname].append(int(repend)) + else: + rep_chr[repname] = [repchr] + rep_start[repname] = [int(repstart)] + rep_end[repname] = [int(repend)] + x +=1 +if is_bed == "TRUE": + repeat_elements= [] + fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') + fin = open(os.path.realpath(annotation_file), 'r') + x =0 + rep_chr = {} + rep_start = {} + rep_end = {} + x =0 + for line in fin: + line=line.strip('\n') + line=line.split('\t') + line3 = line[3].replace("(","_").replace(")","_").replace("/","_") + repname = line3 + if not repname in repeat_elements: + repeat_elements.append(repname) + repchr = line[0] + repstart = int(line[1]) + repend = int(line[2]) +# print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + '\n') +# if rep_chr.has_key(repname): + if repname in rep_chr: + rep_chr[repname].append(repchr) + rep_start[repname].append(int(repstart)) + rep_end[repname].append(int(repend)) + else: + rep_chr[repname] = [repchr] + rep_start[repname] = [int(repstart)] + rep_end[repname] = [int(repend)] + +fin.close() +fout.close() +repeat_elements = sorted(repeat_elements) +print ("Writing a key for all repeats...") +#print to fout the binary key that contains each repeat type with the associated binary number; sort the binary key: +fout = open(os.path.realpath(setup_folder + os.path.sep + 'repgenomes_key.txt'), 'w') +x = 0 +for repeat in repeat_elements: +# print >> fout, str(repeat) + '\t' + str(x) + fout.write(str(repeat) + '\t' + str(x) + '\n') + x +=1 +fout.close() +################################################################################ +# generate spacer for psuedogenomes +spacer = "" +for i in range(gapl): + spacer = spacer + "N" + +# save file with number of fragments processed per repname +print ("Saving number of fragments processed per repname to " + nfragmentsfile1) +fout1 = open(os.path.realpath(nfragmentsfile1),"w") +for repname in rep_chr.keys(): + rep_chr_current = rep_chr[repname] +# print >>fout1, str(len(rep_chr[repname])) + "\t" + repname + fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') +fout1.close() + +# generate metagenomes and save them to FASTA files +k = 1 +nrepgenomes = len(rep_chr.keys()) +for repname in rep_chr.keys(): + metagenome = "" + newname = repname.replace("(","_").replace(")","_").replace("/","_") + print ("processing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") + rep_chr_current = rep_chr[repname] + rep_start_current = rep_start[repname] + rep_end_current = rep_end[repname] + print ("-------> " + str(len(rep_chr[repname])) + " fragments") + for i in range(len(rep_chr[repname])): + try: + chr = rep_chr_current[i] + rstart = max(rep_start_current[i] - flankingl, 0) + rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) + metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] + except KeyError: + print ("Unrecognised Chromosome: "+chr) + pass + + # Convert metagenome to SeqRecord object (required by SeqIO.write) + record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id = "repname", name = "", description = "") + print ("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") + fastafilename = os.path.realpath(setup_folder + os.path.sep + newname + ".fa") + SeqIO.write(record, fastafilename, "fasta") + print ("indexing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") + command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + setup_folder + os.path.sep + newname) + p = subprocess.Popen(command).communicate() + k += 1 + +print ("... Done") +