Mercurial > repos > artbio > repenrich
diff RepEnrich_setup.py @ 12:89e05f831259 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author | artbio |
---|---|
date | Mon, 18 Mar 2024 09:39:44 +0000 |
parents | 6bba3e33c2e7 |
children | 530626b0757c |
line wrap: on
line diff
--- a/RepEnrich_setup.py Sat Mar 09 22:32:46 2024 +0000 +++ b/RepEnrich_setup.py Mon Mar 18 09:39:44 2024 +0000 @@ -11,86 +11,49 @@ from Bio.SeqRecord import SeqRecord parser = argparse.ArgumentParser(description=''' - Part I: Prepartion of repetive element psuedogenomes and repetive\ - element bamfiles. This script prepares the annotation used by\ - downstream applications to analyze for repetitive element\ - enrichment. For this script to run properly bowtie must be\ - loaded. The repeat element psuedogenomes are prepared in order\ - to analyze reads that map to multiple locations of the genome.\ - The repeat element bamfiles are prepared in order to use a\ - region sorter to analyze reads that map to a single location\ - of the genome. You will 1) annotation_file:\ - The repetitive element annotation file downloaded from\ - RepeatMasker.org database for your organism of interest.\ - 2) genomefasta: Your genome of interest in fasta format,\ - 3)setup_folder: a folder to contain repeat element setup files\ - command-line usage - EXAMPLE: python master_setup.py\ - /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\ - /users/nneretti/data/annotation/mm9/mm9.fa\ - /users/nneretti/data/annotation/mm9/setup_folder''', + Prepartion of repetive element pseudogenomes bowtie\ + indexes and annotation files used by RepEnrich.py enrichment.''', prog='getargs_genome_maker.py') -parser.add_argument('--version', action='version', version='%(prog)s 0.1') -parser.add_argument('annotation_file', action='store', +parser.add_argument('--annotation_file', action='store', metavar='annotation_file', - help='''List annotation file. The annotation file contains\ - the repeat masker annotation for the genome of\ - interest and may be downloaded at RepeatMasker.org\ - Example /data/annotation/mm9/mm9.fa.out''') -parser.add_argument('genomefasta', action='store', metavar='genomefasta', - help='''File name and path for genome of interest in fasta\ - format. Example /data/annotation/mm9/mm9.fa''') -parser.add_argument('setup_folder', action='store', metavar='setup_folder', - help='''List folder to contain bamfiles for repeats and\ + help='''Repeat masker annotation of the genome of\ + interest. Download from RepeatMasker.org\ + Example: mm9.fa.out''') +parser.add_argument('--genomefasta', action='store', metavar='genomefasta', + help='''Genome of interest in fasta format.\ + Example: mm9.fa''') +parser.add_argument('--setup_folder', action='store', metavar='setup_folder', + help='''Folder that contains bowtie indexes of repeats and\ repeat element psuedogenomes.\ - Example /data/annotation/mm9/setup''') -parser.add_argument('--nfragmentsfile1', action='store', - dest='nfragmentsfile1', metavar='nfragmentsfile1', - default='./repnames_nfragments.txt', - help='''Output location of a description file that saves\ - the number of fragments processed per repname. - Default ./repnames_nfragments.txt''') + Example working/setup''') parser.add_argument('--gaplength', action='store', dest='gaplength', metavar='gaplength', default='200', type=int, - help='Length of the spacer used to build\ - repeat psuedogeneomes. Default 200') + help='''Length of the N-spacer in the\ + repeat pseudogenomes. Default 200''') parser.add_argument('--flankinglength', action='store', dest='flankinglength', metavar='flankinglength', default='25', type=int, - help='Length of the flanking region adjacent to the repeat\ - element that is used to build repeat psuedogeneomes.\ - The flanking length should be set according to the\ - length of your reads. Default 25') -parser.add_argument('--is_bed', action='store', dest='is_bed', - metavar='is_bed', default='FALSE', - help='''Is the annotation file a bed file. This is also a\ - compatible format. The file needs to be a tab\ - separated bed with optional fields. - Ex. format: - chr\tstart\tend\tName_element\tclass\tfamily. - The class and family should identical to name_element\ - if not applicable. Default FALSE change to TRUE''') + help='''Length of the flanking regions used to build\ + repeat pseudogenomes. Flanking length should be set\ + according to the length of your reads.\ + Default 25, for 50 nt reads''') args = parser.parse_args() -# parameters and paths specified in args_parse +# parameters from argsparse gapl = args.gaplength flankingl = args.flankinglength annotation_file = args.annotation_file genomefasta = args.genomefasta setup_folder = args.setup_folder -nfragmentsfile1 = args.nfragmentsfile1 -is_bed = args.is_bed -############################################################################## # check that the programs we need are available try: subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: - print("Error: Bowtie or BEDTools not loaded") + print("Error: Bowtie not available in the path") raise -############################################################################## # Define a text importer csv.field_size_limit(sys.maxsize) @@ -105,154 +68,67 @@ # Make a setup folder if not os.path.exists(setup_folder): os.makedirs(setup_folder) -############################################################################## -# load genome into dictionary -print("loading genome...") +# load genome into dictionary and compute length g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) +idxgenome, lgenome, genome = {}, {}, {} -print("Precomputing length of all chromosomes...") -idxgenome = {} -lgenome = {} -genome = {} -allchrs = g.keys() -k = 0 -for chr in allchrs: - genome[chr] = str(g[chr].seq) -# del g[chr] +for k, chr in enumerate(g.keys()): + genome[chr] = g[chr].seq lgenome[chr] = len(genome[chr]) idxgenome[chr] = k - k = k + 1 -del g -############################################################################## # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter -if is_bed == "FALSE": - repeat_elements = [] - fout = open(os.path.realpath(setup_folder + os.path.sep - + 'repnames.bed'), 'w') - fin = import_text(annotation_file, ' ') - x = 0 - rep_chr = {} - rep_start = {} - rep_end = {} - x = 0 +repeat_elements = [] +# these dictionaries will contain lists +rep_chr, rep_start, rep_end = {}, {}, {} +fin = import_text(annotation_file, ' ') +with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout: + for i in range(3): + next(fin) for line in fin: - if x > 2: - line9 = line[9].replace("(", "_").replace(")", - "_").replace("/", "_") - repname = line9 - if repname not in repeat_elements: - repeat_elements.append(repname) - repchr = line[4] - repstart = int(line[5]) - repend = int(line[6]) - fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend) - + '\t' + str(repname) + '\n') - if repname in rep_chr: - rep_chr[repname].append(repchr) - rep_start[repname].append(int(repstart)) - rep_end[repname].append(int(repend)) - else: - rep_chr[repname] = [repchr] - rep_start[repname] = [int(repstart)] - rep_end[repname] = [int(repend)] - x += 1 -if is_bed == "TRUE": - repeat_elements = [] - fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), - 'w') - fin = open(os.path.realpath(annotation_file), 'r') - x = 0 - rep_chr = {} - rep_start = {} - rep_end = {} - x = 0 - for line in fin: - line = line.strip('\n') - line = line.split('\t') - line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") - repname = line3 + repname = line[9].translate(str.maketrans('()/', '___')) if repname not in repeat_elements: repeat_elements.append(repname) - repchr = line[0] - repstart = int(line[1]) - repend = int(line[2]) - fout.write(str(repchr) + '\t' + str(repstart) + '\t' + - str(repend) + '\t' + str(repname) + '\n') - # if rep_chr.has_key(repname): + repchr = line[4] + repstart = line[5] + repend = line[6] + fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n') if repname in rep_chr: rep_chr[repname].append(repchr) - rep_start[repname].append(int(repstart)) - rep_end[repname].append(int(repend)) + rep_start[repname].append(repstart) + rep_end[repname].append(repend) else: rep_chr[repname] = [repchr] - rep_start[repname] = [int(repstart)] - rep_end[repname] = [int(repend)] + rep_start[repname] = [repstart] + rep_end[repname] = [repend] -fin.close() -fout.close() -repeat_elements = sorted(repeat_elements) -print("Writing a key for all repeats...") -# print to fout the binary key that contains each repeat type with the -# associated binary number; sort the binary key: -fout = open(os.path.realpath(setup_folder + os.path.sep + - 'repgenomes_key.txt'), 'w') -x = 0 -for repeat in repeat_elements: - # print >> fout, str(repeat) + '\t' + str(x) - fout.write(str(repeat) + '\t' + str(x) + '\n') - x += 1 -fout.close() -############################################################################## -# generate spacer for psuedogenomes -spacer = "" -for i in range(gapl): - spacer = spacer + "N" +# sort repeat_elements and print them in repgenomes_key.txt +with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout: + for i, repeat in enumerate(sorted(repeat_elements)): + fout.write('\t'.join([repeat, str(i)]) + '\n') -# save file with number of fragments processed per repname -print("Saving number of fragments processed per repname to " - + nfragmentsfile1) -fout1 = open(os.path.realpath(nfragmentsfile1), "w") -for repname in rep_chr.keys(): - rep_chr_current = rep_chr[repname] -# print >>fout1, str(len(rep_chr[repname])) + "\t" + repname - fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') -fout1.close() +# generate spacer for pseudogenomes +spacer = ''.join(['N' for i in range(gapl)]) -# generate metagenomes and save them to FASTA files -k = 1 -nrepgenomes = len(rep_chr.keys()) -for repname in rep_chr.keys(): - metagenome = "" - newname = repname.replace("(", "_").replace(")", "_").replace("/", "_") - print("processing repgenome " + newname + ".fa" + " (" + str(k) - + " of " + str(nrepgenomes) + ")") - rep_chr_current = rep_chr[repname] - rep_start_current = rep_start[repname] - rep_end_current = rep_end[repname] - print("-------> " + str(len(rep_chr[repname])) + " fragments") - for i in range(len(rep_chr[repname])): +# generate metagenomes and save them to FASTA files for bowtie build +for repname in rep_chr: + metagenome = '' + for i, repeat in enumerate(rep_chr[repname]): try: - chr = rep_chr_current[i] - rstart = max(rep_start_current[i] - flankingl, 0) - rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) - metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] + chromosome = rep_chr[repname][i] + start = max(int(rep_start[repname][i]) - flankingl, 0) + end = min(int(rep_end[repname][i]) + flankingl, + int(lgenome[chr])-1) + 1 + metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}" except KeyError: - print("Unrecognised Chromosome: "+chr) - pass - # Convert metagenome to SeqRecord object (required by SeqIO.write) - record = SeqRecord(Seq(metagenome), id="repname", - name="", description="") - print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " - + str(nrepgenomes) + ")") - fastafilename = os.path.realpath(setup_folder + os.path.sep - + newname + ".fa") + print("Unrecognised Chromosome: " + rep_chr[repname][i]) + + # Create Fasta of repeat pseudogenome + fastafilename = f"{os.path.join(setup_folder, repname)}.fa" + record = SeqRecord(Seq(metagenome), id=repname, name='', description='') SeqIO.write(record, fastafilename, "fasta") - print("indexing repgenome " + newname + ".fa" + " (" + - str(k) + " of " + str(nrepgenomes) + ")") - command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + - setup_folder + os.path.sep + newname) - p = subprocess.Popen(command).communicate() - k += 1 -print("... Done") + # Generate repeat pseudogenome bowtie index + bowtie_build_cmd = ["bowtie-build", "-f", fastafilename, + os.path.join(setup_folder, repname)] + subprocess.run(bowtie_build_cmd, check=True)