diff RepEnrich_setup.py @ 12:89e05f831259 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author artbio
date Mon, 18 Mar 2024 09:39:44 +0000
parents 6bba3e33c2e7
children 530626b0757c
line wrap: on
line diff
--- a/RepEnrich_setup.py	Sat Mar 09 22:32:46 2024 +0000
+++ b/RepEnrich_setup.py	Mon Mar 18 09:39:44 2024 +0000
@@ -11,86 +11,49 @@
 from Bio.SeqRecord import SeqRecord
 
 parser = argparse.ArgumentParser(description='''
-             Part I: Prepartion of repetive element psuedogenomes and repetive\
-             element bamfiles.  This script prepares the annotation used by\
-             downstream applications to analyze for repetitive element\
-             enrichment. For this script to run properly bowtie must be\
-             loaded.  The repeat element psuedogenomes are prepared in order\
-             to analyze reads that map to multiple locations of the genome.\
-             The repeat element bamfiles are prepared in order to use a\
-             region sorter to analyze reads that map to a single location\
-             of the genome. You will 1) annotation_file:\
-             The repetitive element annotation file downloaded from\
-             RepeatMasker.org database for your organism of interest.\
-             2) genomefasta: Your genome of interest in fasta format,\
-             3)setup_folder: a folder to contain repeat element setup files\
-             command-line usage
-             EXAMPLE: python master_setup.py\
-             /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\
-             /users/nneretti/data/annotation/mm9/mm9.fa\
-             /users/nneretti/data/annotation/mm9/setup_folder''',
+             Prepartion of repetive element pseudogenomes bowtie\
+             indexes and annotation files used by RepEnrich.py enrichment.''',
                                  prog='getargs_genome_maker.py')
-parser.add_argument('--version', action='version', version='%(prog)s 0.1')
-parser.add_argument('annotation_file', action='store',
+parser.add_argument('--annotation_file', action='store',
                     metavar='annotation_file',
-                    help='''List annotation file. The annotation file contains\
-                         the repeat masker annotation for the genome of\
-                         interest and may be downloaded at RepeatMasker.org\
-                         Example /data/annotation/mm9/mm9.fa.out''')
-parser.add_argument('genomefasta', action='store', metavar='genomefasta',
-                    help='''File name and path for genome of interest in fasta\
-                         format.  Example /data/annotation/mm9/mm9.fa''')
-parser.add_argument('setup_folder', action='store', metavar='setup_folder',
-                    help='''List folder to contain bamfiles for repeats and\
+                    help='''Repeat masker annotation of the genome of\
+                         interest. Download from RepeatMasker.org\
+                         Example: mm9.fa.out''')
+parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
+                    help='''Genome of interest in fasta format.\
+                         Example: mm9.fa''')
+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
+                    help='''Folder that contains bowtie indexes of repeats and\
                          repeat element psuedogenomes.\
-                         Example /data/annotation/mm9/setup''')
-parser.add_argument('--nfragmentsfile1', action='store',
-                    dest='nfragmentsfile1', metavar='nfragmentsfile1',
-                    default='./repnames_nfragments.txt',
-                    help='''Output location of a description file that saves\
-                         the number of fragments processed per repname.
-                         Default ./repnames_nfragments.txt''')
+                         Example working/setup''')
 parser.add_argument('--gaplength', action='store', dest='gaplength',
                     metavar='gaplength', default='200', type=int,
-                    help='Length of the spacer used to build\
-                         repeat psuedogeneomes.  Default 200')
+                    help='''Length of the N-spacer in the\
+                         repeat pseudogenomes.  Default 200''')
 parser.add_argument('--flankinglength', action='store', dest='flankinglength',
                     metavar='flankinglength', default='25', type=int,
-                    help='Length of the flanking region adjacent to the repeat\
-                         element that is used to build repeat psuedogeneomes.\
-                         The flanking length should be set according to the\
-                         length of your reads.  Default 25')
-parser.add_argument('--is_bed', action='store', dest='is_bed',
-                    metavar='is_bed', default='FALSE',
-                    help='''Is the annotation file a bed file. This is also a\
-                         compatible format. The file needs to be a tab\
-                         separated bed with optional fields.
-                         Ex. format:
-                         chr\tstart\tend\tName_element\tclass\tfamily.
-                         The class and family should identical to name_element\
-                         if not applicable.  Default FALSE change to TRUE''')
+                    help='''Length of the flanking regions used to build\
+                         repeat pseudogenomes. Flanking length should be set\
+                         according to the length of your reads.\
+                         Default 25, for 50 nt reads''')
 args = parser.parse_args()
 
-# parameters and paths specified in args_parse
+# parameters from argsparse
 gapl = args.gaplength
 flankingl = args.flankinglength
 annotation_file = args.annotation_file
 genomefasta = args.genomefasta
 setup_folder = args.setup_folder
-nfragmentsfile1 = args.nfragmentsfile1
-is_bed = args.is_bed
 
-##############################################################################
 # check that the programs we need are available
 try:
     subprocess.call(shlex.split("bowtie --version"),
                     stdout=open(os.devnull, 'wb'),
                     stderr=open(os.devnull, 'wb'))
 except OSError:
-    print("Error: Bowtie or BEDTools not loaded")
+    print("Error: Bowtie not available in the path")
     raise
 
-##############################################################################
 # Define a text importer
 csv.field_size_limit(sys.maxsize)
 
@@ -105,154 +68,67 @@
 # Make a setup folder
 if not os.path.exists(setup_folder):
     os.makedirs(setup_folder)
-##############################################################################
-# load genome into dictionary
-print("loading genome...")
+# load genome into dictionary and compute length
 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
+idxgenome, lgenome, genome = {}, {}, {}
 
-print("Precomputing length of all chromosomes...")
-idxgenome = {}
-lgenome = {}
-genome = {}
-allchrs = g.keys()
-k = 0
-for chr in allchrs:
-    genome[chr] = str(g[chr].seq)
-#    del g[chr]
+for k, chr in enumerate(g.keys()):
+    genome[chr] = g[chr].seq
     lgenome[chr] = len(genome[chr])
     idxgenome[chr] = k
-    k = k + 1
-del g
 
-##############################################################################
 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
-if is_bed == "FALSE":
-    repeat_elements = []
-    fout = open(os.path.realpath(setup_folder + os.path.sep
-                                 + 'repnames.bed'), 'w')
-    fin = import_text(annotation_file, ' ')
-    x = 0
-    rep_chr = {}
-    rep_start = {}
-    rep_end = {}
-    x = 0
+repeat_elements = []
+# these dictionaries will contain lists
+rep_chr, rep_start, rep_end = {}, {}, {}
+fin = import_text(annotation_file, ' ')
+with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout:
+    for i in range(3):
+        next(fin)
     for line in fin:
-        if x > 2:
-            line9 = line[9].replace("(", "_").replace(")",
-                                                      "_").replace("/", "_")
-            repname = line9
-            if repname not in repeat_elements:
-                repeat_elements.append(repname)
-            repchr = line[4]
-            repstart = int(line[5])
-            repend = int(line[6])
-            fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend)
-                       + '\t' + str(repname) + '\n')
-            if repname in rep_chr:
-                rep_chr[repname].append(repchr)
-                rep_start[repname].append(int(repstart))
-                rep_end[repname].append(int(repend))
-            else:
-                rep_chr[repname] = [repchr]
-                rep_start[repname] = [int(repstart)]
-                rep_end[repname] = [int(repend)]
-        x += 1
-if is_bed == "TRUE":
-    repeat_elements = []
-    fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'),
-                'w')
-    fin = open(os.path.realpath(annotation_file), 'r')
-    x = 0
-    rep_chr = {}
-    rep_start = {}
-    rep_end = {}
-    x = 0
-    for line in fin:
-        line = line.strip('\n')
-        line = line.split('\t')
-        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
-        repname = line3
+        repname = line[9].translate(str.maketrans('()/', '___'))
         if repname not in repeat_elements:
             repeat_elements.append(repname)
-        repchr = line[0]
-        repstart = int(line[1])
-        repend = int(line[2])
-        fout.write(str(repchr) + '\t' + str(repstart) + '\t' +
-                   str(repend) + '\t' + str(repname) + '\n')
-        # if rep_chr.has_key(repname):
+        repchr = line[4]
+        repstart = line[5]
+        repend = line[6]
+        fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n')
         if repname in rep_chr:
             rep_chr[repname].append(repchr)
-            rep_start[repname].append(int(repstart))
-            rep_end[repname].append(int(repend))
+            rep_start[repname].append(repstart)
+            rep_end[repname].append(repend)
         else:
             rep_chr[repname] = [repchr]
-            rep_start[repname] = [int(repstart)]
-            rep_end[repname] = [int(repend)]
+            rep_start[repname] = [repstart]
+            rep_end[repname] = [repend]
 
-fin.close()
-fout.close()
-repeat_elements = sorted(repeat_elements)
-print("Writing a key for all repeats...")
-# print to fout the binary key that contains each repeat type with the
-# associated binary number; sort the binary key:
-fout = open(os.path.realpath(setup_folder + os.path.sep +
-                             'repgenomes_key.txt'), 'w')
-x = 0
-for repeat in repeat_elements:
-    # print >> fout, str(repeat) + '\t' + str(x)
-    fout.write(str(repeat) + '\t' + str(x) + '\n')
-    x += 1
-fout.close()
-##############################################################################
-# generate spacer for psuedogenomes
-spacer = ""
-for i in range(gapl):
-    spacer = spacer + "N"
+# sort repeat_elements and print them in repgenomes_key.txt
+with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout:
+    for i, repeat in enumerate(sorted(repeat_elements)):
+        fout.write('\t'.join([repeat, str(i)]) + '\n')
 
-# save file with number of fragments processed per repname
-print("Saving number of fragments processed per repname to "
-      + nfragmentsfile1)
-fout1 = open(os.path.realpath(nfragmentsfile1), "w")
-for repname in rep_chr.keys():
-    rep_chr_current = rep_chr[repname]
-#    print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
-    fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
-fout1.close()
+# generate spacer for pseudogenomes
+spacer = ''.join(['N' for i in range(gapl)])
 
-# generate metagenomes and save them to FASTA files
-k = 1
-nrepgenomes = len(rep_chr.keys())
-for repname in rep_chr.keys():
-    metagenome = ""
-    newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")
-    print("processing repgenome " + newname + ".fa" + " (" + str(k)
-          + " of " + str(nrepgenomes) + ")")
-    rep_chr_current = rep_chr[repname]
-    rep_start_current = rep_start[repname]
-    rep_end_current = rep_end[repname]
-    print("-------> " + str(len(rep_chr[repname])) + " fragments")
-    for i in range(len(rep_chr[repname])):
+# generate metagenomes and save them to FASTA files for bowtie build
+for repname in rep_chr:
+    metagenome = ''
+    for i, repeat in enumerate(rep_chr[repname]):
         try:
-            chr = rep_chr_current[i]
-            rstart = max(rep_start_current[i] - flankingl, 0)
-            rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)
-            metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]
+            chromosome = rep_chr[repname][i]
+            start = max(int(rep_start[repname][i]) - flankingl, 0)
+            end = min(int(rep_end[repname][i]) + flankingl,
+                      int(lgenome[chr])-1) + 1
+            metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}"
         except KeyError:
-            print("Unrecognised Chromosome: "+chr)
-            pass
-    # Convert metagenome to SeqRecord object (required by SeqIO.write)
-    record = SeqRecord(Seq(metagenome), id="repname",
-                       name="", description="")
-    print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
-          + str(nrepgenomes) + ")")
-    fastafilename = os.path.realpath(setup_folder + os.path.sep
-                                     + newname + ".fa")
+            print("Unrecognised Chromosome: " + rep_chr[repname][i])
+
+    # Create Fasta of repeat pseudogenome
+    fastafilename = f"{os.path.join(setup_folder, repname)}.fa"
+    record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
     SeqIO.write(record, fastafilename, "fasta")
-    print("indexing repgenome " + newname + ".fa" + " (" +
-          str(k) + " of " + str(nrepgenomes) + ")")
-    command = shlex.split('bowtie-build -f ' + fastafilename + ' ' +
-                          setup_folder + os.path.sep + newname)
-    p = subprocess.Popen(command).communicate()
-    k += 1
 
-print("... Done")
+    # Generate repeat pseudogenome bowtie index
+    bowtie_build_cmd = ["bowtie-build", "-f", fastafilename,
+                        os.path.join(setup_folder, repname)]
+    subprocess.run(bowtie_build_cmd, check=True)