diff RepEnrich_setup.py @ 0:f6f0f1e5e940 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author artbio
date Wed, 02 Aug 2017 05:17:29 -0400
parents
children 6bba3e33c2e7
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RepEnrich_setup.py	Wed Aug 02 05:17:29 2017 -0400
@@ -0,0 +1,259 @@
+#!/usr/bin/env python
+import argparse
+import csv
+import os
+import shlex
+import subprocess
+import sys
+
+from Bio import SeqIO
+from Bio.Alphabet import IUPAC
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+parser = argparse.ArgumentParser(description='''
+             Part I: Prepartion of repetive element psuedogenomes and repetive\
+             element bamfiles.  This script prepares the annotation used by\
+             downstream applications to analyze for repetitive element\
+             enrichment. For this script to run properly bowtie must be\
+             loaded.  The repeat element psuedogenomes are prepared in order\
+             to analyze reads that map to multiple locations of the genome.\
+             The repeat element bamfiles are prepared in order to use a\
+             region sorter to analyze reads that map to a single location\
+             of the genome. You will 1) annotation_file:\
+             The repetitive element annotation file downloaded from\
+             RepeatMasker.org database for your organism of interest.\
+             2) genomefasta: Your genome of interest in fasta format,\
+             3)setup_folder: a folder to contain repeat element setup files\
+             command-line usage
+             EXAMPLE: python master_setup.py\
+             /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\
+             /users/nneretti/data/annotation/mm9/mm9.fa\
+             /users/nneretti/data/annotation/mm9/setup_folder''',
+                                 prog='getargs_genome_maker.py')
+parser.add_argument('--version', action='version', version='%(prog)s 0.1')
+parser.add_argument('annotation_file', action='store',
+                    metavar='annotation_file',
+                    help='''List annotation file. The annotation file contains\
+                         the repeat masker annotation for the genome of\
+                         interest and may be downloaded at RepeatMasker.org\
+                         Example /data/annotation/mm9/mm9.fa.out''')
+parser.add_argument('genomefasta', action='store', metavar='genomefasta',
+                    help='''File name and path for genome of interest in fasta\
+                         format.  Example /data/annotation/mm9/mm9.fa''')
+parser.add_argument('setup_folder', action='store', metavar='setup_folder',
+                    help='''List folder to contain bamfiles for repeats and\
+                         repeat element psuedogenomes.\
+                         Example /data/annotation/mm9/setup''')
+parser.add_argument('--nfragmentsfile1', action='store',
+                    dest='nfragmentsfile1', metavar='nfragmentsfile1',
+                    default='./repnames_nfragments.txt',
+                    help='''Output location of a description file that saves\
+                         the number of fragments processed per repname.
+                         Default ./repnames_nfragments.txt''')
+parser.add_argument('--gaplength', action='store', dest='gaplength',
+                    metavar='gaplength', default='200', type=int,
+                    help='Length of the spacer used to build\
+                         repeat psuedogeneomes.  Default 200')
+parser.add_argument('--flankinglength', action='store', dest='flankinglength',
+                    metavar='flankinglength', default='25', type=int,
+                    help='Length of the flanking region adjacent to the repeat\
+                         element that is used to build repeat psuedogeneomes.\
+                         The flanking length should be set according to the\
+                         length of your reads.  Default 25')
+parser.add_argument('--is_bed', action='store', dest='is_bed',
+                    metavar='is_bed', default='FALSE',
+                    help='''Is the annotation file a bed file. This is also a\
+                         compatible format. The file needs to be a tab\
+                         separated bed with optional fields.
+                         Ex. format:
+                         chr\tstart\tend\tName_element\tclass\tfamily.
+                         The class and family should identical to name_element\
+                         if not applicable.  Default FALSE change to TRUE''')
+args = parser.parse_args()
+
+# parameters and paths specified in args_parse
+gapl = args.gaplength
+flankingl = args.flankinglength
+annotation_file = args.annotation_file
+genomefasta = args.genomefasta
+setup_folder = args.setup_folder
+nfragmentsfile1 = args.nfragmentsfile1
+is_bed = args.is_bed
+
+##############################################################################
+# check that the programs we need are available
+try:
+    subprocess.call(shlex.split("bowtie --version"),
+                    stdout=open(os.devnull, 'wb'),
+                    stderr=open(os.devnull, 'wb'))
+except OSError:
+    print("Error: Bowtie or BEDTools not loaded")
+    raise
+
+##############################################################################
+# Define a text importer
+csv.field_size_limit(sys.maxsize)
+
+
+def import_text(filename, separator):
+    for line in csv.reader(open(os.path.realpath(filename)),
+                           delimiter=separator, skipinitialspace=True):
+        if line:
+            yield line
+
+
+# Make a setup folder
+if not os.path.exists(setup_folder):
+    os.makedirs(setup_folder)
+##############################################################################
+# load genome into dictionary
+print("loading genome...")
+g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
+
+print("Precomputing length of all chromosomes...")
+idxgenome = {}
+lgenome = {}
+genome = {}
+allchrs = g.keys()
+k = 0
+for chr in allchrs:
+    genome[chr] = str(g[chr].seq)
+#    del g[chr]
+    lgenome[chr] = len(genome[chr])
+    idxgenome[chr] = k
+    k = k + 1
+del g
+
+##############################################################################
+# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
+if is_bed == "FALSE":
+    repeat_elements = []
+    fout = open(os.path.realpath(setup_folder + os.path.sep
+                                 + 'repnames.bed'), 'w')
+    fin = import_text(annotation_file, ' ')
+    x = 0
+    rep_chr = {}
+    rep_start = {}
+    rep_end = {}
+    x = 0
+    for line in fin:
+        if x > 2:
+            line9 = line[9].replace("(", "_").replace(")",
+                                                      "_").replace("/", "_")
+            repname = line9
+            if repname not in repeat_elements:
+                repeat_elements.append(repname)
+            repchr = line[4]
+            repstart = int(line[5])
+            repend = int(line[6])
+            fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend)
+                       + '\t' + str(repname) + '\n')
+            if repname in rep_chr:
+                rep_chr[repname].append(repchr)
+                rep_start[repname].append(int(repstart))
+                rep_end[repname].append(int(repend))
+            else:
+                rep_chr[repname] = [repchr]
+                rep_start[repname] = [int(repstart)]
+                rep_end[repname] = [int(repend)]
+        x += 1
+if is_bed == "TRUE":
+    repeat_elements = []
+    fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'),
+                'w')
+    fin = open(os.path.realpath(annotation_file), 'r')
+    x = 0
+    rep_chr = {}
+    rep_start = {}
+    rep_end = {}
+    x = 0
+    for line in fin:
+        line = line.strip('\n')
+        line = line.split('\t')
+        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
+        repname = line3
+        if repname not in repeat_elements:
+            repeat_elements.append(repname)
+        repchr = line[0]
+        repstart = int(line[1])
+        repend = int(line[2])
+        fout.write(str(repchr) + '\t' + str(repstart) + '\t' +
+                   str(repend) + '\t' + str(repname) + '\n')
+        # if rep_chr.has_key(repname):
+        if repname in rep_chr:
+            rep_chr[repname].append(repchr)
+            rep_start[repname].append(int(repstart))
+            rep_end[repname].append(int(repend))
+        else:
+            rep_chr[repname] = [repchr]
+            rep_start[repname] = [int(repstart)]
+            rep_end[repname] = [int(repend)]
+
+fin.close()
+fout.close()
+repeat_elements = sorted(repeat_elements)
+print("Writing a key for all repeats...")
+# print to fout the binary key that contains each repeat type with the
+# associated binary number; sort the binary key:
+fout = open(os.path.realpath(setup_folder + os.path.sep +
+                             'repgenomes_key.txt'), 'w')
+x = 0
+for repeat in repeat_elements:
+    # print >> fout, str(repeat) + '\t' + str(x)
+    fout.write(str(repeat) + '\t' + str(x) + '\n')
+    x += 1
+fout.close()
+##############################################################################
+# generate spacer for psuedogenomes
+spacer = ""
+for i in range(gapl):
+    spacer = spacer + "N"
+
+# save file with number of fragments processed per repname
+print("Saving number of fragments processed per repname to "
+      + nfragmentsfile1)
+fout1 = open(os.path.realpath(nfragmentsfile1), "w")
+for repname in rep_chr.keys():
+    rep_chr_current = rep_chr[repname]
+#    print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
+    fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
+fout1.close()
+
+# generate metagenomes and save them to FASTA files
+k = 1
+nrepgenomes = len(rep_chr.keys())
+for repname in rep_chr.keys():
+    metagenome = ""
+    newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")
+    print("processing repgenome " + newname + ".fa" + " (" + str(k)
+          + " of " + str(nrepgenomes) + ")")
+    rep_chr_current = rep_chr[repname]
+    rep_start_current = rep_start[repname]
+    rep_end_current = rep_end[repname]
+    print("-------> " + str(len(rep_chr[repname])) + " fragments")
+    for i in range(len(rep_chr[repname])):
+        try:
+            chr = rep_chr_current[i]
+            rstart = max(rep_start_current[i] - flankingl, 0)
+            rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)
+            metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]
+        except KeyError:
+            print("Unrecognised Chromosome: "+chr)
+            pass
+    # Convert metagenome to SeqRecord object (required by SeqIO.write)
+    record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id="repname",
+                       name="", description="")
+    print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
+          + str(nrepgenomes) + ")")
+    fastafilename = os.path.realpath(setup_folder + os.path.sep
+                                     + newname + ".fa")
+    SeqIO.write(record, fastafilename, "fasta")
+    print("indexing repgenome " + newname + ".fa" + " (" +
+          str(k) + " of " + str(nrepgenomes) + ")")
+    command = shlex.split('bowtie-build -f ' + fastafilename + ' ' +
+                          setup_folder + os.path.sep + newname)
+    p = subprocess.Popen(command).communicate()
+    k += 1
+
+print("... Done")