comparison RepEnrich_setup.py @ 0:1435d142041b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit d5ebd581fa3a22ca61ce07a31c01bb70610fbcf5
author drosofff
date Tue, 23 May 2017 18:37:22 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1435d142041b
1 #!/usr/bin/env python
2 import argparse
3 import csv
4 import os
5 import shlex
6 import subprocess
7 import sys
8 from Bio import SeqIO
9 from Bio.Seq import Seq
10 from Bio.SeqRecord import SeqRecord
11 from Bio.Alphabet import IUPAC
12
13 parser = argparse.ArgumentParser(description='Part I: Prepartion of repetive element psuedogenomes and repetive element bamfiles. This script prepares the annotation used by downstream applications to analyze for repetitive element enrichment. For this script to run properly bowtie must be loaded. The repeat element psuedogenomes are prepared in order to analyze reads that map to multiple locations of the genome. The repeat element bamfiles are prepared in order to use a region sorter to analyze reads that map to a single location of the genome.You will 1) annotation_file: The repetitive element annotation file downloaded from RepeatMasker.org database for your organism of interest. 2) genomefasta: Your genome of interest in fasta format, 3)setup_folder: a folder to contain repeat element setup files command-line usage EXAMPLE: python master_setup.py /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt /users/nneretti/data/annotation/mm9/mm9.fa /users/nneretti/data/annotation/mm9/setup_folder', prog='getargs_genome_maker.py')
14 parser.add_argument('--version', action='version', version='%(prog)s 0.1')
15 parser.add_argument('annotation_file', action= 'store', metavar='annotation_file', help='List annotation file. The annotation file contains the repeat masker annotation for the genome of interest and may be downloaded at RepeatMasker.org Example /data/annotation/mm9/mm9.fa.out')
16 parser.add_argument('genomefasta', action= 'store', metavar='genomefasta', help='File name and path for genome of interest in fasta format. Example /data/annotation/mm9/mm9.fa')
17 parser.add_argument('setup_folder', action= 'store', metavar='setup_folder', help='List folder to contain bamfiles for repeats and repeat element psuedogenomes. Example /data/annotation/mm9/setup')
18 parser.add_argument('--nfragmentsfile1', action= 'store', dest='nfragmentsfile1', metavar='nfragmentsfile1', default='./repnames_nfragments.txt', help='Output location of a description file that saves the number of fragments processed per repname. Default ./repnames_nfragments.txt')
19 parser.add_argument('--gaplength', action= 'store', dest='gaplength', metavar='gaplength', default= '200', type=int, help='Length of the spacer used to build repeat psuedogeneomes. Default 200')
20 parser.add_argument('--flankinglength', action= 'store', dest='flankinglength', metavar='flankinglength', default= '25', type=int, help='Length of the flanking region adjacent to the repeat element that is used to build repeat psuedogeneomes. The flanking length should be set according to the length of your reads. Default 25')
21 parser.add_argument('--is_bed', action= 'store', dest='is_bed', metavar='is_bed', default= 'FALSE', help='Is the annotation file a bed file. This is also a compatible format. The file needs to be a tab seperated bed with optional fields. Ex. format chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element if not applicable. Default FALSE change to TRUE')
22 args = parser.parse_args()
23
24 # parameters and paths specified in args_parse
25 gapl = args.gaplength
26 flankingl = args.flankinglength
27 annotation_file = args.annotation_file
28 genomefasta = args.genomefasta
29 setup_folder = args.setup_folder
30 nfragmentsfile1 = args.nfragmentsfile1
31 is_bed = args.is_bed
32
33 ################################################################################
34 # check that the programs we need are available
35 try:
36 subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb'))
37 except OSError:
38 print ("Error: Bowtie or BEDTools not loaded")
39 raise
40
41 ################################################################################
42 # Define a text importer
43 csv.field_size_limit(sys.maxsize)
44 def import_text(filename, separator):
45 for line in csv.reader(open(os.path.realpath(filename)), delimiter=separator,
46 skipinitialspace=True):
47 if line:
48 yield line
49 # Make a setup folder
50 if not os.path.exists(setup_folder):
51 os.makedirs(setup_folder)
52
53 ################################################################################
54 # load genome into dictionary
55 print ("loading genome...")
56 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
57
58 print ("Precomputing length of all chromosomes...")
59 idxgenome = {}
60 lgenome = {}
61 genome = {}
62 allchrs = g.keys()
63 k = 0
64 for chr in allchrs:
65 genome[chr] = str(g[chr].seq)
66 # del g[chr]
67 lgenome[chr] = len(genome[chr])
68 idxgenome[chr] = k
69 k = k + 1
70 del g
71
72 ################################################################################
73 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
74 if is_bed == "FALSE":
75 repeat_elements= []
76 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w')
77 fin = import_text(annotation_file, ' ')
78 x = 0
79 rep_chr = {}
80 rep_start = {}
81 rep_end = {}
82 x = 0
83 for line in fin:
84 if x>2:
85 line9 = line[9].replace("(","_").replace(")","_").replace("/","_")
86 repname = line9
87 if not repname in repeat_elements:
88 repeat_elements.append(repname)
89 repchr = line[4]
90 repstart = int(line[5])
91 repend = int(line[6])
92 # print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname)
93 fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname)+ '\n')
94 # if rep_chr.has_key(repname):
95 if repname in rep_chr:
96 rep_chr[repname].append(repchr)
97 rep_start[repname].append(int(repstart))
98 rep_end[repname].append(int(repend))
99 else:
100 rep_chr[repname] = [repchr]
101 rep_start[repname] = [int(repstart)]
102 rep_end[repname] = [int(repend)]
103 x +=1
104 if is_bed == "TRUE":
105 repeat_elements= []
106 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w')
107 fin = open(os.path.realpath(annotation_file), 'r')
108 x =0
109 rep_chr = {}
110 rep_start = {}
111 rep_end = {}
112 x =0
113 for line in fin:
114 line=line.strip('\n')
115 line=line.split('\t')
116 line3 = line[3].replace("(","_").replace(")","_").replace("/","_")
117 repname = line3
118 if not repname in repeat_elements:
119 repeat_elements.append(repname)
120 repchr = line[0]
121 repstart = int(line[1])
122 repend = int(line[2])
123 # print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname)
124 fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + '\n')
125 # if rep_chr.has_key(repname):
126 if repname in rep_chr:
127 rep_chr[repname].append(repchr)
128 rep_start[repname].append(int(repstart))
129 rep_end[repname].append(int(repend))
130 else:
131 rep_chr[repname] = [repchr]
132 rep_start[repname] = [int(repstart)]
133 rep_end[repname] = [int(repend)]
134
135 fin.close()
136 fout.close()
137 repeat_elements = sorted(repeat_elements)
138 print ("Writing a key for all repeats...")
139 #print to fout the binary key that contains each repeat type with the associated binary number; sort the binary key:
140 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repgenomes_key.txt'), 'w')
141 x = 0
142 for repeat in repeat_elements:
143 # print >> fout, str(repeat) + '\t' + str(x)
144 fout.write(str(repeat) + '\t' + str(x) + '\n')
145 x +=1
146 fout.close()
147 ################################################################################
148 # generate spacer for psuedogenomes
149 spacer = ""
150 for i in range(gapl):
151 spacer = spacer + "N"
152
153 # save file with number of fragments processed per repname
154 print ("Saving number of fragments processed per repname to " + nfragmentsfile1)
155 fout1 = open(os.path.realpath(nfragmentsfile1),"w")
156 for repname in rep_chr.keys():
157 rep_chr_current = rep_chr[repname]
158 # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
159 fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
160 fout1.close()
161
162 # generate metagenomes and save them to FASTA files
163 k = 1
164 nrepgenomes = len(rep_chr.keys())
165 for repname in rep_chr.keys():
166 metagenome = ""
167 newname = repname.replace("(","_").replace(")","_").replace("/","_")
168 print ("processing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")")
169 rep_chr_current = rep_chr[repname]
170 rep_start_current = rep_start[repname]
171 rep_end_current = rep_end[repname]
172 print ("-------> " + str(len(rep_chr[repname])) + " fragments")
173 for i in range(len(rep_chr[repname])):
174 try:
175 chr = rep_chr_current[i]
176 rstart = max(rep_start_current[i] - flankingl, 0)
177 rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)
178 metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]
179 except KeyError:
180 print ("Unrecognised Chromosome: "+chr)
181 pass
182
183 # Convert metagenome to SeqRecord object (required by SeqIO.write)
184 record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id = "repname", name = "", description = "")
185 print ("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")")
186 fastafilename = os.path.realpath(setup_folder + os.path.sep + newname + ".fa")
187 SeqIO.write(record, fastafilename, "fasta")
188 print ("indexing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")")
189 command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + setup_folder + os.path.sep + newname)
190 p = subprocess.Popen(command).communicate()
191 k += 1
192
193 print ("... Done")
194