Mercurial > repos > artbio > repenrich
comparison RepEnrich_setup.py @ 0:f6f0f1e5e940 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author | artbio |
---|---|
date | Wed, 02 Aug 2017 05:17:29 -0400 |
parents | |
children | 6bba3e33c2e7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f6f0f1e5e940 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import csv | |
4 import os | |
5 import shlex | |
6 import subprocess | |
7 import sys | |
8 | |
9 from Bio import SeqIO | |
10 from Bio.Alphabet import IUPAC | |
11 from Bio.Seq import Seq | |
12 from Bio.SeqRecord import SeqRecord | |
13 | |
14 parser = argparse.ArgumentParser(description=''' | |
15 Part I: Prepartion of repetive element psuedogenomes and repetive\ | |
16 element bamfiles. This script prepares the annotation used by\ | |
17 downstream applications to analyze for repetitive element\ | |
18 enrichment. For this script to run properly bowtie must be\ | |
19 loaded. The repeat element psuedogenomes are prepared in order\ | |
20 to analyze reads that map to multiple locations of the genome.\ | |
21 The repeat element bamfiles are prepared in order to use a\ | |
22 region sorter to analyze reads that map to a single location\ | |
23 of the genome. You will 1) annotation_file:\ | |
24 The repetitive element annotation file downloaded from\ | |
25 RepeatMasker.org database for your organism of interest.\ | |
26 2) genomefasta: Your genome of interest in fasta format,\ | |
27 3)setup_folder: a folder to contain repeat element setup files\ | |
28 command-line usage | |
29 EXAMPLE: python master_setup.py\ | |
30 /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\ | |
31 /users/nneretti/data/annotation/mm9/mm9.fa\ | |
32 /users/nneretti/data/annotation/mm9/setup_folder''', | |
33 prog='getargs_genome_maker.py') | |
34 parser.add_argument('--version', action='version', version='%(prog)s 0.1') | |
35 parser.add_argument('annotation_file', action='store', | |
36 metavar='annotation_file', | |
37 help='''List annotation file. The annotation file contains\ | |
38 the repeat masker annotation for the genome of\ | |
39 interest and may be downloaded at RepeatMasker.org\ | |
40 Example /data/annotation/mm9/mm9.fa.out''') | |
41 parser.add_argument('genomefasta', action='store', metavar='genomefasta', | |
42 help='''File name and path for genome of interest in fasta\ | |
43 format. Example /data/annotation/mm9/mm9.fa''') | |
44 parser.add_argument('setup_folder', action='store', metavar='setup_folder', | |
45 help='''List folder to contain bamfiles for repeats and\ | |
46 repeat element psuedogenomes.\ | |
47 Example /data/annotation/mm9/setup''') | |
48 parser.add_argument('--nfragmentsfile1', action='store', | |
49 dest='nfragmentsfile1', metavar='nfragmentsfile1', | |
50 default='./repnames_nfragments.txt', | |
51 help='''Output location of a description file that saves\ | |
52 the number of fragments processed per repname. | |
53 Default ./repnames_nfragments.txt''') | |
54 parser.add_argument('--gaplength', action='store', dest='gaplength', | |
55 metavar='gaplength', default='200', type=int, | |
56 help='Length of the spacer used to build\ | |
57 repeat psuedogeneomes. Default 200') | |
58 parser.add_argument('--flankinglength', action='store', dest='flankinglength', | |
59 metavar='flankinglength', default='25', type=int, | |
60 help='Length of the flanking region adjacent to the repeat\ | |
61 element that is used to build repeat psuedogeneomes.\ | |
62 The flanking length should be set according to the\ | |
63 length of your reads. Default 25') | |
64 parser.add_argument('--is_bed', action='store', dest='is_bed', | |
65 metavar='is_bed', default='FALSE', | |
66 help='''Is the annotation file a bed file. This is also a\ | |
67 compatible format. The file needs to be a tab\ | |
68 separated bed with optional fields. | |
69 Ex. format: | |
70 chr\tstart\tend\tName_element\tclass\tfamily. | |
71 The class and family should identical to name_element\ | |
72 if not applicable. Default FALSE change to TRUE''') | |
73 args = parser.parse_args() | |
74 | |
75 # parameters and paths specified in args_parse | |
76 gapl = args.gaplength | |
77 flankingl = args.flankinglength | |
78 annotation_file = args.annotation_file | |
79 genomefasta = args.genomefasta | |
80 setup_folder = args.setup_folder | |
81 nfragmentsfile1 = args.nfragmentsfile1 | |
82 is_bed = args.is_bed | |
83 | |
84 ############################################################################## | |
85 # check that the programs we need are available | |
86 try: | |
87 subprocess.call(shlex.split("bowtie --version"), | |
88 stdout=open(os.devnull, 'wb'), | |
89 stderr=open(os.devnull, 'wb')) | |
90 except OSError: | |
91 print("Error: Bowtie or BEDTools not loaded") | |
92 raise | |
93 | |
94 ############################################################################## | |
95 # Define a text importer | |
96 csv.field_size_limit(sys.maxsize) | |
97 | |
98 | |
99 def import_text(filename, separator): | |
100 for line in csv.reader(open(os.path.realpath(filename)), | |
101 delimiter=separator, skipinitialspace=True): | |
102 if line: | |
103 yield line | |
104 | |
105 | |
106 # Make a setup folder | |
107 if not os.path.exists(setup_folder): | |
108 os.makedirs(setup_folder) | |
109 ############################################################################## | |
110 # load genome into dictionary | |
111 print("loading genome...") | |
112 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | |
113 | |
114 print("Precomputing length of all chromosomes...") | |
115 idxgenome = {} | |
116 lgenome = {} | |
117 genome = {} | |
118 allchrs = g.keys() | |
119 k = 0 | |
120 for chr in allchrs: | |
121 genome[chr] = str(g[chr].seq) | |
122 # del g[chr] | |
123 lgenome[chr] = len(genome[chr]) | |
124 idxgenome[chr] = k | |
125 k = k + 1 | |
126 del g | |
127 | |
128 ############################################################################## | |
129 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | |
130 if is_bed == "FALSE": | |
131 repeat_elements = [] | |
132 fout = open(os.path.realpath(setup_folder + os.path.sep | |
133 + 'repnames.bed'), 'w') | |
134 fin = import_text(annotation_file, ' ') | |
135 x = 0 | |
136 rep_chr = {} | |
137 rep_start = {} | |
138 rep_end = {} | |
139 x = 0 | |
140 for line in fin: | |
141 if x > 2: | |
142 line9 = line[9].replace("(", "_").replace(")", | |
143 "_").replace("/", "_") | |
144 repname = line9 | |
145 if repname not in repeat_elements: | |
146 repeat_elements.append(repname) | |
147 repchr = line[4] | |
148 repstart = int(line[5]) | |
149 repend = int(line[6]) | |
150 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend) | |
151 + '\t' + str(repname) + '\n') | |
152 if repname in rep_chr: | |
153 rep_chr[repname].append(repchr) | |
154 rep_start[repname].append(int(repstart)) | |
155 rep_end[repname].append(int(repend)) | |
156 else: | |
157 rep_chr[repname] = [repchr] | |
158 rep_start[repname] = [int(repstart)] | |
159 rep_end[repname] = [int(repend)] | |
160 x += 1 | |
161 if is_bed == "TRUE": | |
162 repeat_elements = [] | |
163 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), | |
164 'w') | |
165 fin = open(os.path.realpath(annotation_file), 'r') | |
166 x = 0 | |
167 rep_chr = {} | |
168 rep_start = {} | |
169 rep_end = {} | |
170 x = 0 | |
171 for line in fin: | |
172 line = line.strip('\n') | |
173 line = line.split('\t') | |
174 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") | |
175 repname = line3 | |
176 if repname not in repeat_elements: | |
177 repeat_elements.append(repname) | |
178 repchr = line[0] | |
179 repstart = int(line[1]) | |
180 repend = int(line[2]) | |
181 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + | |
182 str(repend) + '\t' + str(repname) + '\n') | |
183 # if rep_chr.has_key(repname): | |
184 if repname in rep_chr: | |
185 rep_chr[repname].append(repchr) | |
186 rep_start[repname].append(int(repstart)) | |
187 rep_end[repname].append(int(repend)) | |
188 else: | |
189 rep_chr[repname] = [repchr] | |
190 rep_start[repname] = [int(repstart)] | |
191 rep_end[repname] = [int(repend)] | |
192 | |
193 fin.close() | |
194 fout.close() | |
195 repeat_elements = sorted(repeat_elements) | |
196 print("Writing a key for all repeats...") | |
197 # print to fout the binary key that contains each repeat type with the | |
198 # associated binary number; sort the binary key: | |
199 fout = open(os.path.realpath(setup_folder + os.path.sep + | |
200 'repgenomes_key.txt'), 'w') | |
201 x = 0 | |
202 for repeat in repeat_elements: | |
203 # print >> fout, str(repeat) + '\t' + str(x) | |
204 fout.write(str(repeat) + '\t' + str(x) + '\n') | |
205 x += 1 | |
206 fout.close() | |
207 ############################################################################## | |
208 # generate spacer for psuedogenomes | |
209 spacer = "" | |
210 for i in range(gapl): | |
211 spacer = spacer + "N" | |
212 | |
213 # save file with number of fragments processed per repname | |
214 print("Saving number of fragments processed per repname to " | |
215 + nfragmentsfile1) | |
216 fout1 = open(os.path.realpath(nfragmentsfile1), "w") | |
217 for repname in rep_chr.keys(): | |
218 rep_chr_current = rep_chr[repname] | |
219 # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname | |
220 fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') | |
221 fout1.close() | |
222 | |
223 # generate metagenomes and save them to FASTA files | |
224 k = 1 | |
225 nrepgenomes = len(rep_chr.keys()) | |
226 for repname in rep_chr.keys(): | |
227 metagenome = "" | |
228 newname = repname.replace("(", "_").replace(")", "_").replace("/", "_") | |
229 print("processing repgenome " + newname + ".fa" + " (" + str(k) | |
230 + " of " + str(nrepgenomes) + ")") | |
231 rep_chr_current = rep_chr[repname] | |
232 rep_start_current = rep_start[repname] | |
233 rep_end_current = rep_end[repname] | |
234 print("-------> " + str(len(rep_chr[repname])) + " fragments") | |
235 for i in range(len(rep_chr[repname])): | |
236 try: | |
237 chr = rep_chr_current[i] | |
238 rstart = max(rep_start_current[i] - flankingl, 0) | |
239 rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) | |
240 metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] | |
241 except KeyError: | |
242 print("Unrecognised Chromosome: "+chr) | |
243 pass | |
244 # Convert metagenome to SeqRecord object (required by SeqIO.write) | |
245 record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id="repname", | |
246 name="", description="") | |
247 print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " | |
248 + str(nrepgenomes) + ")") | |
249 fastafilename = os.path.realpath(setup_folder + os.path.sep | |
250 + newname + ".fa") | |
251 SeqIO.write(record, fastafilename, "fasta") | |
252 print("indexing repgenome " + newname + ".fa" + " (" + | |
253 str(k) + " of " + str(nrepgenomes) + ")") | |
254 command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + | |
255 setup_folder + os.path.sep + newname) | |
256 p = subprocess.Popen(command).communicate() | |
257 k += 1 | |
258 | |
259 print("... Done") |