Mercurial > repos > artbio > repenrich
comparison RepEnrich_setup.py @ 12:89e05f831259 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author | artbio |
---|---|
date | Mon, 18 Mar 2024 09:39:44 +0000 |
parents | 6bba3e33c2e7 |
children | 530626b0757c |
comparison
equal
deleted
inserted
replaced
11:6bba3e33c2e7 | 12:89e05f831259 |
---|---|
9 from Bio import SeqIO | 9 from Bio import SeqIO |
10 from Bio.Seq import Seq | 10 from Bio.Seq import Seq |
11 from Bio.SeqRecord import SeqRecord | 11 from Bio.SeqRecord import SeqRecord |
12 | 12 |
13 parser = argparse.ArgumentParser(description=''' | 13 parser = argparse.ArgumentParser(description=''' |
14 Part I: Prepartion of repetive element psuedogenomes and repetive\ | 14 Prepartion of repetive element pseudogenomes bowtie\ |
15 element bamfiles. This script prepares the annotation used by\ | 15 indexes and annotation files used by RepEnrich.py enrichment.''', |
16 downstream applications to analyze for repetitive element\ | |
17 enrichment. For this script to run properly bowtie must be\ | |
18 loaded. The repeat element psuedogenomes are prepared in order\ | |
19 to analyze reads that map to multiple locations of the genome.\ | |
20 The repeat element bamfiles are prepared in order to use a\ | |
21 region sorter to analyze reads that map to a single location\ | |
22 of the genome. You will 1) annotation_file:\ | |
23 The repetitive element annotation file downloaded from\ | |
24 RepeatMasker.org database for your organism of interest.\ | |
25 2) genomefasta: Your genome of interest in fasta format,\ | |
26 3)setup_folder: a folder to contain repeat element setup files\ | |
27 command-line usage | |
28 EXAMPLE: python master_setup.py\ | |
29 /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\ | |
30 /users/nneretti/data/annotation/mm9/mm9.fa\ | |
31 /users/nneretti/data/annotation/mm9/setup_folder''', | |
32 prog='getargs_genome_maker.py') | 16 prog='getargs_genome_maker.py') |
33 parser.add_argument('--version', action='version', version='%(prog)s 0.1') | 17 parser.add_argument('--annotation_file', action='store', |
34 parser.add_argument('annotation_file', action='store', | |
35 metavar='annotation_file', | 18 metavar='annotation_file', |
36 help='''List annotation file. The annotation file contains\ | 19 help='''Repeat masker annotation of the genome of\ |
37 the repeat masker annotation for the genome of\ | 20 interest. Download from RepeatMasker.org\ |
38 interest and may be downloaded at RepeatMasker.org\ | 21 Example: mm9.fa.out''') |
39 Example /data/annotation/mm9/mm9.fa.out''') | 22 parser.add_argument('--genomefasta', action='store', metavar='genomefasta', |
40 parser.add_argument('genomefasta', action='store', metavar='genomefasta', | 23 help='''Genome of interest in fasta format.\ |
41 help='''File name and path for genome of interest in fasta\ | 24 Example: mm9.fa''') |
42 format. Example /data/annotation/mm9/mm9.fa''') | 25 parser.add_argument('--setup_folder', action='store', metavar='setup_folder', |
43 parser.add_argument('setup_folder', action='store', metavar='setup_folder', | 26 help='''Folder that contains bowtie indexes of repeats and\ |
44 help='''List folder to contain bamfiles for repeats and\ | |
45 repeat element psuedogenomes.\ | 27 repeat element psuedogenomes.\ |
46 Example /data/annotation/mm9/setup''') | 28 Example working/setup''') |
47 parser.add_argument('--nfragmentsfile1', action='store', | |
48 dest='nfragmentsfile1', metavar='nfragmentsfile1', | |
49 default='./repnames_nfragments.txt', | |
50 help='''Output location of a description file that saves\ | |
51 the number of fragments processed per repname. | |
52 Default ./repnames_nfragments.txt''') | |
53 parser.add_argument('--gaplength', action='store', dest='gaplength', | 29 parser.add_argument('--gaplength', action='store', dest='gaplength', |
54 metavar='gaplength', default='200', type=int, | 30 metavar='gaplength', default='200', type=int, |
55 help='Length of the spacer used to build\ | 31 help='''Length of the N-spacer in the\ |
56 repeat psuedogeneomes. Default 200') | 32 repeat pseudogenomes. Default 200''') |
57 parser.add_argument('--flankinglength', action='store', dest='flankinglength', | 33 parser.add_argument('--flankinglength', action='store', dest='flankinglength', |
58 metavar='flankinglength', default='25', type=int, | 34 metavar='flankinglength', default='25', type=int, |
59 help='Length of the flanking region adjacent to the repeat\ | 35 help='''Length of the flanking regions used to build\ |
60 element that is used to build repeat psuedogeneomes.\ | 36 repeat pseudogenomes. Flanking length should be set\ |
61 The flanking length should be set according to the\ | 37 according to the length of your reads.\ |
62 length of your reads. Default 25') | 38 Default 25, for 50 nt reads''') |
63 parser.add_argument('--is_bed', action='store', dest='is_bed', | |
64 metavar='is_bed', default='FALSE', | |
65 help='''Is the annotation file a bed file. This is also a\ | |
66 compatible format. The file needs to be a tab\ | |
67 separated bed with optional fields. | |
68 Ex. format: | |
69 chr\tstart\tend\tName_element\tclass\tfamily. | |
70 The class and family should identical to name_element\ | |
71 if not applicable. Default FALSE change to TRUE''') | |
72 args = parser.parse_args() | 39 args = parser.parse_args() |
73 | 40 |
74 # parameters and paths specified in args_parse | 41 # parameters from argsparse |
75 gapl = args.gaplength | 42 gapl = args.gaplength |
76 flankingl = args.flankinglength | 43 flankingl = args.flankinglength |
77 annotation_file = args.annotation_file | 44 annotation_file = args.annotation_file |
78 genomefasta = args.genomefasta | 45 genomefasta = args.genomefasta |
79 setup_folder = args.setup_folder | 46 setup_folder = args.setup_folder |
80 nfragmentsfile1 = args.nfragmentsfile1 | |
81 is_bed = args.is_bed | |
82 | 47 |
83 ############################################################################## | |
84 # check that the programs we need are available | 48 # check that the programs we need are available |
85 try: | 49 try: |
86 subprocess.call(shlex.split("bowtie --version"), | 50 subprocess.call(shlex.split("bowtie --version"), |
87 stdout=open(os.devnull, 'wb'), | 51 stdout=open(os.devnull, 'wb'), |
88 stderr=open(os.devnull, 'wb')) | 52 stderr=open(os.devnull, 'wb')) |
89 except OSError: | 53 except OSError: |
90 print("Error: Bowtie or BEDTools not loaded") | 54 print("Error: Bowtie not available in the path") |
91 raise | 55 raise |
92 | 56 |
93 ############################################################################## | |
94 # Define a text importer | 57 # Define a text importer |
95 csv.field_size_limit(sys.maxsize) | 58 csv.field_size_limit(sys.maxsize) |
96 | 59 |
97 | 60 |
98 def import_text(filename, separator): | 61 def import_text(filename, separator): |
103 | 66 |
104 | 67 |
105 # Make a setup folder | 68 # Make a setup folder |
106 if not os.path.exists(setup_folder): | 69 if not os.path.exists(setup_folder): |
107 os.makedirs(setup_folder) | 70 os.makedirs(setup_folder) |
108 ############################################################################## | 71 # load genome into dictionary and compute length |
109 # load genome into dictionary | |
110 print("loading genome...") | |
111 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 72 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) |
73 idxgenome, lgenome, genome = {}, {}, {} | |
112 | 74 |
113 print("Precomputing length of all chromosomes...") | 75 for k, chr in enumerate(g.keys()): |
114 idxgenome = {} | 76 genome[chr] = g[chr].seq |
115 lgenome = {} | |
116 genome = {} | |
117 allchrs = g.keys() | |
118 k = 0 | |
119 for chr in allchrs: | |
120 genome[chr] = str(g[chr].seq) | |
121 # del g[chr] | |
122 lgenome[chr] = len(genome[chr]) | 77 lgenome[chr] = len(genome[chr]) |
123 idxgenome[chr] = k | 78 idxgenome[chr] = k |
124 k = k + 1 | |
125 del g | |
126 | 79 |
127 ############################################################################## | |
128 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 80 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter |
129 if is_bed == "FALSE": | 81 repeat_elements = [] |
130 repeat_elements = [] | 82 # these dictionaries will contain lists |
131 fout = open(os.path.realpath(setup_folder + os.path.sep | 83 rep_chr, rep_start, rep_end = {}, {}, {} |
132 + 'repnames.bed'), 'w') | 84 fin = import_text(annotation_file, ' ') |
133 fin = import_text(annotation_file, ' ') | 85 with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout: |
134 x = 0 | 86 for i in range(3): |
135 rep_chr = {} | 87 next(fin) |
136 rep_start = {} | |
137 rep_end = {} | |
138 x = 0 | |
139 for line in fin: | 88 for line in fin: |
140 if x > 2: | 89 repname = line[9].translate(str.maketrans('()/', '___')) |
141 line9 = line[9].replace("(", "_").replace(")", | |
142 "_").replace("/", "_") | |
143 repname = line9 | |
144 if repname not in repeat_elements: | |
145 repeat_elements.append(repname) | |
146 repchr = line[4] | |
147 repstart = int(line[5]) | |
148 repend = int(line[6]) | |
149 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend) | |
150 + '\t' + str(repname) + '\n') | |
151 if repname in rep_chr: | |
152 rep_chr[repname].append(repchr) | |
153 rep_start[repname].append(int(repstart)) | |
154 rep_end[repname].append(int(repend)) | |
155 else: | |
156 rep_chr[repname] = [repchr] | |
157 rep_start[repname] = [int(repstart)] | |
158 rep_end[repname] = [int(repend)] | |
159 x += 1 | |
160 if is_bed == "TRUE": | |
161 repeat_elements = [] | |
162 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), | |
163 'w') | |
164 fin = open(os.path.realpath(annotation_file), 'r') | |
165 x = 0 | |
166 rep_chr = {} | |
167 rep_start = {} | |
168 rep_end = {} | |
169 x = 0 | |
170 for line in fin: | |
171 line = line.strip('\n') | |
172 line = line.split('\t') | |
173 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") | |
174 repname = line3 | |
175 if repname not in repeat_elements: | 90 if repname not in repeat_elements: |
176 repeat_elements.append(repname) | 91 repeat_elements.append(repname) |
177 repchr = line[0] | 92 repchr = line[4] |
178 repstart = int(line[1]) | 93 repstart = line[5] |
179 repend = int(line[2]) | 94 repend = line[6] |
180 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + | 95 fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n') |
181 str(repend) + '\t' + str(repname) + '\n') | |
182 # if rep_chr.has_key(repname): | |
183 if repname in rep_chr: | 96 if repname in rep_chr: |
184 rep_chr[repname].append(repchr) | 97 rep_chr[repname].append(repchr) |
185 rep_start[repname].append(int(repstart)) | 98 rep_start[repname].append(repstart) |
186 rep_end[repname].append(int(repend)) | 99 rep_end[repname].append(repend) |
187 else: | 100 else: |
188 rep_chr[repname] = [repchr] | 101 rep_chr[repname] = [repchr] |
189 rep_start[repname] = [int(repstart)] | 102 rep_start[repname] = [repstart] |
190 rep_end[repname] = [int(repend)] | 103 rep_end[repname] = [repend] |
191 | 104 |
192 fin.close() | 105 # sort repeat_elements and print them in repgenomes_key.txt |
193 fout.close() | 106 with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout: |
194 repeat_elements = sorted(repeat_elements) | 107 for i, repeat in enumerate(sorted(repeat_elements)): |
195 print("Writing a key for all repeats...") | 108 fout.write('\t'.join([repeat, str(i)]) + '\n') |
196 # print to fout the binary key that contains each repeat type with the | |
197 # associated binary number; sort the binary key: | |
198 fout = open(os.path.realpath(setup_folder + os.path.sep + | |
199 'repgenomes_key.txt'), 'w') | |
200 x = 0 | |
201 for repeat in repeat_elements: | |
202 # print >> fout, str(repeat) + '\t' + str(x) | |
203 fout.write(str(repeat) + '\t' + str(x) + '\n') | |
204 x += 1 | |
205 fout.close() | |
206 ############################################################################## | |
207 # generate spacer for psuedogenomes | |
208 spacer = "" | |
209 for i in range(gapl): | |
210 spacer = spacer + "N" | |
211 | 109 |
212 # save file with number of fragments processed per repname | 110 # generate spacer for pseudogenomes |
213 print("Saving number of fragments processed per repname to " | 111 spacer = ''.join(['N' for i in range(gapl)]) |
214 + nfragmentsfile1) | |
215 fout1 = open(os.path.realpath(nfragmentsfile1), "w") | |
216 for repname in rep_chr.keys(): | |
217 rep_chr_current = rep_chr[repname] | |
218 # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname | |
219 fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') | |
220 fout1.close() | |
221 | 112 |
222 # generate metagenomes and save them to FASTA files | 113 # generate metagenomes and save them to FASTA files for bowtie build |
223 k = 1 | 114 for repname in rep_chr: |
224 nrepgenomes = len(rep_chr.keys()) | 115 metagenome = '' |
225 for repname in rep_chr.keys(): | 116 for i, repeat in enumerate(rep_chr[repname]): |
226 metagenome = "" | |
227 newname = repname.replace("(", "_").replace(")", "_").replace("/", "_") | |
228 print("processing repgenome " + newname + ".fa" + " (" + str(k) | |
229 + " of " + str(nrepgenomes) + ")") | |
230 rep_chr_current = rep_chr[repname] | |
231 rep_start_current = rep_start[repname] | |
232 rep_end_current = rep_end[repname] | |
233 print("-------> " + str(len(rep_chr[repname])) + " fragments") | |
234 for i in range(len(rep_chr[repname])): | |
235 try: | 117 try: |
236 chr = rep_chr_current[i] | 118 chromosome = rep_chr[repname][i] |
237 rstart = max(rep_start_current[i] - flankingl, 0) | 119 start = max(int(rep_start[repname][i]) - flankingl, 0) |
238 rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) | 120 end = min(int(rep_end[repname][i]) + flankingl, |
239 metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] | 121 int(lgenome[chr])-1) + 1 |
122 metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}" | |
240 except KeyError: | 123 except KeyError: |
241 print("Unrecognised Chromosome: "+chr) | 124 print("Unrecognised Chromosome: " + rep_chr[repname][i]) |
242 pass | 125 |
243 # Convert metagenome to SeqRecord object (required by SeqIO.write) | 126 # Create Fasta of repeat pseudogenome |
244 record = SeqRecord(Seq(metagenome), id="repname", | 127 fastafilename = f"{os.path.join(setup_folder, repname)}.fa" |
245 name="", description="") | 128 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') |
246 print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " | |
247 + str(nrepgenomes) + ")") | |
248 fastafilename = os.path.realpath(setup_folder + os.path.sep | |
249 + newname + ".fa") | |
250 SeqIO.write(record, fastafilename, "fasta") | 129 SeqIO.write(record, fastafilename, "fasta") |
251 print("indexing repgenome " + newname + ".fa" + " (" + | |
252 str(k) + " of " + str(nrepgenomes) + ")") | |
253 command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + | |
254 setup_folder + os.path.sep + newname) | |
255 p = subprocess.Popen(command).communicate() | |
256 k += 1 | |
257 | 130 |
258 print("... Done") | 131 # Generate repeat pseudogenome bowtie index |
132 bowtie_build_cmd = ["bowtie-build", "-f", fastafilename, | |
133 os.path.join(setup_folder, repname)] | |
134 subprocess.run(bowtie_build_cmd, check=True) |