comparison RepEnrich_setup.py @ 12:89e05f831259 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author artbio
date Mon, 18 Mar 2024 09:39:44 +0000
parents 6bba3e33c2e7
children 530626b0757c
comparison
equal deleted inserted replaced
11:6bba3e33c2e7 12:89e05f831259
9 from Bio import SeqIO 9 from Bio import SeqIO
10 from Bio.Seq import Seq 10 from Bio.Seq import Seq
11 from Bio.SeqRecord import SeqRecord 11 from Bio.SeqRecord import SeqRecord
12 12
13 parser = argparse.ArgumentParser(description=''' 13 parser = argparse.ArgumentParser(description='''
14 Part I: Prepartion of repetive element psuedogenomes and repetive\ 14 Prepartion of repetive element pseudogenomes bowtie\
15 element bamfiles. This script prepares the annotation used by\ 15 indexes and annotation files used by RepEnrich.py enrichment.''',
16 downstream applications to analyze for repetitive element\
17 enrichment. For this script to run properly bowtie must be\
18 loaded. The repeat element psuedogenomes are prepared in order\
19 to analyze reads that map to multiple locations of the genome.\
20 The repeat element bamfiles are prepared in order to use a\
21 region sorter to analyze reads that map to a single location\
22 of the genome. You will 1) annotation_file:\
23 The repetitive element annotation file downloaded from\
24 RepeatMasker.org database for your organism of interest.\
25 2) genomefasta: Your genome of interest in fasta format,\
26 3)setup_folder: a folder to contain repeat element setup files\
27 command-line usage
28 EXAMPLE: python master_setup.py\
29 /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\
30 /users/nneretti/data/annotation/mm9/mm9.fa\
31 /users/nneretti/data/annotation/mm9/setup_folder''',
32 prog='getargs_genome_maker.py') 16 prog='getargs_genome_maker.py')
33 parser.add_argument('--version', action='version', version='%(prog)s 0.1') 17 parser.add_argument('--annotation_file', action='store',
34 parser.add_argument('annotation_file', action='store',
35 metavar='annotation_file', 18 metavar='annotation_file',
36 help='''List annotation file. The annotation file contains\ 19 help='''Repeat masker annotation of the genome of\
37 the repeat masker annotation for the genome of\ 20 interest. Download from RepeatMasker.org\
38 interest and may be downloaded at RepeatMasker.org\ 21 Example: mm9.fa.out''')
39 Example /data/annotation/mm9/mm9.fa.out''') 22 parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
40 parser.add_argument('genomefasta', action='store', metavar='genomefasta', 23 help='''Genome of interest in fasta format.\
41 help='''File name and path for genome of interest in fasta\ 24 Example: mm9.fa''')
42 format. Example /data/annotation/mm9/mm9.fa''') 25 parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
43 parser.add_argument('setup_folder', action='store', metavar='setup_folder', 26 help='''Folder that contains bowtie indexes of repeats and\
44 help='''List folder to contain bamfiles for repeats and\
45 repeat element psuedogenomes.\ 27 repeat element psuedogenomes.\
46 Example /data/annotation/mm9/setup''') 28 Example working/setup''')
47 parser.add_argument('--nfragmentsfile1', action='store',
48 dest='nfragmentsfile1', metavar='nfragmentsfile1',
49 default='./repnames_nfragments.txt',
50 help='''Output location of a description file that saves\
51 the number of fragments processed per repname.
52 Default ./repnames_nfragments.txt''')
53 parser.add_argument('--gaplength', action='store', dest='gaplength', 29 parser.add_argument('--gaplength', action='store', dest='gaplength',
54 metavar='gaplength', default='200', type=int, 30 metavar='gaplength', default='200', type=int,
55 help='Length of the spacer used to build\ 31 help='''Length of the N-spacer in the\
56 repeat psuedogeneomes. Default 200') 32 repeat pseudogenomes. Default 200''')
57 parser.add_argument('--flankinglength', action='store', dest='flankinglength', 33 parser.add_argument('--flankinglength', action='store', dest='flankinglength',
58 metavar='flankinglength', default='25', type=int, 34 metavar='flankinglength', default='25', type=int,
59 help='Length of the flanking region adjacent to the repeat\ 35 help='''Length of the flanking regions used to build\
60 element that is used to build repeat psuedogeneomes.\ 36 repeat pseudogenomes. Flanking length should be set\
61 The flanking length should be set according to the\ 37 according to the length of your reads.\
62 length of your reads. Default 25') 38 Default 25, for 50 nt reads''')
63 parser.add_argument('--is_bed', action='store', dest='is_bed',
64 metavar='is_bed', default='FALSE',
65 help='''Is the annotation file a bed file. This is also a\
66 compatible format. The file needs to be a tab\
67 separated bed with optional fields.
68 Ex. format:
69 chr\tstart\tend\tName_element\tclass\tfamily.
70 The class and family should identical to name_element\
71 if not applicable. Default FALSE change to TRUE''')
72 args = parser.parse_args() 39 args = parser.parse_args()
73 40
74 # parameters and paths specified in args_parse 41 # parameters from argsparse
75 gapl = args.gaplength 42 gapl = args.gaplength
76 flankingl = args.flankinglength 43 flankingl = args.flankinglength
77 annotation_file = args.annotation_file 44 annotation_file = args.annotation_file
78 genomefasta = args.genomefasta 45 genomefasta = args.genomefasta
79 setup_folder = args.setup_folder 46 setup_folder = args.setup_folder
80 nfragmentsfile1 = args.nfragmentsfile1
81 is_bed = args.is_bed
82 47
83 ##############################################################################
84 # check that the programs we need are available 48 # check that the programs we need are available
85 try: 49 try:
86 subprocess.call(shlex.split("bowtie --version"), 50 subprocess.call(shlex.split("bowtie --version"),
87 stdout=open(os.devnull, 'wb'), 51 stdout=open(os.devnull, 'wb'),
88 stderr=open(os.devnull, 'wb')) 52 stderr=open(os.devnull, 'wb'))
89 except OSError: 53 except OSError:
90 print("Error: Bowtie or BEDTools not loaded") 54 print("Error: Bowtie not available in the path")
91 raise 55 raise
92 56
93 ##############################################################################
94 # Define a text importer 57 # Define a text importer
95 csv.field_size_limit(sys.maxsize) 58 csv.field_size_limit(sys.maxsize)
96 59
97 60
98 def import_text(filename, separator): 61 def import_text(filename, separator):
103 66
104 67
105 # Make a setup folder 68 # Make a setup folder
106 if not os.path.exists(setup_folder): 69 if not os.path.exists(setup_folder):
107 os.makedirs(setup_folder) 70 os.makedirs(setup_folder)
108 ############################################################################## 71 # load genome into dictionary and compute length
109 # load genome into dictionary
110 print("loading genome...")
111 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) 72 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
73 idxgenome, lgenome, genome = {}, {}, {}
112 74
113 print("Precomputing length of all chromosomes...") 75 for k, chr in enumerate(g.keys()):
114 idxgenome = {} 76 genome[chr] = g[chr].seq
115 lgenome = {}
116 genome = {}
117 allchrs = g.keys()
118 k = 0
119 for chr in allchrs:
120 genome[chr] = str(g[chr].seq)
121 # del g[chr]
122 lgenome[chr] = len(genome[chr]) 77 lgenome[chr] = len(genome[chr])
123 idxgenome[chr] = k 78 idxgenome[chr] = k
124 k = k + 1
125 del g
126 79
127 ##############################################################################
128 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter 80 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
129 if is_bed == "FALSE": 81 repeat_elements = []
130 repeat_elements = [] 82 # these dictionaries will contain lists
131 fout = open(os.path.realpath(setup_folder + os.path.sep 83 rep_chr, rep_start, rep_end = {}, {}, {}
132 + 'repnames.bed'), 'w') 84 fin = import_text(annotation_file, ' ')
133 fin = import_text(annotation_file, ' ') 85 with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout:
134 x = 0 86 for i in range(3):
135 rep_chr = {} 87 next(fin)
136 rep_start = {}
137 rep_end = {}
138 x = 0
139 for line in fin: 88 for line in fin:
140 if x > 2: 89 repname = line[9].translate(str.maketrans('()/', '___'))
141 line9 = line[9].replace("(", "_").replace(")",
142 "_").replace("/", "_")
143 repname = line9
144 if repname not in repeat_elements:
145 repeat_elements.append(repname)
146 repchr = line[4]
147 repstart = int(line[5])
148 repend = int(line[6])
149 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend)
150 + '\t' + str(repname) + '\n')
151 if repname in rep_chr:
152 rep_chr[repname].append(repchr)
153 rep_start[repname].append(int(repstart))
154 rep_end[repname].append(int(repend))
155 else:
156 rep_chr[repname] = [repchr]
157 rep_start[repname] = [int(repstart)]
158 rep_end[repname] = [int(repend)]
159 x += 1
160 if is_bed == "TRUE":
161 repeat_elements = []
162 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'),
163 'w')
164 fin = open(os.path.realpath(annotation_file), 'r')
165 x = 0
166 rep_chr = {}
167 rep_start = {}
168 rep_end = {}
169 x = 0
170 for line in fin:
171 line = line.strip('\n')
172 line = line.split('\t')
173 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
174 repname = line3
175 if repname not in repeat_elements: 90 if repname not in repeat_elements:
176 repeat_elements.append(repname) 91 repeat_elements.append(repname)
177 repchr = line[0] 92 repchr = line[4]
178 repstart = int(line[1]) 93 repstart = line[5]
179 repend = int(line[2]) 94 repend = line[6]
180 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + 95 fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n')
181 str(repend) + '\t' + str(repname) + '\n')
182 # if rep_chr.has_key(repname):
183 if repname in rep_chr: 96 if repname in rep_chr:
184 rep_chr[repname].append(repchr) 97 rep_chr[repname].append(repchr)
185 rep_start[repname].append(int(repstart)) 98 rep_start[repname].append(repstart)
186 rep_end[repname].append(int(repend)) 99 rep_end[repname].append(repend)
187 else: 100 else:
188 rep_chr[repname] = [repchr] 101 rep_chr[repname] = [repchr]
189 rep_start[repname] = [int(repstart)] 102 rep_start[repname] = [repstart]
190 rep_end[repname] = [int(repend)] 103 rep_end[repname] = [repend]
191 104
192 fin.close() 105 # sort repeat_elements and print them in repgenomes_key.txt
193 fout.close() 106 with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout:
194 repeat_elements = sorted(repeat_elements) 107 for i, repeat in enumerate(sorted(repeat_elements)):
195 print("Writing a key for all repeats...") 108 fout.write('\t'.join([repeat, str(i)]) + '\n')
196 # print to fout the binary key that contains each repeat type with the
197 # associated binary number; sort the binary key:
198 fout = open(os.path.realpath(setup_folder + os.path.sep +
199 'repgenomes_key.txt'), 'w')
200 x = 0
201 for repeat in repeat_elements:
202 # print >> fout, str(repeat) + '\t' + str(x)
203 fout.write(str(repeat) + '\t' + str(x) + '\n')
204 x += 1
205 fout.close()
206 ##############################################################################
207 # generate spacer for psuedogenomes
208 spacer = ""
209 for i in range(gapl):
210 spacer = spacer + "N"
211 109
212 # save file with number of fragments processed per repname 110 # generate spacer for pseudogenomes
213 print("Saving number of fragments processed per repname to " 111 spacer = ''.join(['N' for i in range(gapl)])
214 + nfragmentsfile1)
215 fout1 = open(os.path.realpath(nfragmentsfile1), "w")
216 for repname in rep_chr.keys():
217 rep_chr_current = rep_chr[repname]
218 # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
219 fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
220 fout1.close()
221 112
222 # generate metagenomes and save them to FASTA files 113 # generate metagenomes and save them to FASTA files for bowtie build
223 k = 1 114 for repname in rep_chr:
224 nrepgenomes = len(rep_chr.keys()) 115 metagenome = ''
225 for repname in rep_chr.keys(): 116 for i, repeat in enumerate(rep_chr[repname]):
226 metagenome = ""
227 newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")
228 print("processing repgenome " + newname + ".fa" + " (" + str(k)
229 + " of " + str(nrepgenomes) + ")")
230 rep_chr_current = rep_chr[repname]
231 rep_start_current = rep_start[repname]
232 rep_end_current = rep_end[repname]
233 print("-------> " + str(len(rep_chr[repname])) + " fragments")
234 for i in range(len(rep_chr[repname])):
235 try: 117 try:
236 chr = rep_chr_current[i] 118 chromosome = rep_chr[repname][i]
237 rstart = max(rep_start_current[i] - flankingl, 0) 119 start = max(int(rep_start[repname][i]) - flankingl, 0)
238 rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) 120 end = min(int(rep_end[repname][i]) + flankingl,
239 metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] 121 int(lgenome[chr])-1) + 1
122 metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}"
240 except KeyError: 123 except KeyError:
241 print("Unrecognised Chromosome: "+chr) 124 print("Unrecognised Chromosome: " + rep_chr[repname][i])
242 pass 125
243 # Convert metagenome to SeqRecord object (required by SeqIO.write) 126 # Create Fasta of repeat pseudogenome
244 record = SeqRecord(Seq(metagenome), id="repname", 127 fastafilename = f"{os.path.join(setup_folder, repname)}.fa"
245 name="", description="") 128 record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
246 print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
247 + str(nrepgenomes) + ")")
248 fastafilename = os.path.realpath(setup_folder + os.path.sep
249 + newname + ".fa")
250 SeqIO.write(record, fastafilename, "fasta") 129 SeqIO.write(record, fastafilename, "fasta")
251 print("indexing repgenome " + newname + ".fa" + " (" +
252 str(k) + " of " + str(nrepgenomes) + ")")
253 command = shlex.split('bowtie-build -f ' + fastafilename + ' ' +
254 setup_folder + os.path.sep + newname)
255 p = subprocess.Popen(command).communicate()
256 k += 1
257 130
258 print("... Done") 131 # Generate repeat pseudogenome bowtie index
132 bowtie_build_cmd = ["bowtie-build", "-f", fastafilename,
133 os.path.join(setup_folder, repname)]
134 subprocess.run(bowtie_build_cmd, check=True)