comparison RepEnrich.py @ 12:89e05f831259 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author artbio
date Mon, 18 Mar 2024 09:39:44 +0000
parents 6f4143893463
children 530626b0757c
comparison
equal deleted inserted replaced
11:6bba3e33c2e7 12:89e05f831259
1 import argparse 1 import argparse
2 import csv 2 import csv
3 import os 3 import os
4 import shlex 4 import shlex
5 import shutil
6 import subprocess 5 import subprocess
7 import sys 6 import sys
8 7
9 import numpy 8 import numpy
10 9
11 10
12 parser = argparse.ArgumentParser(description=''' 11 parser = argparse.ArgumentParser(description='''
13 Part II: Conducting the alignments to the psuedogenomes. Before\ 12 Repenrich aligns reads to Repeat Elements pseudogenomes\
14 doing this step you will require 1) a bamfile of the unique\ 13 and counts aligned reads. RepEnrich_setup must be run\
15 alignments with index 2) a fastq file of the reads mapping to\ 14 before its use''')
16 more than one location. These files can be obtained using the\ 15 parser.add_argument('--annotation_file', action='store',
17 following bowtie options [EXAMPLE: bowtie -S -m 1\
18 --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\
19 unique alignment bamfile and the reads mapping to more than one\
20 location in a fastq file you can run this step. EXAMPLE: python\
21 master_output.py\
22 /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
23 /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/
24 HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
25 /users/nneretti/data/annotation/hg19/setup_folder\
26 HeLa_InputChIPseq_Rep1_multimap.fastq\
27 HeLa_InputChIPseq_Rep1.bam''')
28 parser.add_argument('--version', action='version', version='%(prog)s 0.1')
29 parser.add_argument('annotation_file', action='store',
30 metavar='annotation_file', 16 metavar='annotation_file',
31 help='List RepeatMasker.org annotation file for your\ 17 help='RepeatMasker.org annotation file for your\
32 organism. The file may be downloaded from the\ 18 organism. The file may be downloaded from\
33 RepeatMasker.org website. Example:\ 19 RepeatMasker.org. E.g. hg19_repeatmasker.txt')
34 /data/annotation/hg19/hg19_repeatmasker.txt') 20 parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
35 parser.add_argument('outputfolder', action='store', metavar='outputfolder', 21 help='Folder that will contain results. Should be the\
36 help='List folder to contain results.\ 22 same as the one used for RepEnrich_setup.\
37 Example: /outputfolder') 23 Example: ./outputfolder')
38 parser.add_argument('outputprefix', action='store', metavar='outputprefix', 24 parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
39 help='Enter prefix name for data.\ 25 help='Prefix name for Repenrich output files.')
40 Example: HeLa_InputChIPseq_Rep1') 26 parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
41 parser.add_argument('setup_folder', action='store', metavar='setup_folder', 27 help='Folder produced by RepEnrich_setup which contains\
42 help='List folder that contains the repeat element\ 28 repeat element pseudogenomes.')
43 pseudogenomes.\ 29 parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
44 Example: /data/annotation/hg19/setup_folder') 30 help='File of fastq reads mapping to multiple\
45 parser.add_argument('fastqfile', action='store', metavar='fastqfile',
46 help='Enter file for the fastq reads that map to multiple\
47 locations. Example: /data/multimap.fastq') 31 locations. Example: /data/multimap.fastq')
48 parser.add_argument('alignment_bam', action='store', metavar='alignment_bam', 32 parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
49 help='Enter bamfile output for reads that map uniquely.\ 33 help='Bam alignments of unique mapper reads.')
50 Example /bamfiles/old.bam')
51 parser.add_argument('--pairedend', action='store', dest='pairedend', 34 parser.add_argument('--pairedend', action='store', dest='pairedend',
52 default='FALSE', 35 default='FALSE',
53 help='Designate this option for paired-end sequencing.\ 36 help='Change to TRUE for paired-end fastq files.\
54 Default FALSE change to TRUE') 37 Default FALSE')
55 parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
56 metavar='collapserepeat', default='Simple_repeat',
57 help='Designate this option to generate a collapsed repeat\
58 type. Uncollapsed output is generated in addition to\
59 collapsed repeat type. Simple_repeat is default to\
60 simplify downstream analysis. You can change the\
61 default to another repeat name to collapse a\
62 seperate specific repeat instead or if the name of\
63 Simple_repeat is different for your organism.\
64 Default Simple_repeat')
65 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', 38 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
66 metavar='fastqfile2', default='none', 39 metavar='fastqfile2', default='none',
67 help='Enter fastqfile2 when using paired-end option.\ 40 help='fastqfile #2 when using paired-end option.\
68 Default none') 41 Default none')
69 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', 42 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
70 default="1", type=int, 43 default="1", type=int,
71 help='Enter available cpus per node. The more cpus the\ 44 help='Number of CPUs. The more cpus the\
72 faster RepEnrich performs. RepEnrich is designed to\ 45 faster RepEnrich performs. Default: "1"')
73 only work on one node. Default: "1"') 46
74 parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
75 metavar='allcountmethod', default="FALSE",
76 help='By default the pipeline only outputs the fraction\
77 count method. Consdidered to be the best way to\
78 count multimapped reads. Changing this option will\
79 include the unique count method, a conservative\
80 count, and the total count method, a liberal\
81 counting strategy. Our evaluation of simulated data\
82 indicated fraction counting is best.\
83 Default = FALSE, change to TRUE')
84 parser.add_argument('--is_bed', action='store', dest='is_bed',
85 metavar='is_bed', default='FALSE',
86 help='Is the annotation file a bed file.\
87 This is also a compatible format. The file needs to\
88 be a tab seperated bed with optional fields.\
89 Ex. format: chr\tstart\tend\tName_element\tclass\
90 \tfamily. The class and family should identical to\
91 name_element if not applicable.\
92 Default FALSE change to TRUE')
93 args = parser.parse_args() 47 args = parser.parse_args()
94 48
95 # parameters 49 # parameters
96 annotation_file = args.annotation_file 50 annotation_file = args.annotation_file
97 outputfolder = args.outputfolder 51 outputfolder = args.outputfolder
98 outputfile_prefix = args.outputprefix 52 outputfile_prefix = args.outputprefix
99 setup_folder = args.setup_folder 53 setup_folder = args.setup_folder
100 repeat_bed = setup_folder + os.path.sep + 'repnames.bed' 54 repeat_bed = os.path.join(setup_folder, 'repnames.bed')
101 unique_mapper_bam = args.alignment_bam 55 unique_mapper_bam = args.alignment_bam
102 fastqfile_1 = args.fastqfile 56 fastqfile_1 = args.fastqfile
103 fastqfile_2 = args.fastqfile2 57 fastqfile_2 = args.fastqfile2
104 cpus = args.cpus 58 cpus = args.cpus
105 b_opt = "-k1 -p " + str(1) + " --quiet" 59 b_opt = "-k1 -p 1 --quiet"
106 simple_repeat = args.collapserepeat 60 # Change if simple repeats are differently annotated in your organism
61 simple_repeat = "Simple_repeat"
107 paired_end = args.pairedend 62 paired_end = args.pairedend
108 allcountmethod = args.allcountmethod 63
109 is_bed = args.is_bed
110
111 ##############################################################################
112 # check that the programs we need are available 64 # check that the programs we need are available
113 try: 65 try:
114 subprocess.call(shlex.split("coverageBed -h"), 66 subprocess.call(shlex.split("coverageBed -h"),
115 stdout=open(os.devnull, 'wb'), 67 stdout=open(os.devnull, 'wb'),
116 stderr=open(os.devnull, 'wb')) 68 stderr=open(os.devnull, 'wb'))
117 subprocess.call(shlex.split("bowtie --version"), 69 subprocess.call(shlex.split("bowtie --version"),
118 stdout=open(os.devnull, 'wb'), 70 stdout=open(os.devnull, 'wb'),
119 stderr=open(os.devnull, 'wb')) 71 stderr=open(os.devnull, 'wb'))
120 except OSError: 72 except OSError:
121 print("Error: Bowtie or BEDTools not loaded") 73 print("Error: Bowtie or bedtools not loaded")
122 raise 74 raise
123 75
124 ##############################################################################
125 # define a csv reader that reads space deliminated files 76 # define a csv reader that reads space deliminated files
126 print('Preparing for analysis using RepEnrich...') 77 print('Preparing for analysis using RepEnrich...')
127 csv.field_size_limit(sys.maxsize) 78 csv.field_size_limit(sys.maxsize)
128 79
129 80
132 skipinitialspace=True): 83 skipinitialspace=True):
133 if line: 84 if line:
134 yield line 85 yield line
135 86
136 87
137 ############################################################################## 88 # build dictionaries to convert repclass and rep families
138 # build dictionaries to convert repclass and rep families' 89 repeatclass, repeatfamily = {}, {}
139 if is_bed == "FALSE": 90 repeats = import_text(annotation_file, ' ')
140 repeatclass = {} 91 # skip three first lines of the iterator
141 repeatfamily = {} 92 for line in range(3):
142 fin = import_text(annotation_file, ' ') 93 next(repeats)
143 x = 0 94 for repeat in repeats:
144 for line in fin: 95 classfamily = []
145 if x > 2: 96 classfamily = repeat[10].split('/')
146 classfamily = [] 97 matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
147 classfamily = line[10].split(os.path.sep) 98 repeatclass[matching_repeat] = classfamily[0]
148 line9 = line[9].replace("(", "_").replace( 99 if len(classfamily) == 2:
149 ")", "_").replace("/", "_") 100 repeatfamily[matching_repeat] = classfamily[1]
150 repeatclass[line9] = classfamily[0] 101 else:
151 if len(classfamily) == 2: 102 repeatfamily[matching_repeat] = classfamily[0]
152 repeatfamily[line9] = classfamily[1] 103
153 else:
154 repeatfamily[line9] = classfamily[0]
155 x += 1
156 if is_bed == "TRUE":
157 repeatclass = {}
158 repeatfamily = {}
159 fin = open(annotation_file, 'r')
160 for line in fin:
161 line = line.strip('\n')
162 line = line.split('\t')
163 theclass = line[4]
164 thefamily = line[5]
165 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
166 repeatclass[line3] = theclass
167 repeatfamily[line3] = thefamily
168 fin.close()
169
170 ##############################################################################
171 # build list of repeats initializing dictionaries for downstream analysis' 104 # build list of repeats initializing dictionaries for downstream analysis'
172 fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') 105 repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
173 repeat_key = {} 106 reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
174 rev_repeat_key = {} 107 fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
175 repeat_list = [] 108 classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
176 reptotalcounts = {} 109 repgenome_path, '\t') if line[0] in repeatclass}
177 classfractionalcounts = {} 110 classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
178 familyfractionalcounts = {} 111 repgenome_path, '\t') if line[0] in repeatclass}
179 classtotalcounts = {} 112 familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
180 familytotalcounts = {} 113 repgenome_path, '\t') if line[0] in repeatfamily}
181 reptotalcounts_simple = {} 114 familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
182 fractionalcounts = {} 115 repgenome_path, '\t') if line[0] in repeatfamily}
183 i = 0 116 reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
184 for line in fin: 117 repeatfamily[line[0]] == simple_repeat else
185 reptotalcounts[line[0]] = 0 118 line[0]): 0 for line in import_text(
186 fractionalcounts[line[0]] = 0 119 repgenome_path, '\t')}
187 if line[0] in repeatclass: 120 repeat_key = {line[0]: int(line[1]) for line in import_text(
188 classtotalcounts[repeatclass[line[0]]] = 0 121 repgenome_path, '\t')}
189 classfractionalcounts[repeatclass[line[0]]] = 0 122 rev_repeat_key = {int(line[1]): line[0] for line in import_text(
190 if line[0] in repeatfamily: 123 repgenome_path, '\t')}
191 familytotalcounts[repeatfamily[line[0]]] = 0 124 repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
192 familyfractionalcounts[repeatfamily[line[0]]] = 0 125
193 if line[0] in repeatfamily:
194 if repeatfamily[line[0]] == simple_repeat:
195 reptotalcounts_simple[simple_repeat] = 0
196 else:
197 reptotalcounts_simple[line[0]] = 0
198 repeat_list.append(line[0])
199 repeat_key[line[0]] = int(line[1])
200 rev_repeat_key[int(line[1])] = line[0]
201 fin.close()
202 ##############################################################################
203 # map the repeats to the psuedogenomes: 126 # map the repeats to the psuedogenomes:
204 if not os.path.exists(outputfolder): 127 if not os.path.exists(outputfolder):
205 os.mkdir(outputfolder) 128 os.mkdir(outputfolder)
206 ############################################################################## 129
207 # Conduct the regions sorting 130 # Conduct the regions sorting
208 print('Conducting region sorting on unique mapping reads....') 131 fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
209 fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' 132 command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
133 {os.path.join(setup_folder, 'repnames.bed')}")
210 with open(fileout, 'w') as stdout: 134 with open(fileout, 'w') as stdout:
211 command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " + 135 subprocess.run(command, stdout=stdout, check=True)
212 setup_folder + os.path.sep + 'repnames.bed') 136
213 p = subprocess.Popen(command, stdout=stdout)
214 p.communicate()
215 stdout.close()
216 filein = open(outputfolder + os.path.sep + outputfile_prefix
217 + '_regionsorter.txt', 'r')
218 counts = {} 137 counts = {}
219 sumofrepeatreads = 0 138 sumofrepeatreads = 0
220 for line in filein: 139 with open(fileout) as filein:
221 line = line.split('\t') 140 for line in filein:
222 if not str(repeat_key[line[3]]) in counts: 141 line = line.split('\t')
223 counts[str(repeat_key[line[3]])] = 0 142 if not str(repeat_key[line[3]]) in counts:
224 counts[str(repeat_key[line[3]])] += int(line[4]) 143 counts[str(repeat_key[line[3]])] = 0
225 sumofrepeatreads += int(line[4]) 144 counts[str(repeat_key[line[3]])] += int(line[4])
226 print('Identified ' + str(sumofrepeatreads) + 145 sumofrepeatreads += int(line[4])
227 'unique reads that mapped to repeats.') 146 print(f"Identified {sumofrepeatreads} unique reads that \
228 ############################################################################## 147 mapped to repeats.")
229 if paired_end == 'TRUE': 148
230 if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): 149
231 os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') 150 def run_bowtie(metagenome, fastqfile, folder):
232 if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): 151 metagenomepath = os.path.join(setup_folder, metagenome)
233 os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') 152 output_file = os.path.join(folder, f"{metagenome}.bowtie")
234 folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' 153 command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
235 folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' 154 with open(output_file, 'w') as stdout:
236 ############################################################################## 155 return subprocess.Popen(command, stdout=stdout)
237 print("Processing repeat psuedogenomes...") 156
238 ps = [] 157
239 psb = [] 158 if paired_end == 'FALSE':
159 folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
160 os.makedirs(folder_pair1, exist_ok=True)
161
162 print("Processing repeat pseudogenomes...")
163 processes = []
240 ticker = 0 164 ticker = 0
165
241 for metagenome in repeat_list: 166 for metagenome in repeat_list:
242 metagenomepath = setup_folder + os.path.sep + metagenome 167 processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
243 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
244 file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
245 with open(file1, 'w') as stdout:
246 command = shlex.split("bowtie " + b_opt + " " +
247 metagenomepath + " " + fastqfile_1)
248 p = subprocess.Popen(command, stdout=stdout)
249 with open(file2, 'w') as stdout:
250 command = shlex.split("bowtie " + b_opt + " " +
251 metagenomepath + " " + fastqfile_2)
252 pp = subprocess.Popen(command, stdout=stdout)
253 ps.append(p)
254 ticker += 1
255 psb.append(pp)
256 ticker += 1 168 ticker += 1
257 if ticker == cpus: 169 if ticker == cpus:
170 for p in processes:
171 p.communicate()
172 ticker = 0
173 processes = []
174
175 for p in processes:
176 p.communicate()
177 # Combine the output from both read pairs:
178 print('Sorting and combining the output for both read pairs....')
179 sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
180 os.makedirs(sorted_bowtie, exist_ok=True)
181 for metagenome in repeat_list:
182 file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
183 fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
184 with open(fileout, 'w') as stdout:
185 p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
186 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
187 stdout=subprocess.PIPE)
188 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
189 stdout=subprocess.PIPE)
190 p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
191 stdout=subprocess.PIPE)
192 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
193 p5.communicate()
194 stdout.close()
195 print('completed ...')
196 else:
197 folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
198 folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
199 os.makedirs(folder_pair1, exist_ok=True)
200 os.makedirs(folder_pair2, exist_ok=True)
201
202 print("Processing repeat pseudogenomes...")
203 ps, psb, ticker = [], [], 0
204
205 for metagenome in repeat_list:
206 ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
207 ticker += 1
208 if fastqfile_2 != 'none':
209 psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
210 ticker += 1
211 if ticker >= cpus:
258 for p in ps: 212 for p in ps:
259 p.communicate() 213 p.communicate()
260 for p in psb: 214 for p in psb:
261 p.communicate() 215 p.communicate()
262 ticker = 0 216 ticker = 0
217 ps = []
263 psb = [] 218 psb = []
264 ps = [] 219
265 if len(ps) > 0: 220 for p in ps:
266 for p in ps: 221 p.communicate()
267 p.communicate() 222 for p in psb:
268 stdout.close() 223 p.communicate()
269 224 # combine the output from both read pairs:
270 ############################################################################## 225 print('Sorting and combining the output for both read pairs...')
271 # combine the output from both read pairs:
272 print('sorting and combining the output for both read pairs...')
273 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): 226 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
274 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') 227 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
275 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' 228 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
276 for metagenome in repeat_list: 229 for metagenome in repeat_list:
277 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' 230 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
288 stdout=subprocess.PIPE) 241 stdout=subprocess.PIPE)
289 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) 242 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
290 p5.communicate() 243 p5.communicate()
291 stdout.close() 244 stdout.close()
292 print('completed ...') 245 print('completed ...')
293 ############################################################################## 246
294 if paired_end == 'FALSE':
295 if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
296 os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
297 folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
298 ##############################################################################
299 ps = []
300 ticker = 0
301 print("Processing repeat psuedogenomes...")
302 for metagenome in repeat_list:
303 metagenomepath = setup_folder + os.path.sep + metagenome
304 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
305 with open(file1, 'w') as stdout:
306 command = shlex.split("bowtie " + b_opt + " " +
307 metagenomepath + " " + fastqfile_1)
308 p = subprocess.Popen(command, stdout=stdout)
309 ps.append(p)
310 ticker += 1
311 if ticker == cpus:
312 for p in ps:
313 p.communicate()
314 ticker = 0
315 ps = []
316 if len(ps) > 0:
317 for p in ps:
318 p.communicate()
319 stdout.close()
320
321 ##############################################################################
322 # combine the output from both read pairs:
323 print('Sorting and combining the output for both read pairs....')
324 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
325 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
326 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
327 for metagenome in repeat_list:
328 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
329 fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
330 with open(fileout, 'w') as stdout:
331 p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
332 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
333 stdout=subprocess.PIPE)
334 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
335 stdout=subprocess.PIPE)
336 p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
337 stdout=subprocess.PIPE)
338 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
339 p5.communicate()
340 stdout.close()
341 print('completed ...')
342
343 ##############################################################################
344 # build a file of repeat keys for all reads 247 # build a file of repeat keys for all reads
345 print('Writing and processing intermediate files...') 248 print('Writing and processing intermediate files...')
346 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' 249 sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
250 sumofrepeatreads = 0
347 readid = {} 251 readid = {}
348 sumofrepeatreads = 0 252
349 for rep in repeat_list: 253 for rep in repeat_list:
350 for data in import_text(sorted_bowtie + os.path.sep + 254 for data in import_text(
351 rep + '.bowtie', '\t'): 255 f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
352 readid[data[0]] = '' 256 readid[data[0]] = ''
257
353 for rep in repeat_list: 258 for rep in repeat_list:
354 for data in import_text(sorted_bowtie + os.path.sep 259 for data in import_text(
355 + rep + '.bowtie', '\t'): 260 f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
356 readid[data[0]] += str(repeat_key[rep]) + str(',') 261 readid[data[0]] += f"{repeat_key[rep]},"
262
357 for subfamilies in readid.values(): 263 for subfamilies in readid.values():
358 if subfamilies not in counts: 264 if subfamilies not in counts:
359 counts[subfamilies] = 0 265 counts[subfamilies] = 0
360 counts[subfamilies] += 1 266 counts[subfamilies] += 1
361 sumofrepeatreads += 1 267 sumofrepeatreads += 1
362 del readid 268
363 print('Identified ' + str(sumofrepeatreads) + 269 print(f'Identified {sumofrepeatreads} reads that mapped to \
364 ' reads that mapped to repeats for unique and multimappers.') 270 repeats for unique and multimappers.')
365
366 ##############################################################################
367 print("Conducting final calculations...") 271 print("Conducting final calculations...")
368
369
370 def convert(x):
371 '''
372 build a converter to numeric label for repeat and yield a combined list
373 of repnames seperated by backslash
374 '''
375 x = x.strip(',')
376 x = x.split(',')
377 global repname
378 repname = ""
379 for i in x:
380 repname = repname + os.path.sep + rev_repeat_key[int(i)]
381
382 272
383 # building the total counts for repeat element enrichment... 273 # building the total counts for repeat element enrichment...
384 for x in counts.keys(): 274 for x in counts.keys():
385 count = counts[x] 275 count = counts[x]
386 x = x.strip(',') 276 x = x.strip(',').split(',')
387 x = x.split(',')
388 for i in x: 277 for i in x:
389 reptotalcounts[rev_repeat_key[int(i)]] += int(count) 278 reptotalcounts[rev_repeat_key[int(i)]] += int(count)
390 # building the fractional counts for repeat element enrichment... 279 # building the fractional counts for repeat element enrichment...
391 for x in counts.keys(): 280 for x in counts.keys():
392 count = counts[x] 281 count = counts[x]
393 x = x.strip(',') 282 x = x.strip(',') .split(',')
394 x = x.split(',')
395 splits = len(x) 283 splits = len(x)
396 for i in x: 284 for i in x:
397 fractionalcounts[rev_repeat_key[int(i)]] += float( 285 fractionalcounts[rev_repeat_key[int(i)]] += float(
398 numpy.divide(float(count), float(splits))) 286 numpy.divide(float(count), float(splits)))
399 # building categorized table of repeat element enrichment... 287 # building categorized table of repeat element enrichment...
400 repcounts = {} 288 repcounts = {}
401 repcounts['other'] = 0 289 repcounts['other'] = 0
402 for key in counts.keys(): 290 for key in counts.keys():
403 convert(key) 291 key_list = key.strip(',').split(',')
292 repname = ''
293 for i in key_list:
294 repname = os.path.join(repname, rev_repeat_key[int(i)])
404 repcounts[repname] = counts[key] 295 repcounts[repname] = counts[key]
405 # building the total counts for class enrichment... 296 # building the total counts for class enrichment...
406 for key in reptotalcounts.keys(): 297 for key in reptotalcounts.keys():
407 classtotalcounts[repeatclass[key]] += reptotalcounts[key] 298 classtotalcounts[repeatclass[key]] += reptotalcounts[key]
408 # building total counts for family enrichment... 299 # building total counts for family enrichment...
409 for key in reptotalcounts.keys(): 300 for key in reptotalcounts.keys():
410 familytotalcounts[repeatfamily[key]] += reptotalcounts[key] 301 familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
411 # building unique counts table' 302 # building unique counts table
412 repcounts2 = {} 303 repcounts2 = {}
413 for rep in repeat_list: 304 for rep in repeat_list:
414 if "/" + rep in repcounts: 305 if "/" + rep in repcounts:
415 repcounts2[rep] = repcounts["/" + rep] 306 repcounts2[rep] = repcounts["/" + rep]
416 else: 307 else:
420 classfractionalcounts[repeatclass[key]] += fractionalcounts[key] 311 classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
421 # building fractional counts for family enrichment... 312 # building fractional counts for family enrichment...
422 for key in fractionalcounts.keys(): 313 for key in fractionalcounts.keys():
423 familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] 314 familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
424 315
425 ##############################################################################
426 print('Writing final output and removing intermediate files...')
427 # print output to file of the categorized counts and total overlapping counts: 316 # print output to file of the categorized counts and total overlapping counts:
428 if allcountmethod == "TRUE": 317 print('Writing final output...')
429 fout1 = open(outputfolder + os.path.sep + outputfile_prefix 318 with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
430 + '_total_counts.txt', 'w') 319 f"class_fraction_counts.txt", 'w') as fout:
431 for key in sorted(reptotalcounts.keys()):
432 fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
433 repeatfamily[key] + '\t' + str(reptotalcounts[key])
434 + '\n')
435 fout2 = open(outputfolder + os.path.sep + outputfile_prefix
436 + '_class_total_counts.txt', 'w')
437 for key in sorted(classtotalcounts.keys()):
438 fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
439 fout3 = open(outputfolder + os.path.sep + outputfile_prefix
440 + '_family_total_counts.txt', 'w')
441 for key in sorted(familytotalcounts.keys()):
442 fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
443 fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
444 '_unique_counts.txt', 'w')
445 for key in sorted(repcounts2.keys()):
446 fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
447 repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
448 fout5 = open(outputfolder + os.path.sep + outputfile_prefix
449 + '_class_fraction_counts.txt', 'w')
450 for key in sorted(classfractionalcounts.keys()): 320 for key in sorted(classfractionalcounts.keys()):
451 fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') 321 fout.write(f"{key}\t{classfractionalcounts[key]}\n")
452 fout6 = open(outputfolder + os.path.sep + outputfile_prefix + 322
453 '_family_fraction_counts.txt', 'w') 323 with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
324 f"family_fraction_counts.txt", 'w') as fout:
454 for key in sorted(familyfractionalcounts.keys()): 325 for key in sorted(familyfractionalcounts.keys()):
455 fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') 326 fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
456 fout7 = open(outputfolder + os.path.sep + outputfile_prefix 327
457 + '_fraction_counts.txt', 'w') 328 with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
329 f"fraction_counts.txt", 'w') as fout:
458 for key in sorted(fractionalcounts.keys()): 330 for key in sorted(fractionalcounts.keys()):
459 fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + 331 fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
460 repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) 332 f"{int(fractionalcounts[key])}\n")
461 + '\n')
462 fout1.close()
463 fout2.close()
464 fout3.close()
465 fout4.close()
466 fout5.close()
467 fout6.close()
468 fout7.close()
469 else:
470 fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
471 '_class_fraction_counts.txt', 'w')
472 for key in sorted(classfractionalcounts.keys()):
473 fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
474 fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
475 '_family_fraction_counts.txt', 'w')
476 for key in sorted(familyfractionalcounts.keys()):
477 fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
478 fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
479 '_fraction_counts.txt', 'w')
480 for key in sorted(fractionalcounts.keys()):
481 fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
482 repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
483 + '\n')
484 fout1.close()
485 fout2.close()
486 fout3.close()
487 ##############################################################################
488 # Remove Large intermediate files
489 if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
490 '_regionsorter.txt'):
491 os.remove(outputfolder + os.path.sep + outputfile_prefix +
492 '_regionsorter.txt')
493 if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
494 shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
495 if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
496 shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
497 if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
498 shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
499 print("... Done")