comparison RepEnrich.py @ 12:89e05f831259 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author | artbio |
---|---|
date | Mon, 18 Mar 2024 09:39:44 +0000 |
parents | 6f4143893463 |
children | 530626b0757c |
11:6bba3e33c2e7 | 12:89e05f831259 |
---|---|
1 import argparse | 1 import argparse |
2 import csv | 2 import csv |
3 import os | 3 import os |
4 import shlex | 4 import shlex |
5 import shutil | |
6 import subprocess | 5 import subprocess |
7 import sys | 6 import sys |
8 | 7 |
9 import numpy | 8 import numpy |
10 | 9 |
11 | 10 |
12 parser = argparse.ArgumentParser(description=''' | 11 parser = argparse.ArgumentParser(description=''' |
13 Part II: Conducting the alignments to the pseudogenomes. Before\ | 12 RepEnrich aligns reads to repeat element pseudogenomes\ |
13 Part II: Conducting the alignments to the psuedogenomes. Before\ | 12 Repenrich aligns reads to Repeat Elements pseudogenomes\ |
14 doing this step you will require 1) a bamfile of the unique\ | 13 and counts aligned reads. RepEnrich_setup must be run\ |
15 alignments with index 2) a fastq file of the reads mapping to\ | 14 before its use''') |
16 more than one location. These files can be obtained using the\ | 15 parser.add_argument('--annotation_file', action='store', |
17 following bowtie options [EXAMPLE: bowtie -S -m 1\ | |
18 --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\ | |
19 unique alignment bamfile and the reads mapping to more than one\ | |
20 location in a fastq file you can run this step. EXAMPLE: python\ | |
21 master_output.py\ | |
22 /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\ | |
23 /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/ | |
24 HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\ | |
25 /users/nneretti/data/annotation/hg19/setup_folder\ | |
26 HeLa_InputChIPseq_Rep1_multimap.fastq\ | |
27 HeLa_InputChIPseq_Rep1.bam''') | |
28 parser.add_argument('--version', action='version', version='%(prog)s 0.1') | |
29 parser.add_argument('annotation_file', action='store', | |
30 metavar='annotation_file', | 16 metavar='annotation_file', |
31 help='List RepeatMasker.org annotation file for your\ | 17 help='RepeatMasker.org annotation file for your\ |
32 organism. The file may be downloaded from the\ | 18 organism. The file may be downloaded from\ |
33 RepeatMasker.org website. Example:\ | 19 RepeatMasker.org. E.g. hg19_repeatmasker.txt') |
34 /data/annotation/hg19/hg19_repeatmasker.txt') | 20 parser.add_argument('--outputfolder', action='store', metavar='outputfolder', |
35 parser.add_argument('outputfolder', action='store', metavar='outputfolder', | 21 help='Folder that will contain results. Should be the\ |
36 help='List folder to contain results.\ | 22 same as the one used for RepEnrich_setup.\ |
37 Example: /outputfolder') | 23 Example: ./outputfolder') |
38 parser.add_argument('outputprefix', action='store', metavar='outputprefix', | 24 parser.add_argument('--outputprefix', action='store', metavar='outputprefix', |
39 help='Enter prefix name for data.\ | 25 help='Prefix name for RepEnrich output files.') |
40 Example: HeLa_InputChIPseq_Rep1') | 26 parser.add_argument('--setup_folder', action='store', metavar='setup_folder', |
41 parser.add_argument('setup_folder', action='store', metavar='setup_folder', | 27 help='Folder produced by RepEnrich_setup which contains\ |
42 help='List folder that contains the repeat element\ | 28 repeat element pseudogenomes.') |
43 pseudogenomes.\ | 29 parser.add_argument('--fastqfile', action='store', metavar='fastqfile', |
44 Example: /data/annotation/hg19/setup_folder') | 30 help='File of fastq reads mapping to multiple\ |
45 parser.add_argument('fastqfile', action='store', metavar='fastqfile', | |
46 help='Enter file for the fastq reads that map to multiple\ | |
47 locations. Example: /data/multimap.fastq') | 31 locations. Example: /data/multimap.fastq') |
48 parser.add_argument('alignment_bam', action='store', metavar='alignment_bam', | 32 parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam', |
49 help='Enter bamfile output for reads that map uniquely.\ | 33 help='Bam alignments of unique mapper reads.') |
50 Example /bamfiles/old.bam') | |
51 parser.add_argument('--pairedend', action='store', dest='pairedend', | 34 parser.add_argument('--pairedend', action='store', dest='pairedend', |
52 default='FALSE', | 35 default='FALSE', |
53 help='Designate this option for paired-end sequencing.\ | 36 help='Change to TRUE for paired-end fastq files.\ |
54 Default FALSE change to TRUE') | 37 Default FALSE') |
55 parser.add_argument('--collapserepeat', action='store', dest='collapserepeat', | |
56 metavar='collapserepeat', default='Simple_repeat', | |
57 help='Designate this option to generate a collapsed repeat\ | |
58 type. Uncollapsed output is generated in addition to\ | |
59 collapsed repeat type. Simple_repeat is default to\ | |
60 simplify downstream analysis. You can change the\ | |
61 default to another repeat name to collapse a\ | |
62 separate specific repeat instead or if the name of\ |
63 Simple_repeat is different for your organism.\ | |
64 Default Simple_repeat') | |
65 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', | 38 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', |
66 metavar='fastqfile2', default='none', | 39 metavar='fastqfile2', default='none', |
67 help='Enter fastqfile2 when using paired-end option.\ | 40 help='fastqfile #2 when using paired-end option.\ |
68 Default none') | 41 Default none') |
69 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', | 42 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', |
70 default="1", type=int, | 43 default="1", type=int, |
71 help='Enter available cpus per node. The more cpus the\ | 44 help='Number of CPUs. The more cpus the\ |
72 faster RepEnrich performs. RepEnrich is designed to\ | 45 faster RepEnrich performs. Default: "1"') |
73 only work on one node. Default: "1"') | 46 |
74 parser.add_argument('--allcountmethod', action='store', dest='allcountmethod', | |
75 metavar='allcountmethod', default="FALSE", | |
76 help='By default the pipeline only outputs the fraction\ | |
77 count method. Considered to be the best way to\ |
78 count multimapped reads. Changing this option will\ | |
79 include the unique count method, a conservative\ | |
80 count, and the total count method, a liberal\ | |
81 counting strategy. Our evaluation of simulated data\ | |
82 indicated fraction counting is best.\ | |
83 Default = FALSE, change to TRUE') | |
84 parser.add_argument('--is_bed', action='store', dest='is_bed', | |
85 metavar='is_bed', default='FALSE', | |
86 help='Is the annotation file a bed file.\ | |
87 This is also a compatible format. The file needs to\ | |
88 be a tab-separated bed with optional fields.\ |
89 Ex. format: chr\tstart\tend\tName_element\tclass\ | |
90 \tfamily. The class and family should be identical to\ |
91 name_element if not applicable.\ | |
92 Default FALSE change to TRUE') | |
93 args = parser.parse_args() | 47 args = parser.parse_args() |
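The rewritten parser takes keyword options only. A minimal sketch of supplying them in Python (the file names are illustrative placeholders, not files from the repository):

    # a minimal sketch of the new keyword-style interface; the file
    # names below are illustrative, not repository files
    example_args = parser.parse_args([
        '--annotation_file', 'hg19_repeatmasker.txt',
        '--outputfolder', './outputfolder',
        '--outputprefix', 'sample1',
        '--setup_folder', './setup_folder',
        '--fastqfile', 'sample1_multimap.fastq',
        '--alignment_bam', 'sample1_unique.bam',
        '--cpus', '4'])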
94 | 48 |
95 # parameters | 49 # parameters |
96 annotation_file = args.annotation_file | 50 annotation_file = args.annotation_file |
97 outputfolder = args.outputfolder | 51 outputfolder = args.outputfolder |
98 outputfile_prefix = args.outputprefix | 52 outputfile_prefix = args.outputprefix |
99 setup_folder = args.setup_folder | 53 setup_folder = args.setup_folder |
100 repeat_bed = setup_folder + os.path.sep + 'repnames.bed' | 54 repeat_bed = os.path.join(setup_folder, 'repnames.bed') |
101 unique_mapper_bam = args.alignment_bam | 55 unique_mapper_bam = args.alignment_bam |
102 fastqfile_1 = args.fastqfile | 56 fastqfile_1 = args.fastqfile |
103 fastqfile_2 = args.fastqfile2 | 57 fastqfile_2 = args.fastqfile2 |
104 cpus = args.cpus | 58 cpus = args.cpus |
105 b_opt = "-k1 -p " + str(1) + " --quiet" | 59 b_opt = "-k1 -p 1 --quiet" |
106 simple_repeat = args.collapserepeat | 60 # Change if simple repeats are differently annotated in your organism |
61 simple_repeat = "Simple_repeat" | |
107 paired_end = args.pairedend | 62 paired_end = args.pairedend |
108 allcountmethod = args.allcountmethod | 63 |
109 is_bed = args.is_bed | |
110 | |
111 ############################################################################## | |
112 # check that the programs we need are available | 64 # check that the programs we need are available |
113 try: | 65 try: |
114 subprocess.call(shlex.split("coverageBed -h"), | 66 subprocess.call(shlex.split("coverageBed -h"), |
115 stdout=open(os.devnull, 'wb'), | 67 stdout=open(os.devnull, 'wb'), |
116 stderr=open(os.devnull, 'wb')) | 68 stderr=open(os.devnull, 'wb')) |
117 subprocess.call(shlex.split("bowtie --version"), | 69 subprocess.call(shlex.split("bowtie --version"), |
118 stdout=open(os.devnull, 'wb'), | 70 stdout=open(os.devnull, 'wb'), |
119 stderr=open(os.devnull, 'wb')) | 71 stderr=open(os.devnull, 'wb')) |
120 except OSError: | 72 except OSError: |
121 print("Error: Bowtie or BEDTools not loaded") | 73 print("Error: Bowtie or bedtools not loaded") |
122 raise | 74 raise |
123 | 75 |
124 ############################################################################## | |
125 # define a csv reader that reads space-delimited files | 76 # define a csv reader that reads space-delimited files |
126 print('Preparing for analysis using RepEnrich...') | 77 print('Preparing for analysis using RepEnrich...') |
127 csv.field_size_limit(sys.maxsize) | 78 csv.field_size_limit(sys.maxsize) |
128 | 79 |
129 | 80 |
130 def import_text(filename, separator): | 81 def import_text(filename, separator): |
131 for line in csv.reader(open(filename), delimiter=separator, | 82 for line in csv.reader(open(filename), delimiter=separator, |
132 skipinitialspace=True): | 83 skipinitialspace=True): |
133 if line: | 84 if line: |
134 yield line | 85 yield line |
135 | 86 |
136 | 87 |
137 ############################################################################## | 88 # build dictionaries to convert repclass and rep families |
138 # build dictionaries to convert repclass and rep families | 89 repeatclass, repeatfamily = {}, {} |
139 if is_bed == "FALSE": | 90 repeats = import_text(annotation_file, ' ') |
140 repeatclass = {} | 91 # skip three first lines of the iterator |
141 repeatfamily = {} | 92 for line in range(3): |
142 fin = import_text(annotation_file, ' ') | 93 next(repeats) |
143 x = 0 | 94 for repeat in repeats: |
144 for line in fin: | 95 classfamily = [] |
145 if x > 2: | 96 classfamily = repeat[10].split('/') |
146 classfamily = [] | 97 matching_repeat = repeat[9].translate(str.maketrans('()/', '___')) |
147 classfamily = line[10].split(os.path.sep) | 98 repeatclass[matching_repeat] = classfamily[0] |
148 line9 = line[9].replace("(", "_").replace( | 99 if len(classfamily) == 2: |
149 ")", "_").replace("/", "_") | 100 repeatfamily[matching_repeat] = classfamily[1] |
150 repeatclass[line9] = classfamily[0] | 101 else: |
151 if len(classfamily) == 2: | 102 repeatfamily[matching_repeat] = classfamily[0] |
152 repeatfamily[line9] = classfamily[1] | 103 |
153 else: | |
154 repeatfamily[line9] = classfamily[0] | |
155 x += 1 | |
156 if is_bed == "TRUE": | |
157 repeatclass = {} | |
158 repeatfamily = {} | |
159 fin = open(annotation_file, 'r') | |
160 for line in fin: | |
161 line = line.strip('\n') | |
162 line = line.split('\t') | |
163 theclass = line[4] | |
164 thefamily = line[5] | |
165 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") | |
166 repeatclass[line3] = theclass | |
167 repeatfamily[line3] = thefamily | |
168 fin.close() | |
169 | |
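How one RepeatMasker annotation row feeds the two dictionaries: field 11 holds 'class/family' and field 10 the repeat name, which is sanitized before use as a key. A small sketch with made-up field values:

    # sketch of one annotation row with made-up values (not real data)
    repeat = [''] * 9 + ['(TAACCC)n', 'Simple_repeat']
    name = repeat[9].translate(str.maketrans('()/', '___'))  # '_TAACCC_n'
    classfamily = repeat[10].split('/')          # ['Simple_repeat']
    # repeatclass['_TAACCC_n'] = 'Simple_repeat'; with no '/family'
    # suffix the family falls back to the class, whereas e.g. 'LINE/L1'
    # would give class 'LINE' and family 'L1'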
170 ############################################################################## | |
171 # build list of repeats, initializing dictionaries for downstream analysis | 104 # build list of repeats, initializing dictionaries for downstream analysis |
172 fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') | 105 repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt') |
173 repeat_key = {} | 106 reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} |
174 rev_repeat_key = {} | 107 fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} |
175 repeat_list = [] | 108 classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text( |
176 reptotalcounts = {} | 109 repgenome_path, '\t') if line[0] in repeatclass} |
177 classfractionalcounts = {} | 110 classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text( |
178 familyfractionalcounts = {} | 111 repgenome_path, '\t') if line[0] in repeatclass} |
179 classtotalcounts = {} | 112 familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text( |
180 familytotalcounts = {} | 113 repgenome_path, '\t') if line[0] in repeatfamily} |
181 reptotalcounts_simple = {} | 114 familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text( |
182 fractionalcounts = {} | 115 repgenome_path, '\t') if line[0] in repeatfamily} |
183 i = 0 | 116 reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and |
184 for line in fin: | 117 repeatfamily[line[0]] == simple_repeat else |
185 reptotalcounts[line[0]] = 0 | 118 line[0]): 0 for line in import_text( |
186 fractionalcounts[line[0]] = 0 | 119 repgenome_path, '\t')} |
187 if line[0] in repeatclass: | 120 repeat_key = {line[0]: int(line[1]) for line in import_text( |
188 classtotalcounts[repeatclass[line[0]]] = 0 | 121 repgenome_path, '\t')} |
189 classfractionalcounts[repeatclass[line[0]]] = 0 | 122 rev_repeat_key = {int(line[1]): line[0] for line in import_text( |
190 if line[0] in repeatfamily: | 123 repgenome_path, '\t')} |
191 familytotalcounts[repeatfamily[line[0]]] = 0 | 124 repeat_list = [line[0] for line in import_text(repgenome_path, '\t')] |
192 familyfractionalcounts[repeatfamily[line[0]]] = 0 | 125 |
193 if line[0] in repeatfamily: | |
194 if repeatfamily[line[0]] == simple_repeat: | |
195 reptotalcounts_simple[simple_repeat] = 0 | |
196 else: | |
197 reptotalcounts_simple[line[0]] = 0 | |
198 repeat_list.append(line[0]) | |
199 repeat_key[line[0]] = int(line[1]) | |
200 rev_repeat_key[int(line[1])] = line[0] | |
201 fin.close() | |
202 ############################################################################## | |
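The repgenomes_key.txt file written by RepEnrich_setup is a two-column, tab-separated table of pseudogenome name and integer index; every lookup table above is derived from it. An illustrative sketch:

    # repgenomes_key.txt (tab-separated, illustrative content):
    #   _TAACCC_n    0
    #   AluY         1
    # giving repeat_key = {'_TAACCC_n': 0, 'AluY': 1}
    # and rev_repeat_key = {0: '_TAACCC_n', 1: 'AluY'}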
203 # map the repeats to the pseudogenomes: | 126 # map the repeats to the pseudogenomes: |
204 if not os.path.exists(outputfolder): | 127 if not os.path.exists(outputfolder): |
205 os.mkdir(outputfolder) | 128 os.mkdir(outputfolder) |
206 ############################################################################## | 129 |
207 # Conduct the regions sorting | 130 # Conduct the regions sorting |
208 print('Conducting region sorting on unique mapping reads....') | 131 fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt") |
209 fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' | 132 command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \ |
133 {os.path.join(setup_folder, 'repnames.bed')}") | |
210 with open(fileout, 'w') as stdout: | 134 with open(fileout, 'w') as stdout: |
211 command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " + | 135 subprocess.run(command, stdout=stdout, check=True) |
212 setup_folder + os.path.sep + 'repnames.bed') | 136 |
213 p = subprocess.Popen(command, stdout=stdout) | |
214 p.communicate() | |
215 stdout.close() | |
216 filein = open(outputfolder + os.path.sep + outputfile_prefix | |
217 + '_regionsorter.txt', 'r') | |
218 counts = {} | 137 counts = {} |
219 sumofrepeatreads = 0 | 138 sumofrepeatreads = 0 |
220 for line in filein: | 139 with open(fileout) as filein: |
221 line = line.split('\t') | 140 for line in filein: |
222 if not str(repeat_key[line[3]]) in counts: | 141 line = line.split('\t') |
223 counts[str(repeat_key[line[3]])] = 0 | 142 if not str(repeat_key[line[3]]) in counts: |
224 counts[str(repeat_key[line[3]])] += int(line[4]) | 143 counts[str(repeat_key[line[3]])] = 0 |
225 sumofrepeatreads += int(line[4]) | 144 counts[str(repeat_key[line[3]])] += int(line[4]) |
226 print('Identified ' + str(sumofrepeatreads) + | 145 sumofrepeatreads += int(line[4]) |
227 'unique reads that mapped to repeats.') | 146 print(f"Identified {sumofrepeatreads} unique reads that \ |
228 ############################################################################## | 147 mapped to repeats.") |
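coverageBed appends coverage columns to each repnames.bed interval, so in the _regionsorter.txt file column 4 (line[3]) is the repeat name and column 5 (line[4]) is the number of overlapping unique-mapper reads. An illustrative row (values made up):

    # chr1    3000001    3000156    _TAACCC_n    12    ...
    # -> counts[str(repeat_key['_TAACCC_n'])] += 12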
229 if paired_end == 'TRUE': | 148 |
230 if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): | 149 |
231 os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') | 150 def run_bowtie(metagenome, fastqfile, folder): |
232 if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): | 151 metagenomepath = os.path.join(setup_folder, metagenome) |
233 os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') | 152 output_file = os.path.join(folder, f"{metagenome}.bowtie") |
234 folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' | 153 command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}") |
235 folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' | 154 with open(output_file, 'w') as stdout: |
236 ############################################################################## | 155 return subprocess.Popen(command, stdout=stdout) |
237 print("Processing repeat psuedogenomes...") | 156 |
238 ps = [] | 157 |
239 psb = [] | 158 if paired_end == 'FALSE': |
159 folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') | |
160 os.makedirs(folder_pair1, exist_ok=True) | |
161 | |
162 print("Processing repeat pseudogenomes...") | |
163 processes = [] | |
240 ticker = 0 | 164 ticker = 0 |
165 | |
241 for metagenome in repeat_list: | 166 for metagenome in repeat_list: |
242 metagenomepath = setup_folder + os.path.sep + metagenome | 167 processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) |
243 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' | |
244 file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' | |
245 with open(file1, 'w') as stdout: | |
246 command = shlex.split("bowtie " + b_opt + " " + | |
247 metagenomepath + " " + fastqfile_1) | |
248 p = subprocess.Popen(command, stdout=stdout) | |
249 with open(file2, 'w') as stdout: | |
250 command = shlex.split("bowtie " + b_opt + " " + | |
251 metagenomepath + " " + fastqfile_2) | |
252 pp = subprocess.Popen(command, stdout=stdout) | |
253 ps.append(p) | |
254 ticker += 1 | |
255 psb.append(pp) | |
256 ticker += 1 | 168 ticker += 1 |
257 if ticker == cpus: | 169 if ticker == cpus: |
170 for p in processes: | |
171 p.communicate() | |
172 ticker = 0 | |
173 processes = [] | |
174 | |
175 for p in processes: | |
176 p.communicate() | |
177 # Combine the output from both read pairs: | |
178 print('Sorting and combining the output for both read pairs....') | |
179 sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie') | |
180 os.makedirs(sorted_bowtie, exist_ok=True) | |
181 for metagenome in repeat_list: | |
182 file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie") | |
183 fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie") | |
184 with open(fileout, 'w') as stdout: | |
185 p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) | |
186 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, | |
187 stdout=subprocess.PIPE) | |
188 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, | |
189 stdout=subprocess.PIPE) | |
190 p4 = subprocess.Popen(['sort'], stdin=p3.stdout, | |
191 stdout=subprocess.PIPE) | |
192 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) | |
193 p5.communicate() | |
194 stdout.close() | |
195 print('completed ...') | |
196 else: | |
197 folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') | |
198 folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie') | |
199 os.makedirs(folder_pair1, exist_ok=True) | |
200 os.makedirs(folder_pair2, exist_ok=True) | |
201 | |
202 print("Processing repeat pseudogenomes...") | |
203 ps, psb, ticker = [], [], 0 | |
204 | |
205 for metagenome in repeat_list: | |
206 ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) | |
207 ticker += 1 | |
208 if fastqfile_2 != 'none': | |
209 psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2)) | |
210 ticker += 1 | |
211 if ticker >= cpus: | |
258 for p in ps: | 212 for p in ps: |
259 p.communicate() | 213 p.communicate() |
260 for p in psb: | 214 for p in psb: |
261 p.communicate() | 215 p.communicate() |
262 ticker = 0 | 216 ticker = 0 |
217 ps = [] | |
263 psb = [] | 218 psb = [] |
264 ps = [] | 219 |
265 if len(ps) > 0: | 220 for p in ps: |
266 for p in ps: | 221 p.communicate() |
267 p.communicate() | 222 for p in psb: |
268 stdout.close() | 223 p.communicate() |
269 | 224 # combine the output from both read pairs: |
270 ############################################################################## | 225 print('Sorting and combining the output for both read pairs...') |
271 # combine the output from both read pairs: | |
272 print('sorting and combining the output for both read pairs...') | |
273 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): | 226 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): |
274 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') | 227 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') |
275 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' | 228 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' |
276 for metagenome in repeat_list: | 229 for metagenome in repeat_list: |
277 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' | 230 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' |
278 file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' | 231 file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' |
279 fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' | 232 fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' |
280 with open(fileout, 'w') as stdout: | 233 with open(fileout, 'w') as stdout: |
281 p1 = subprocess.Popen(['cat', file1, file2], | 234 p1 = subprocess.Popen(['cat', file1, file2], |
282 stdout=subprocess.PIPE) | 235 stdout=subprocess.PIPE) |
283 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, | 236 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, |
284 stdout=subprocess.PIPE) | 237 stdout=subprocess.PIPE) |
285 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, | 238 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, |
286 stdout=subprocess.PIPE) | 239 stdout=subprocess.PIPE) |
287 p4 = subprocess.Popen(['sort'], stdin=p3.stdout, | 240 p4 = subprocess.Popen(['sort'], stdin=p3.stdout, |
288 stdout=subprocess.PIPE) | 241 stdout=subprocess.PIPE) |
289 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) | 242 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) |
290 p5.communicate() | 243 p5.communicate() |
291 stdout.close() | 244 stdout.close() |
292 print('completed ...') | 245 print('completed ...') |
293 ############################################################################## | 246 |
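The cat | cut -f1 | cut -f1 -d/ | sort | uniq pipeline reduces each bowtie output to a sorted, de-duplicated list of read IDs, with any /1 or /2 mate suffix stripped. A rough pure-Python equivalent, for reference only (the script itself shells out as shown above), assuming bowtie's default tab-separated output with the read ID in field 1:

    with open(file1) as fin:
        # read ID only, mate suffix dropped
        ids = {line.split('\t')[0].split('/')[0] for line in fin}
    with open(fileout, 'w') as out:
        out.writelines(f"{read_id}\n" for read_id in sorted(ids))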
294 if paired_end == 'FALSE': | |
295 if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): | |
296 os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') | |
297 folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' | |
298 ############################################################################## | |
299 ps = [] | |
300 ticker = 0 | |
301 print("Processing repeat psuedogenomes...") | |
302 for metagenome in repeat_list: | |
303 metagenomepath = setup_folder + os.path.sep + metagenome | |
304 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' | |
305 with open(file1, 'w') as stdout: | |
306 command = shlex.split("bowtie " + b_opt + " " + | |
307 metagenomepath + " " + fastqfile_1) | |
308 p = subprocess.Popen(command, stdout=stdout) | |
309 ps.append(p) | |
310 ticker += 1 | |
311 if ticker == cpus: | |
312 for p in ps: | |
313 p.communicate() | |
314 ticker = 0 | |
315 ps = [] | |
316 if len(ps) > 0: | |
317 for p in ps: | |
318 p.communicate() | |
319 stdout.close() | |
320 | |
321 ############################################################################## | |
322 # combine the output from both read pairs: | |
323 print('Sorting and combining the output for both read pairs....') | |
324 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): | |
325 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') | |
326 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' | |
327 for metagenome in repeat_list: | |
328 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' | |
329 fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' | |
330 with open(fileout, 'w') as stdout: | |
331 p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) | |
332 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, | |
333 stdout=subprocess.PIPE) | |
334 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, | |
335 stdout=subprocess.PIPE) | |
336 p4 = subprocess.Popen(['sort'], stdin=p3.stdout, | |
337 stdout=subprocess.PIPE) | |
338 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) | |
339 p5.communicate() | |
340 stdout.close() | |
341 print('completed ...') | |
342 | |
343 ############################################################################## | |
344 # build a file of repeat keys for all reads | 247 # build a file of repeat keys for all reads |
345 print('Writing and processing intermediate files...') | 248 print('Writing and processing intermediate files...') |
346 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' | 249 sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie') |
250 sumofrepeatreads = 0 | |
347 readid = {} | 251 readid = {} |
348 sumofrepeatreads = 0 | 252 |
349 for rep in repeat_list: | 253 for rep in repeat_list: |
350 for data in import_text(sorted_bowtie + os.path.sep + | 254 for data in import_text( |
351 rep + '.bowtie', '\t'): | 255 f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): |
352 readid[data[0]] = '' | 256 readid[data[0]] = '' |
257 | |
353 for rep in repeat_list: | 258 for rep in repeat_list: |
354 for data in import_text(sorted_bowtie + os.path.sep | 259 for data in import_text( |
355 + rep + '.bowtie', '\t'): | 260 f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): |
356 readid[data[0]] += str(repeat_key[rep]) + str(',') | 261 readid[data[0]] += f"{repeat_key[rep]}," |
262 | |
357 for subfamilies in readid.values(): | 263 for subfamilies in readid.values(): |
358 if subfamilies not in counts: | 264 if subfamilies not in counts: |
359 counts[subfamilies] = 0 | 265 counts[subfamilies] = 0 |
360 counts[subfamilies] += 1 | 266 counts[subfamilies] += 1 |
361 sumofrepeatreads += 1 | 267 sumofrepeatreads += 1 |
362 del readid | 268 |
363 print('Identified ' + str(sumofrepeatreads) + | 269 print(f'Identified {sumofrepeatreads} reads that mapped to \ |
364 ' reads that mapped to repeats for unique and multimappers.') | 270 repeats for unique and multimappers.') |
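Each multimapping read thus ends up keyed by a "signature" string of the pseudogenome indices it aligned to, and counts tallies reads per signature. Illustrative values:

    # a read aligning to pseudogenomes 4 and 7 accumulates
    # readid[read] == '4,7,'; after the tally, counts['4,7,'] holds the
    # number of reads sharing exactly that signature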
365 | |
366 ############################################################################## | |
367 print("Conducting final calculations...") | 271 print("Conducting final calculations...") |
368 | |
369 | |
370 def convert(x): | |
371 ''' | |
372 build a converter to numeric label for repeat and yield a combined list | |
373 of repnames separated by os.path.sep |
374 ''' | |
375 x = x.strip(',') | |
376 x = x.split(',') | |
377 global repname | |
378 repname = "" | |
379 for i in x: | |
380 repname = repname + os.path.sep + rev_repeat_key[int(i)] | |
381 | |
382 | 272 |
383 # building the total counts for repeat element enrichment... | 273 # building the total counts for repeat element enrichment... |
384 for x in counts.keys(): | 274 for x in counts.keys(): |
385 count = counts[x] | 275 count = counts[x] |
386 x = x.strip(',') | 276 x = x.strip(',').split(',') |
387 x = x.split(',') | |
388 for i in x: | 277 for i in x: |
389 reptotalcounts[rev_repeat_key[int(i)]] += int(count) | 278 reptotalcounts[rev_repeat_key[int(i)]] += int(count) |
390 # building the fractional counts for repeat element enrichment... | 279 # building the fractional counts for repeat element enrichment... |
391 for x in counts.keys(): | 280 for x in counts.keys(): |
392 count = counts[x] | 281 count = counts[x] |
393 x = x.strip(',') | 282 x = x.strip(',').split(',') |
394 x = x.split(',') | |
395 splits = len(x) | 283 splits = len(x) |
396 for i in x: | 284 for i in x: |
397 fractionalcounts[rev_repeat_key[int(i)]] += float( | 285 fractionalcounts[rev_repeat_key[int(i)]] += float( |
398 numpy.divide(float(count), float(splits))) | 286 numpy.divide(float(count), float(splits))) |
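The fractional pass splits each signature's tally evenly over its member repeats. Continuing the illustrative signature above:

    # if counts['4,7,'] == 10 then splits == 2 and each member gains 5.0:
    # fractionalcounts[rev_repeat_key[4]] += 5.0
    # fractionalcounts[rev_repeat_key[7]] += 5.0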
399 # building categorized table of repeat element enrichment... | 287 # building categorized table of repeat element enrichment... |
400 repcounts = {} | 288 repcounts = {} |
401 repcounts['other'] = 0 | 289 repcounts['other'] = 0 |
402 for key in counts.keys(): | 290 for key in counts.keys(): |
403 convert(key) | 291 key_list = key.strip(',').split(',') |
292 repname = '' | |
293 for i in key_list: | |
| 294 repname += os.path.sep + rev_repeat_key[int(i)] |
404 repcounts[repname] = counts[key] | 295 repcounts[repname] = counts[key] |
405 # building the total counts for class enrichment... | 296 # building the total counts for class enrichment... |
406 for key in reptotalcounts.keys(): | 297 for key in reptotalcounts.keys(): |
407 classtotalcounts[repeatclass[key]] += reptotalcounts[key] | 298 classtotalcounts[repeatclass[key]] += reptotalcounts[key] |
408 # building total counts for family enrichment... | 299 # building total counts for family enrichment... |
409 for key in reptotalcounts.keys(): | 300 for key in reptotalcounts.keys(): |
410 familytotalcounts[repeatfamily[key]] += reptotalcounts[key] | 301 familytotalcounts[repeatfamily[key]] += reptotalcounts[key] |
411 # building unique counts table' | 302 # building unique counts table |
412 repcounts2 = {} | 303 repcounts2 = {} |
413 for rep in repeat_list: | 304 for rep in repeat_list: |
414 if "/" + rep in repcounts: | 305 if "/" + rep in repcounts: |
415 repcounts2[rep] = repcounts["/" + rep] | 306 repcounts2[rep] = repcounts["/" + rep] |
416 else: | 307 else: |
417 repcounts2[rep] = 0 | 308 repcounts2[rep] = 0 |
418 # building fractional counts for class enrichment... | 309 # building fractional counts for class enrichment... |
419 for key in fractionalcounts.keys(): | 310 for key in fractionalcounts.keys(): |
420 classfractionalcounts[repeatclass[key]] += fractionalcounts[key] | 311 classfractionalcounts[repeatclass[key]] += fractionalcounts[key] |
421 # building fractional counts for family enrichment... | 312 # building fractional counts for family enrichment... |
422 for key in fractionalcounts.keys(): | 313 for key in fractionalcounts.keys(): |
423 familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] | 314 familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] |
424 | 315 |
425 ############################################################################## | |
426 print('Writing final output and removing intermediate files...') | |
427 # print output to file of the categorized counts and total overlapping counts: | 316 # print output to file of the categorized counts and total overlapping counts: |
428 if allcountmethod == "TRUE": | 317 print('Writing final output...') |
429 fout1 = open(outputfolder + os.path.sep + outputfile_prefix | 318 with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" |
430 + '_total_counts.txt', 'w') | 319 f"class_fraction_counts.txt", 'w') as fout: |
431 for key in sorted(reptotalcounts.keys()): | |
432 fout1.write(str(key) + '\t' + repeatclass[key] + '\t' + | |
433 repeatfamily[key] + '\t' + str(reptotalcounts[key]) | |
434 + '\n') | |
435 fout2 = open(outputfolder + os.path.sep + outputfile_prefix | |
436 + '_class_total_counts.txt', 'w') | |
437 for key in sorted(classtotalcounts.keys()): | |
438 fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n') | |
439 fout3 = open(outputfolder + os.path.sep + outputfile_prefix | |
440 + '_family_total_counts.txt', 'w') | |
441 for key in sorted(familytotalcounts.keys()): | |
442 fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n') | |
443 fout4 = open(outputfolder + os.path.sep + outputfile_prefix + | |
444 '_unique_counts.txt', 'w') | |
445 for key in sorted(repcounts2.keys()): | |
446 fout4.write(str(key) + '\t' + repeatclass[key] + '\t' + | |
447 repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n') | |
448 fout5 = open(outputfolder + os.path.sep + outputfile_prefix | |
449 + '_class_fraction_counts.txt', 'w') | |
450 for key in sorted(classfractionalcounts.keys()): | 320 for key in sorted(classfractionalcounts.keys()): |
451 fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') | 321 fout.write(f"{key}\t{classfractionalcounts[key]}\n") |
452 fout6 = open(outputfolder + os.path.sep + outputfile_prefix + | 322 |
453 '_family_fraction_counts.txt', 'w') | 323 with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" |
324 f"family_fraction_counts.txt", 'w') as fout: | |
454 for key in sorted(familyfractionalcounts.keys()): | 325 for key in sorted(familyfractionalcounts.keys()): |
455 fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') | 326 fout.write(f"{key}\t{familyfractionalcounts[key]}\n") |
456 fout7 = open(outputfolder + os.path.sep + outputfile_prefix | 327 |
457 + '_fraction_counts.txt', 'w') | 328 with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" |
329 f"fraction_counts.txt", 'w') as fout: | |
458 for key in sorted(fractionalcounts.keys()): | 330 for key in sorted(fractionalcounts.keys()): |
459 fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + | 331 fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t" |
460 repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) | 332 f"{int(fractionalcounts[key])}\n") |
461 + '\n') | |
462 fout1.close() | |
463 fout2.close() | |
464 fout3.close() | |
465 fout4.close() | |
466 fout5.close() | |
467 fout6.close() | |
468 fout7.close() | |
469 else: | |
470 fout1 = open(outputfolder + os.path.sep + outputfile_prefix + | |
471 '_class_fraction_counts.txt', 'w') | |
472 for key in sorted(classfractionalcounts.keys()): | |
473 fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') | |
474 fout2 = open(outputfolder + os.path.sep + outputfile_prefix + | |
475 '_family_fraction_counts.txt', 'w') | |
476 for key in sorted(familyfractionalcounts.keys()): | |
477 fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') | |
478 fout3 = open(outputfolder + os.path.sep + outputfile_prefix + | |
479 '_fraction_counts.txt', 'w') | |
480 for key in sorted(fractionalcounts.keys()): | |
481 fout3.write(str(key) + '\t' + repeatclass[key] + '\t' + | |
482 repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) | |
483 + '\n') | |
484 fout1.close() | |
485 fout2.close() | |
486 fout3.close() | |
487 ############################################################################## | |
488 # Remove Large intermediate files | |
489 if os.path.exists(outputfolder + os.path.sep + outputfile_prefix + | |
490 '_regionsorter.txt'): | |
491 os.remove(outputfolder + os.path.sep + outputfile_prefix + | |
492 '_regionsorter.txt') | |
493 if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): | |
494 shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie') | |
495 if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): | |
496 shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie') | |
497 if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): | |
498 shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie') | |
499 print("... Done") |