Mercurial > repos > artbio > repenrich2
comparison RepEnrich2.py @ 7:61e0404f0d76 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 067aa46ea7482d640369b95d5b3dabec5793396b
author | artbio |
---|---|
date | Tue, 23 Apr 2024 21:44:44 +0000 |
parents | 388a47ca4199 |
children | 567549a49eb2 |
comparison
equal
deleted
inserted
replaced
6:388a47ca4199 | 7:61e0404f0d76 |
---|---|
75 file = csv.reader(open(filename), delimiter=separator, | 75 file = csv.reader(open(filename), delimiter=separator, |
76 skipinitialspace=True) | 76 skipinitialspace=True) |
77 return [line for line in file if starts_with_numerical(line)] | 77 return [line for line in file if starts_with_numerical(line)] |
78 | 78 |
79 | 79 |
80 def run_bowtie(args): | |
81 metagenome, fastqfile = args | |
82 b_opt = "-k 1 -p 1 --quiet --no-hd" | |
83 command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}") | |
84 bowtie_align = subprocess.run(command, check=True, | |
85 capture_output=True, text=True).stdout | |
86 bowtie_align = bowtie_align.rstrip('\r\n').split('\n') | |
87 readlist = [metagenome] | |
88 if paired_end: | |
89 for line in bowtie_align: | |
90 readlist.append(line.split("/")[0]) | |
91 else: | |
92 for line in bowtie_align: | |
93 readlist.append(line.split("\t")[0]) | |
94 return readlist | |
95 | |
96 | |
97 # set a reference repeat list for the script | 80 # set a reference repeat list for the script |
98 repeat_list = [listline[9].translate( | 81 repeat_list = [listline[9].translate( |
99 str.maketrans( | 82 str.maketrans( |
100 '()/', '___')) for listline in import_text(annotation_file, ' ')] | 83 '()/', '___')) for listline in import_text(annotation_file, ' ')] |
101 repeat_list = sorted(list(set(repeat_list))) | 84 repeat_list = sorted(list(set(repeat_list))) |
119 with open("unique_mapper_counts.tsv", 'w') as fout: | 102 with open("unique_mapper_counts.tsv", 'w') as fout: |
120 fout.write("#element\tcount\n") | 103 fout.write("#element\tcount\n") |
121 for count in sorted(counts): | 104 for count in sorted(counts): |
122 fout.write(f"{count}\t{counts[count]}\n") | 105 fout.write(f"{count}\t{counts[count]}\n") |
123 | 106 |
107 | |
108 def run_bowtie(args): | |
109 ''' | |
110 write to files to save memory | |
111 ''' | |
112 metagenome, fastqfile = args | |
113 b_opt = "-k 1 -p 1 --quiet --no-hd" | |
114 command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}") | |
115 bowtie_align = subprocess.run(command, check=True, | |
116 capture_output=True, text=True).stdout | |
117 bowtie_align = bowtie_align.rstrip('\r\n').split('\n') | |
118 with open(f"{metagenome}.reads", "a+") as readfile: | |
119 for line in bowtie_align: | |
120 read = line.split()[0].split("/")[0] | |
121 if read: | |
122 readfile.write(f"{read}\n") | |
123 | |
124 | |
124 # multimapper parsing | 125 # multimapper parsing |
125 if not paired_end: | 126 args_list = [(metagenome, fastqfile_1) for metagenome in repeat_list] |
126 args_list = [(metagenome, fastqfile_1) for | 127 if paired_end: |
127 metagenome in repeat_list] | |
128 with ProcessPoolExecutor(max_workers=cpus) as executor: | |
129 results = executor.map(run_bowtie, args_list) | |
130 else: | |
131 args_list = [(metagenome, fastqfile_1) for | |
132 metagenome in repeat_list] | |
133 args_list.extend([(metagenome, fastqfile_2) for | 128 args_list.extend([(metagenome, fastqfile_2) for |
134 metagenome in repeat_list]) | 129 metagenome in repeat_list]) |
135 with ProcessPoolExecutor(max_workers=cpus) as executor: | 130 with ProcessPoolExecutor(max_workers=cpus) as executor: |
136 results = executor.map(run_bowtie, args_list) | 131 results = executor.map(run_bowtie, args_list) |
137 | 132 |
138 # Aggregate results (avoiding race conditions) | 133 # Aggregate results (avoiding race conditions) |
139 metagenome_reads = defaultdict(list) # repeat_name: list of multimap reads | 134 metagenome_reads = defaultdict(list) # metagenome: list of multimap reads |
140 for result in results: | 135 |
141 metagenome_reads[result[0]] += result[1:] | 136 # Now we read .reads file to populate metagnomes_reads |
142 | 137 for metagenome in repeat_list: |
143 for name in metagenome_reads: | 138 with open(f"{metagenome}.reads") as readfile: |
144 # read are only once in list | 139 for read in readfile: |
145 metagenome_reads[name] = list(set(metagenome_reads[name])) | 140 metagenome_reads[metagenome].append(read) |
146 # remove "no read" instances | 141 # read are only once in list |
147 metagenome_reads[name] = [read for read in metagenome_reads[name] | 142 metagenome_reads[metagenome] = list(set(metagenome_reads[metagenome])) |
148 if read != ""] | |
149 | 143 |
150 # implement repeats_by_reads from the inverse dictionnary metagenome_reads | 144 # implement repeats_by_reads from the inverse dictionnary metagenome_reads |
151 repeats_by_reads = defaultdict(list) # readids: list of repeats names | 145 repeats_by_reads = defaultdict(list) # readids: list of repeats names |
152 for repname in metagenome_reads: | 146 for repname in metagenome_reads: |
153 for read in metagenome_reads[repname]: | 147 for read in metagenome_reads[repname]: |