comparison RepEnrich2.py @ 7:61e0404f0d76 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 067aa46ea7482d640369b95d5b3dabec5793396b
author artbio
date Tue, 23 Apr 2024 21:44:44 +0000
parents 388a47ca4199
children 567549a49eb2
comparison
equal deleted inserted replaced
6:388a47ca4199 7:61e0404f0d76
75 file = csv.reader(open(filename), delimiter=separator, 75 file = csv.reader(open(filename), delimiter=separator,
76 skipinitialspace=True) 76 skipinitialspace=True)
77 return [line for line in file if starts_with_numerical(line)] 77 return [line for line in file if starts_with_numerical(line)]
78 78
79 79
80 def run_bowtie(args):
81 metagenome, fastqfile = args
82 b_opt = "-k 1 -p 1 --quiet --no-hd"
83 command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}")
84 bowtie_align = subprocess.run(command, check=True,
85 capture_output=True, text=True).stdout
86 bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
87 readlist = [metagenome]
88 if paired_end:
89 for line in bowtie_align:
90 readlist.append(line.split("/")[0])
91 else:
92 for line in bowtie_align:
93 readlist.append(line.split("\t")[0])
94 return readlist
95
96
97 # set a reference repeat list for the script 80 # set a reference repeat list for the script
98 repeat_list = [listline[9].translate( 81 repeat_list = [listline[9].translate(
99 str.maketrans( 82 str.maketrans(
100 '()/', '___')) for listline in import_text(annotation_file, ' ')] 83 '()/', '___')) for listline in import_text(annotation_file, ' ')]
101 repeat_list = sorted(list(set(repeat_list))) 84 repeat_list = sorted(list(set(repeat_list)))
119 with open("unique_mapper_counts.tsv", 'w') as fout: 102 with open("unique_mapper_counts.tsv", 'w') as fout:
120 fout.write("#element\tcount\n") 103 fout.write("#element\tcount\n")
121 for count in sorted(counts): 104 for count in sorted(counts):
122 fout.write(f"{count}\t{counts[count]}\n") 105 fout.write(f"{count}\t{counts[count]}\n")
123 106
107
108 def run_bowtie(args):
109 '''
110 write to files to save memory
111 '''
112 metagenome, fastqfile = args
113 b_opt = "-k 1 -p 1 --quiet --no-hd"
114 command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}")
115 bowtie_align = subprocess.run(command, check=True,
116 capture_output=True, text=True).stdout
117 bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
118 with open(f"{metagenome}.reads", "a+") as readfile:
119 for line in bowtie_align:
120 read = line.split()[0].split("/")[0]
121 if read:
122 readfile.write(f"{read}\n")
123
124
124 # multimapper parsing 125 # multimapper parsing
125 if not paired_end: 126 args_list = [(metagenome, fastqfile_1) for metagenome in repeat_list]
126 args_list = [(metagenome, fastqfile_1) for 127 if paired_end:
127 metagenome in repeat_list]
128 with ProcessPoolExecutor(max_workers=cpus) as executor:
129 results = executor.map(run_bowtie, args_list)
130 else:
131 args_list = [(metagenome, fastqfile_1) for
132 metagenome in repeat_list]
133 args_list.extend([(metagenome, fastqfile_2) for 128 args_list.extend([(metagenome, fastqfile_2) for
134 metagenome in repeat_list]) 129 metagenome in repeat_list])
135 with ProcessPoolExecutor(max_workers=cpus) as executor: 130 with ProcessPoolExecutor(max_workers=cpus) as executor:
136 results = executor.map(run_bowtie, args_list) 131 results = executor.map(run_bowtie, args_list)
137 132
138 # Aggregate results (avoiding race conditions) 133 # Aggregate results (avoiding race conditions)
139 metagenome_reads = defaultdict(list) # repeat_name: list of multimap reads 134 metagenome_reads = defaultdict(list) # metagenome: list of multimap reads
140 for result in results: 135
141 metagenome_reads[result[0]] += result[1:] 136 # Now we read .reads file to populate metagnomes_reads
142 137 for metagenome in repeat_list:
143 for name in metagenome_reads: 138 with open(f"{metagenome}.reads") as readfile:
144 # read are only once in list 139 for read in readfile:
145 metagenome_reads[name] = list(set(metagenome_reads[name])) 140 metagenome_reads[metagenome].append(read)
146 # remove "no read" instances 141 # read are only once in list
147 metagenome_reads[name] = [read for read in metagenome_reads[name] 142 metagenome_reads[metagenome] = list(set(metagenome_reads[metagenome]))
148 if read != ""]
149 143
150 # implement repeats_by_reads from the inverse dictionnary metagenome_reads 144 # implement repeats_by_reads from the inverse dictionnary metagenome_reads
151 repeats_by_reads = defaultdict(list) # readids: list of repeats names 145 repeats_by_reads = defaultdict(list) # readids: list of repeats names
152 for repname in metagenome_reads: 146 for repname in metagenome_reads:
153 for read in metagenome_reads[repname]: 147 for read in metagenome_reads[repname]: