Mercurial > repos > artbio > repenrich2
comparison RepEnrich2.py @ 7:61e0404f0d76 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 067aa46ea7482d640369b95d5b3dabec5793396b
| author | artbio |
|---|---|
| date | Tue, 23 Apr 2024 21:44:44 +0000 |
| parents | 388a47ca4199 |
| children | 567549a49eb2 |
comparison
equal
deleted
inserted
replaced
| 6:388a47ca4199 | 7:61e0404f0d76 |
|---|---|
| 75 file = csv.reader(open(filename), delimiter=separator, | 75 file = csv.reader(open(filename), delimiter=separator, |
| 76 skipinitialspace=True) | 76 skipinitialspace=True) |
| 77 return [line for line in file if starts_with_numerical(line)] | 77 return [line for line in file if starts_with_numerical(line)] |
| 78 | 78 |
| 79 | 79 |
| 80 def run_bowtie(args): | |
| 81 metagenome, fastqfile = args | |
| 82 b_opt = "-k 1 -p 1 --quiet --no-hd" | |
| 83 command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}") | |
| 84 bowtie_align = subprocess.run(command, check=True, | |
| 85 capture_output=True, text=True).stdout | |
| 86 bowtie_align = bowtie_align.rstrip('\r\n').split('\n') | |
| 87 readlist = [metagenome] | |
| 88 if paired_end: | |
| 89 for line in bowtie_align: | |
| 90 readlist.append(line.split("/")[0]) | |
| 91 else: | |
| 92 for line in bowtie_align: | |
| 93 readlist.append(line.split("\t")[0]) | |
| 94 return readlist | |
| 95 | |
| 96 | |
| 97 # set a reference repeat list for the script | 80 # set a reference repeat list for the script |
| 98 repeat_list = [listline[9].translate( | 81 repeat_list = [listline[9].translate( |
| 99 str.maketrans( | 82 str.maketrans( |
| 100 '()/', '___')) for listline in import_text(annotation_file, ' ')] | 83 '()/', '___')) for listline in import_text(annotation_file, ' ')] |
| 101 repeat_list = sorted(list(set(repeat_list))) | 84 repeat_list = sorted(list(set(repeat_list))) |
| 119 with open("unique_mapper_counts.tsv", 'w') as fout: | 102 with open("unique_mapper_counts.tsv", 'w') as fout: |
| 120 fout.write("#element\tcount\n") | 103 fout.write("#element\tcount\n") |
| 121 for count in sorted(counts): | 104 for count in sorted(counts): |
| 122 fout.write(f"{count}\t{counts[count]}\n") | 105 fout.write(f"{count}\t{counts[count]}\n") |
| 123 | 106 |
| 107 | |
| 108 def run_bowtie(args): | |
| 109 ''' | |
| 110 write to files to save memory | |
| 111 ''' | |
| 112 metagenome, fastqfile = args | |
| 113 b_opt = "-k 1 -p 1 --quiet --no-hd" | |
| 114 command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}") | |
| 115 bowtie_align = subprocess.run(command, check=True, | |
| 116 capture_output=True, text=True).stdout | |
| 117 bowtie_align = bowtie_align.rstrip('\r\n').split('\n') | |
| 118 with open(f"{metagenome}.reads", "a+") as readfile: | |
| 119 for line in bowtie_align: | |
| 120 read = line.split()[0].split("/")[0] | |
| 121 if read: | |
| 122 readfile.write(f"{read}\n") | |
| 123 | |
| 124 | |
| 124 # multimapper parsing | 125 # multimapper parsing |
| 125 if not paired_end: | 126 args_list = [(metagenome, fastqfile_1) for metagenome in repeat_list] |
| 126 args_list = [(metagenome, fastqfile_1) for | 127 if paired_end: |
| 127 metagenome in repeat_list] | |
| 128 with ProcessPoolExecutor(max_workers=cpus) as executor: | |
| 129 results = executor.map(run_bowtie, args_list) | |
| 130 else: | |
| 131 args_list = [(metagenome, fastqfile_1) for | |
| 132 metagenome in repeat_list] | |
| 133 args_list.extend([(metagenome, fastqfile_2) for | 128 args_list.extend([(metagenome, fastqfile_2) for |
| 134 metagenome in repeat_list]) | 129 metagenome in repeat_list]) |
| 135 with ProcessPoolExecutor(max_workers=cpus) as executor: | 130 with ProcessPoolExecutor(max_workers=cpus) as executor: |
| 136 results = executor.map(run_bowtie, args_list) | 131 results = executor.map(run_bowtie, args_list) |
| 137 | 132 |
| 138 # Aggregate results (avoiding race conditions) | 133 # Aggregate results (avoiding race conditions) |
| 139 metagenome_reads = defaultdict(list) # repeat_name: list of multimap reads | 134 metagenome_reads = defaultdict(list) # metagenome: list of multimap reads |
| 140 for result in results: | 135 |
| 141 metagenome_reads[result[0]] += result[1:] | 136 # Now we read .reads file to populate metagnomes_reads |
| 142 | 137 for metagenome in repeat_list: |
| 143 for name in metagenome_reads: | 138 with open(f"{metagenome}.reads") as readfile: |
| 144 # read are only once in list | 139 for read in readfile: |
| 145 metagenome_reads[name] = list(set(metagenome_reads[name])) | 140 metagenome_reads[metagenome].append(read) |
| 146 # remove "no read" instances | 141 # read are only once in list |
| 147 metagenome_reads[name] = [read for read in metagenome_reads[name] | 142 metagenome_reads[metagenome] = list(set(metagenome_reads[metagenome])) |
| 148 if read != ""] | |
| 149 | 143 |
| 150 # implement repeats_by_reads from the inverse dictionnary metagenome_reads | 144 # implement repeats_by_reads from the inverse dictionnary metagenome_reads |
| 151 repeats_by_reads = defaultdict(list) # readids: list of repeats names | 145 repeats_by_reads = defaultdict(list) # readids: list of repeats names |
| 152 for repname in metagenome_reads: | 146 for repname in metagenome_reads: |
| 153 for read in metagenome_reads[repname]: | 147 for read in metagenome_reads[repname]: |
