Mercurial > repos > artbio > repenrich
comparison RepEnrich_setup.py @ 13:530626b0757c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author | artbio |
---|---|
date | Tue, 02 Apr 2024 21:16:37 +0000 |
parents | 89e05f831259 |
children | bf866bedd4b4 |
comparison
equal
deleted
inserted
replaced
12:89e05f831259 | 13:530626b0757c |
---|---|
3 import csv | 3 import csv |
4 import os | 4 import os |
5 import shlex | 5 import shlex |
6 import subprocess | 6 import subprocess |
7 import sys | 7 import sys |
8 from collections import defaultdict | |
9 from concurrent.futures import ProcessPoolExecutor | |
10 | |
8 | 11 |
9 from Bio import SeqIO | 12 from Bio import SeqIO |
10 from Bio.Seq import Seq | 13 from Bio.Seq import Seq |
11 from Bio.SeqRecord import SeqRecord | 14 from Bio.SeqRecord import SeqRecord |
12 | 15 |
20 interest. Download from RepeatMasker.org\ | 23 interest. Download from RepeatMasker.org\ |
21 Example: mm9.fa.out''') | 24 Example: mm9.fa.out''') |
22 parser.add_argument('--genomefasta', action='store', metavar='genomefasta', | 25 parser.add_argument('--genomefasta', action='store', metavar='genomefasta', |
23 help='''Genome of interest in fasta format.\ | 26 help='''Genome of interest in fasta format.\ |
24 Example: mm9.fa''') | 27 Example: mm9.fa''') |
25 parser.add_argument('--setup_folder', action='store', metavar='setup_folder', | |
26 help='''Folder that contains bowtie indexes of repeats and\ | |
27 repeat element psuedogenomes.\ | |
28 Example working/setup''') | |
29 parser.add_argument('--gaplength', action='store', dest='gaplength', | 28 parser.add_argument('--gaplength', action='store', dest='gaplength', |
30 metavar='gaplength', default='200', type=int, | 29 metavar='gaplength', default='200', type=int, |
31 help='''Length of the N-spacer in the\ | 30 help='''Length of the N-spacer in the\ |
32 repeat pseudogenomes. Default 200''') | 31 repeat pseudogenomes. Default 200''') |
33 parser.add_argument('--flankinglength', action='store', dest='flankinglength', | 32 parser.add_argument('--flankinglength', action='store', dest='flankinglength', |
34 metavar='flankinglength', default='25', type=int, | 33 metavar='flankinglength', default='25', type=int, |
35 help='''Length of the flanking regions used to build\ | 34 help='''Length of the flanking regions used to build\ |
36 repeat pseudogenomes. Flanking length should be set\ | 35 repeat pseudogenomes. Flanking length should be set\ |
37 according to the length of your reads.\ | 36 according to the length of your reads.\ |
38 Default 25, for 50 nt reads''') | 37 Default 25, for 50 nt reads''') |
38 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', | |
39 default="1", type=int, | |
40 help='Number of CPUs. The more cpus the\ | |
41 faster RepEnrich performs. Default: "1"') | |
39 args = parser.parse_args() | 42 args = parser.parse_args() |
40 | 43 |
41 # parameters from argsparse | 44 # parameters from argsparse |
42 gapl = args.gaplength | 45 gapl = args.gaplength |
43 flankingl = args.flankinglength | 46 flankingl = args.flankinglength |
44 annotation_file = args.annotation_file | 47 annotation_file = args.annotation_file |
45 genomefasta = args.genomefasta | 48 genomefasta = args.genomefasta |
46 setup_folder = args.setup_folder | 49 cpus = args.cpus |
47 | 50 |
48 # check that the programs we need are available | 51 # check that the programs we need are available |
49 try: | 52 try: |
50 subprocess.call(shlex.split("bowtie --version"), | 53 subprocess.call(shlex.split("bowtie --version"), |
51 stdout=open(os.devnull, 'wb'), | 54 stdout=open(os.devnull, 'wb'), |
52 stderr=open(os.devnull, 'wb')) | 55 stderr=open(os.devnull, 'wb')) |
53 except OSError: | 56 except OSError: |
54 print("Error: Bowtie not available in the path") | 57 print("Error: Bowtie not available in the path") |
55 raise | 58 raise |
56 | 59 |
57 # Define a text importer | 60 |
58 csv.field_size_limit(sys.maxsize) | 61 def starts_with_numerical(list): |
62 try: | |
63 if len(list) == 0: | |
64 return False | |
65 int(list[0]) | |
66 return True | |
67 except ValueError: | |
68 return False | |
59 | 69 |
60 | 70 |
71 # define a text importer for .out/.txt format of repbase | |
61 def import_text(filename, separator): | 72 def import_text(filename, separator): |
62 for line in csv.reader(open(os.path.realpath(filename)), | 73 csv.field_size_limit(sys.maxsize) |
63 delimiter=separator, skipinitialspace=True): | 74 file = csv.reader(open(filename), delimiter=separator, |
64 if line: | 75 skipinitialspace=True) |
65 yield line | 76 return [line for line in file if starts_with_numerical(line)] |
66 | 77 |
67 | 78 |
68 # Make a setup folder | |
69 if not os.path.exists(setup_folder): | |
70 os.makedirs(setup_folder) | |
71 # load genome into dictionary and compute length | 79 # load genome into dictionary and compute length |
72 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 80 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) |
73 idxgenome, lgenome, genome = {}, {}, {} | 81 genome = defaultdict(dict) |
74 | 82 |
75 for k, chr in enumerate(g.keys()): | 83 for chr in g.keys(): |
76 genome[chr] = g[chr].seq | 84 genome[chr]['sequence'] = g[chr].seq |
77 lgenome[chr] = len(genome[chr]) | 85 genome[chr]['length'] = len(g[chr].seq) |
78 idxgenome[chr] = k | |
79 | 86 |
80 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 87 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter |
81 repeat_elements = [] | 88 repeat_elements = set() |
82 # these dictionaries will contain lists | 89 rep_coords = defaultdict(list) # Merged dictionary for coordinates |
83 rep_chr, rep_start, rep_end = {}, {}, {} | 90 |
84 fin = import_text(annotation_file, ' ') | 91 with open('repnames.bed', 'w') as fout: |
85 with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout: | 92 f_in = import_text(annotation_file, ' ') |
86 for i in range(3): | 93 for line in f_in: |
87 next(fin) | |
88 for line in fin: | |
89 repname = line[9].translate(str.maketrans('()/', '___')) | 94 repname = line[9].translate(str.maketrans('()/', '___')) |
90 if repname not in repeat_elements: | 95 repeat_elements.add(repname) |
91 repeat_elements.append(repname) | 96 repchr, repstart, repend = line[4], line[5], line[6] |
92 repchr = line[4] | 97 fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n") |
93 repstart = line[5] | 98 rep_coords[repname].extend([repchr, repstart, repend]) |
94 repend = line[6] | 99 # repeat_elements now contains the unique repeat names |
95 fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n') | 100 # rep_coords is a dictionary where keys are repeat names and values are lists |
96 if repname in rep_chr: | 101 # containing chromosome, start, and end coordinates for each repeat instance |
97 rep_chr[repname].append(repchr) | |
98 rep_start[repname].append(repstart) | |
99 rep_end[repname].append(repend) | |
100 else: | |
101 rep_chr[repname] = [repchr] | |
102 rep_start[repname] = [repstart] | |
103 rep_end[repname] = [repend] | |
104 | 102 |
105 # sort repeat_elements and print them in repgenomes_key.txt | 103 # sort repeat_elements and print them in repeatIDs.txt |
106 with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout: | 104 with open('repeatIDs.txt', 'w') as fout: |
107 for i, repeat in enumerate(sorted(repeat_elements)): | 105 for i, repeat in enumerate(sorted(repeat_elements)): |
108 fout.write('\t'.join([repeat, str(i)]) + '\n') | 106 fout.write('\t'.join([repeat, str(i)]) + '\n') |
109 | 107 |
110 # generate spacer for pseudogenomes | 108 # generate spacer for pseudogenomes |
111 spacer = ''.join(['N' for i in range(gapl)]) | 109 spacer = ''.join(['N' for i in range(gapl)]) |
112 | 110 |
113 # generate metagenomes and save them to FASTA files for bowtie build | 111 # generate metagenomes and save them to FASTA files for bowtie build |
114 for repname in rep_chr: | 112 for repname in rep_coords: |
115 metagenome = '' | 113 metagenome = '' |
116 for i, repeat in enumerate(rep_chr[repname]): | 114 # iterating coordinate list by block of 3 (chr, start, end) |
117 try: | 115 block = 3 |
118 chromosome = rep_chr[repname][i] | 116 for i in range(0, len(rep_coords[repname]) - block + 1, block): |
119 start = max(int(rep_start[repname][i]) - flankingl, 0) | 117 batch = rep_coords[repname][i:i+block] |
120 end = min(int(rep_end[repname][i]) + flankingl, | 118 print(batch) |
121 int(lgenome[chr])-1) + 1 | 119 chromosome = batch[0] |
122 metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}" | 120 start = max(int(batch[1]) - flankingl, 0) |
123 except KeyError: | 121 end = min(int(batch[2]) + flankingl, |
124 print("Unrecognised Chromosome: " + rep_chr[repname][i]) | 122 int(genome[chromosome]['length'])-1) + 1 |
123 metagenome = ( | |
124 f"{metagenome}{spacer}" | |
125 f"{genome[chromosome]['sequence'][start:end]}" | |
126 ) | |
125 | 127 |
126 # Create Fasta of repeat pseudogenome | 128 # Create Fasta of repeat pseudogenome |
127 fastafilename = f"{os.path.join(setup_folder, repname)}.fa" | 129 fastafilename = f"{repname}.fa" |
128 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | 130 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') |
129 SeqIO.write(record, fastafilename, "fasta") | 131 SeqIO.write(record, fastafilename, "fasta") |
130 | 132 |
131 # Generate repeat pseudogenome bowtie index | 133 |
132 bowtie_build_cmd = ["bowtie-build", "-f", fastafilename, | 134 def bowtie_build(args): |
133 os.path.join(setup_folder, repname)] | 135 """ |
134 subprocess.run(bowtie_build_cmd, check=True) | 136 Function to be executed in parallel by ProcessPoolExecutor. |
137 """ | |
138 try: | |
139 bowtie_base, fasta = args | |
140 command = shlex.split(f"bowtie-build -f {fasta} {bowtie_base}") | |
141 squash = subprocess.run(command, capture_output=True, text=True) | |
142 return squash.stdout | |
143 except Exception as e: | |
144 return str(e) | |
145 | |
146 | |
147 args_list = [(name, f"{name}.fa") for name in rep_coords] | |
148 with ProcessPoolExecutor(max_workers=cpus) as executor: | |
149 executor.map(bowtie_build, args_list) |