Mercurial > repos > artbio > repenrich
comparison RepEnrich_setup.py @ 13:530626b0757c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
| author | artbio | 
|---|---|
| date | Tue, 02 Apr 2024 21:16:37 +0000 | 
| parents | 89e05f831259 | 
| children | bf866bedd4b4 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 12:89e05f831259 | 13:530626b0757c | 
|---|---|
| 3 import csv | 3 import csv | 
| 4 import os | 4 import os | 
| 5 import shlex | 5 import shlex | 
| 6 import subprocess | 6 import subprocess | 
| 7 import sys | 7 import sys | 
| 8 from collections import defaultdict | |
| 9 from concurrent.futures import ProcessPoolExecutor | |
| 10 | |
| 8 | 11 | 
| 9 from Bio import SeqIO | 12 from Bio import SeqIO | 
| 10 from Bio.Seq import Seq | 13 from Bio.Seq import Seq | 
| 11 from Bio.SeqRecord import SeqRecord | 14 from Bio.SeqRecord import SeqRecord | 
| 12 | 15 | 
| 20 interest. Download from RepeatMasker.org\ | 23 interest. Download from RepeatMasker.org\ | 
| 21 Example: mm9.fa.out''') | 24 Example: mm9.fa.out''') | 
| 22 parser.add_argument('--genomefasta', action='store', metavar='genomefasta', | 25 parser.add_argument('--genomefasta', action='store', metavar='genomefasta', | 
| 23 help='''Genome of interest in fasta format.\ | 26 help='''Genome of interest in fasta format.\ | 
| 24 Example: mm9.fa''') | 27 Example: mm9.fa''') | 
| 25 parser.add_argument('--setup_folder', action='store', metavar='setup_folder', | |
| 26 help='''Folder that contains bowtie indexes of repeats and\ | |
| 27 repeat element psuedogenomes.\ | |
| 28 Example working/setup''') | |
| 29 parser.add_argument('--gaplength', action='store', dest='gaplength', | 28 parser.add_argument('--gaplength', action='store', dest='gaplength', | 
| 30 metavar='gaplength', default='200', type=int, | 29 metavar='gaplength', default='200', type=int, | 
| 31 help='''Length of the N-spacer in the\ | 30 help='''Length of the N-spacer in the\ | 
| 32 repeat pseudogenomes. Default 200''') | 31 repeat pseudogenomes. Default 200''') | 
| 33 parser.add_argument('--flankinglength', action='store', dest='flankinglength', | 32 parser.add_argument('--flankinglength', action='store', dest='flankinglength', | 
| 34 metavar='flankinglength', default='25', type=int, | 33 metavar='flankinglength', default='25', type=int, | 
| 35 help='''Length of the flanking regions used to build\ | 34 help='''Length of the flanking regions used to build\ | 
| 36 repeat pseudogenomes. Flanking length should be set\ | 35 repeat pseudogenomes. Flanking length should be set\ | 
| 37 according to the length of your reads.\ | 36 according to the length of your reads.\ | 
| 38 Default 25, for 50 nt reads''') | 37 Default 25, for 50 nt reads''') | 
| 38 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', | |
| 39 default="1", type=int, | |
| 40 help='Number of CPUs. The more cpus the\ | |
| 41 faster RepEnrich performs. Default: "1"') | |
| 39 args = parser.parse_args() | 42 args = parser.parse_args() | 
| 40 | 43 | 
| 41 # parameters from argsparse | 44 # parameters from argsparse | 
| 42 gapl = args.gaplength | 45 gapl = args.gaplength | 
| 43 flankingl = args.flankinglength | 46 flankingl = args.flankinglength | 
| 44 annotation_file = args.annotation_file | 47 annotation_file = args.annotation_file | 
| 45 genomefasta = args.genomefasta | 48 genomefasta = args.genomefasta | 
| 46 setup_folder = args.setup_folder | 49 cpus = args.cpus | 
| 47 | 50 | 
| 48 # check that the programs we need are available | 51 # check that the programs we need are available | 
| 49 try: | 52 try: | 
| 50 subprocess.call(shlex.split("bowtie --version"), | 53 subprocess.call(shlex.split("bowtie --version"), | 
| 51 stdout=open(os.devnull, 'wb'), | 54 stdout=open(os.devnull, 'wb'), | 
| 52 stderr=open(os.devnull, 'wb')) | 55 stderr=open(os.devnull, 'wb')) | 
| 53 except OSError: | 56 except OSError: | 
| 54 print("Error: Bowtie not available in the path") | 57 print("Error: Bowtie not available in the path") | 
| 55 raise | 58 raise | 
| 56 | 59 | 
| 57 # Define a text importer | 60 | 
| 58 csv.field_size_limit(sys.maxsize) | 61 def starts_with_numerical(list): | 
| 62 try: | |
| 63 if len(list) == 0: | |
| 64 return False | |
| 65 int(list[0]) | |
| 66 return True | |
| 67 except ValueError: | |
| 68 return False | |
| 59 | 69 | 
| 60 | 70 | 
| 71 # define a text importer for .out/.txt format of repbase | |
| 61 def import_text(filename, separator): | 72 def import_text(filename, separator): | 
| 62 for line in csv.reader(open(os.path.realpath(filename)), | 73 csv.field_size_limit(sys.maxsize) | 
| 63 delimiter=separator, skipinitialspace=True): | 74 file = csv.reader(open(filename), delimiter=separator, | 
| 64 if line: | 75 skipinitialspace=True) | 
| 65 yield line | 76 return [line for line in file if starts_with_numerical(line)] | 
| 66 | 77 | 
| 67 | 78 | 
| 68 # Make a setup folder | |
| 69 if not os.path.exists(setup_folder): | |
| 70 os.makedirs(setup_folder) | |
| 71 # load genome into dictionary and compute length | 79 # load genome into dictionary and compute length | 
| 72 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 80 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 
| 73 idxgenome, lgenome, genome = {}, {}, {} | 81 genome = defaultdict(dict) | 
| 74 | 82 | 
| 75 for k, chr in enumerate(g.keys()): | 83 for chr in g.keys(): | 
| 76 genome[chr] = g[chr].seq | 84 genome[chr]['sequence'] = g[chr].seq | 
| 77 lgenome[chr] = len(genome[chr]) | 85 genome[chr]['length'] = len(g[chr].seq) | 
| 78 idxgenome[chr] = k | |
| 79 | 86 | 
| 80 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 87 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 
| 81 repeat_elements = [] | 88 repeat_elements = set() | 
| 82 # these dictionaries will contain lists | 89 rep_coords = defaultdict(list) # Merged dictionary for coordinates | 
| 83 rep_chr, rep_start, rep_end = {}, {}, {} | 90 | 
| 84 fin = import_text(annotation_file, ' ') | 91 with open('repnames.bed', 'w') as fout: | 
| 85 with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout: | 92 f_in = import_text(annotation_file, ' ') | 
| 86 for i in range(3): | 93 for line in f_in: | 
| 87 next(fin) | |
| 88 for line in fin: | |
| 89 repname = line[9].translate(str.maketrans('()/', '___')) | 94 repname = line[9].translate(str.maketrans('()/', '___')) | 
| 90 if repname not in repeat_elements: | 95 repeat_elements.add(repname) | 
| 91 repeat_elements.append(repname) | 96 repchr, repstart, repend = line[4], line[5], line[6] | 
| 92 repchr = line[4] | 97 fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n") | 
| 93 repstart = line[5] | 98 rep_coords[repname].extend([repchr, repstart, repend]) | 
| 94 repend = line[6] | 99 # repeat_elements now contains the unique repeat names | 
| 95 fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n') | 100 # rep_coords is a dictionary where keys are repeat names and values are lists | 
| 96 if repname in rep_chr: | 101 # containing chromosome, start, and end coordinates for each repeat instance | 
| 97 rep_chr[repname].append(repchr) | |
| 98 rep_start[repname].append(repstart) | |
| 99 rep_end[repname].append(repend) | |
| 100 else: | |
| 101 rep_chr[repname] = [repchr] | |
| 102 rep_start[repname] = [repstart] | |
| 103 rep_end[repname] = [repend] | |
| 104 | 102 | 
| 105 # sort repeat_elements and print them in repgenomes_key.txt | 103 # sort repeat_elements and print them in repeatIDs.txt | 
| 106 with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout: | 104 with open('repeatIDs.txt', 'w') as fout: | 
| 107 for i, repeat in enumerate(sorted(repeat_elements)): | 105 for i, repeat in enumerate(sorted(repeat_elements)): | 
| 108 fout.write('\t'.join([repeat, str(i)]) + '\n') | 106 fout.write('\t'.join([repeat, str(i)]) + '\n') | 
| 109 | 107 | 
| 110 # generate spacer for pseudogenomes | 108 # generate spacer for pseudogenomes | 
| 111 spacer = ''.join(['N' for i in range(gapl)]) | 109 spacer = ''.join(['N' for i in range(gapl)]) | 
| 112 | 110 | 
| 113 # generate metagenomes and save them to FASTA files for bowtie build | 111 # generate metagenomes and save them to FASTA files for bowtie build | 
| 114 for repname in rep_chr: | 112 for repname in rep_coords: | 
| 115 metagenome = '' | 113 metagenome = '' | 
| 116 for i, repeat in enumerate(rep_chr[repname]): | 114 # iterating coordinate list by block of 3 (chr, start, end) | 
| 117 try: | 115 block = 3 | 
| 118 chromosome = rep_chr[repname][i] | 116 for i in range(0, len(rep_coords[repname]) - block + 1, block): | 
| 119 start = max(int(rep_start[repname][i]) - flankingl, 0) | 117 batch = rep_coords[repname][i:i+block] | 
| 120 end = min(int(rep_end[repname][i]) + flankingl, | 118 print(batch) | 
| 121 int(lgenome[chr])-1) + 1 | 119 chromosome = batch[0] | 
| 122 metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}" | 120 start = max(int(batch[1]) - flankingl, 0) | 
| 123 except KeyError: | 121 end = min(int(batch[2]) + flankingl, | 
| 124 print("Unrecognised Chromosome: " + rep_chr[repname][i]) | 122 int(genome[chromosome]['length'])-1) + 1 | 
| 123 metagenome = ( | |
| 124 f"{metagenome}{spacer}" | |
| 125 f"{genome[chromosome]['sequence'][start:end]}" | |
| 126 ) | |
| 125 | 127 | 
| 126 # Create Fasta of repeat pseudogenome | 128 # Create Fasta of repeat pseudogenome | 
| 127 fastafilename = f"{os.path.join(setup_folder, repname)}.fa" | 129 fastafilename = f"{repname}.fa" | 
| 128 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | 130 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | 
| 129 SeqIO.write(record, fastafilename, "fasta") | 131 SeqIO.write(record, fastafilename, "fasta") | 
| 130 | 132 | 
| 131 # Generate repeat pseudogenome bowtie index | 133 | 
| 132 bowtie_build_cmd = ["bowtie-build", "-f", fastafilename, | 134 def bowtie_build(args): | 
| 133 os.path.join(setup_folder, repname)] | 135 """ | 
| 134 subprocess.run(bowtie_build_cmd, check=True) | 136 Function to be executed in parallel by ProcessPoolExecutor. | 
| 137 """ | |
| 138 try: | |
| 139 bowtie_base, fasta = args | |
| 140 command = shlex.split(f"bowtie-build -f {fasta} {bowtie_base}") | |
| 141 squash = subprocess.run(command, capture_output=True, text=True) | |
| 142 return squash.stdout | |
| 143 except Exception as e: | |
| 144 return str(e) | |
| 145 | |
| 146 | |
| 147 args_list = [(name, f"{name}.fa") for name in rep_coords] | |
| 148 with ProcessPoolExecutor(max_workers=cpus) as executor: | |
| 149 executor.map(bowtie_build, args_list) | 
