comparison RepEnrich2_setup.py @ 4:c5bb2f9af708 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 6b3b1194af0de793a1b4892c5973da835f5c0902
author artbio
date Sat, 20 Apr 2024 23:23:40 +0000
parents 4905a332a094
children
comparison
equal deleted inserted replaced
3:0efb0ee6a7e9 4:c5bb2f9af708
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import argparse 2 import argparse
3 import csv 3 import csv
4 import os
5 import shlex 4 import shlex
6 import subprocess 5 import subprocess
7 import sys 6 import sys
8 from collections import defaultdict 7 from collections import defaultdict
9 from concurrent.futures import ProcessPoolExecutor 8 from concurrent.futures import ProcessPoolExecutor
46 flankingl = args.flankinglength 45 flankingl = args.flankinglength
47 annotation_file = args.annotation_file 46 annotation_file = args.annotation_file
48 genomefasta = args.genomefasta 47 genomefasta = args.genomefasta
49 cpus = args.cpus 48 cpus = args.cpus
50 49
51 # check that the programs we need are available
52 try:
53 subprocess.call(shlex.split("bowtie2 --version"),
54 stdout=open(os.devnull, 'wb'),
55 stderr=open(os.devnull, 'wb'))
56 except OSError:
57 print("Error: Bowtie2 not available in the path")
58 raise
59
60 50
61 def starts_with_numerical(list): 51 def starts_with_numerical(list):
62 try: 52 try:
63 if len(list) == 0: 53 if len(list) == 0:
64 return False 54 return False
66 return True 56 return True
67 except ValueError: 57 except ValueError:
68 return False 58 return False
69 59
70 60
71 # define a text importer for .out/.txt format of repbase 61 # text import function for .out/.txt format of repbase
72 def import_text(filename, separator): 62 def import_text(filename, separator):
73 csv.field_size_limit(sys.maxsize) 63 csv.field_size_limit(sys.maxsize)
74 file = csv.reader(open(filename), delimiter=separator, 64 file = csv.reader(open(filename), delimiter=separator,
75 skipinitialspace=True) 65 skipinitialspace=True)
76 return [line for line in file if starts_with_numerical(line)] 66 return [line for line in file if starts_with_numerical(line)]
79 # load genome into dictionary and compute length 69 # load genome into dictionary and compute length
80 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) 70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
81 genome = defaultdict(dict) 71 genome = defaultdict(dict)
82 72
83 for chr in g.keys(): 73 for chr in g.keys():
84 genome[chr]['sequence'] = g[chr].seq 74 genome[chr]['sequence'] = str(g[chr].seq)
85 genome[chr]['length'] = len(g[chr].seq) 75 genome[chr]['length'] = len(g[chr].seq)
86 76
87 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter 77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
88 repeat_elements = set() 78 repeat_elements = set()
89 rep_coords = defaultdict(list) # Merged dictionary for coordinates 79 rep_coords = defaultdict(list) # Merged dictionary for coordinates
108 # generate spacer for pseudogenomes 98 # generate spacer for pseudogenomes
109 spacer = ''.join(['N' for i in range(gapl)]) 99 spacer = ''.join(['N' for i in range(gapl)])
110 100
111 # generate metagenomes and save them to FASTA files for bowtie build 101 # generate metagenomes and save them to FASTA files for bowtie build
112 for repname in rep_coords: 102 for repname in rep_coords:
113 metagenome = '' 103 genomes_list = []
114 # iterating coordinate list by block of 3 (chr, start, end) 104 # iterating coordinate list by block of 3 (chr, start, end)
115 block = 3 105 block = 3
116 for i in range(0, len(rep_coords[repname]) - block + 1, block): 106 for i in range(0, len(rep_coords[repname]) - block + 1, block):
117 batch = rep_coords[repname][i:i+block] 107 batch = rep_coords[repname][i:i+block]
118 chromosome = batch[0] 108 chromosome = batch[0]
119 start = max(int(batch[1]) - flankingl, 0) 109 start = max(int(batch[1]) - flankingl, 0)
120 end = min(int(batch[2]) + flankingl, 110 end = min(int(batch[2]) + flankingl,
121 int(genome[chromosome]['length'])-1) + 1 111 int(genome[chromosome]['length'])-1) + 1
122 metagenome = ( 112 genomes_list.append(genome[chromosome]['sequence'][start:end])
123 f"{metagenome}{spacer}" 113 metagenome = spacer.join(genomes_list)
124 f"{genome[chromosome]['sequence'][start:end]}"
125 )
126
127 # Create Fasta of repeat pseudogenome 114 # Create Fasta of repeat pseudogenome
128 fastafilename = f"{repname}.fa" 115 fastafilename = f"{repname}.fa"
129 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') 116 record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
130 SeqIO.write(record, fastafilename, "fasta") 117 SeqIO.write(record, fastafilename, "fasta")
131 118