Mercurial > repos > artbio > repenrich2
comparison RepEnrich2_setup.py @ 4:c5bb2f9af708 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 6b3b1194af0de793a1b4892c5973da835f5c0902
author | artbio |
---|---|
date | Sat, 20 Apr 2024 23:23:40 +0000 |
parents | 4905a332a094 |
children |
comparison
equal
deleted
inserted
replaced
3:0efb0ee6a7e9 | 4:c5bb2f9af708 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 import argparse | 2 import argparse |
3 import csv | 3 import csv |
4 import os | |
5 import shlex | 4 import shlex |
6 import subprocess | 5 import subprocess |
7 import sys | 6 import sys |
8 from collections import defaultdict | 7 from collections import defaultdict |
9 from concurrent.futures import ProcessPoolExecutor | 8 from concurrent.futures import ProcessPoolExecutor |
46 flankingl = args.flankinglength | 45 flankingl = args.flankinglength |
47 annotation_file = args.annotation_file | 46 annotation_file = args.annotation_file |
48 genomefasta = args.genomefasta | 47 genomefasta = args.genomefasta |
49 cpus = args.cpus | 48 cpus = args.cpus |
50 | 49 |
51 # check that the programs we need are available | |
52 try: | |
53 subprocess.call(shlex.split("bowtie2 --version"), | |
54 stdout=open(os.devnull, 'wb'), | |
55 stderr=open(os.devnull, 'wb')) | |
56 except OSError: | |
57 print("Error: Bowtie2 not available in the path") | |
58 raise | |
59 | |
60 | 50 |
61 def starts_with_numerical(list): | 51 def starts_with_numerical(list): |
62 try: | 52 try: |
63 if len(list) == 0: | 53 if len(list) == 0: |
64 return False | 54 return False |
66 return True | 56 return True |
67 except ValueError: | 57 except ValueError: |
68 return False | 58 return False |
69 | 59 |
70 | 60 |
71 # define a text importer for .out/.txt format of repbase | 61 # text import function for .out/.txt format of repbase |
72 def import_text(filename, separator): | 62 def import_text(filename, separator): |
73 csv.field_size_limit(sys.maxsize) | 63 csv.field_size_limit(sys.maxsize) |
74 file = csv.reader(open(filename), delimiter=separator, | 64 file = csv.reader(open(filename), delimiter=separator, |
75 skipinitialspace=True) | 65 skipinitialspace=True) |
76 return [line for line in file if starts_with_numerical(line)] | 66 return [line for line in file if starts_with_numerical(line)] |
79 # load genome into dictionary and compute length | 69 # load genome into dictionary and compute length |
80 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) |
81 genome = defaultdict(dict) | 71 genome = defaultdict(dict) |
82 | 72 |
83 for chr in g.keys(): | 73 for chr in g.keys(): |
84 genome[chr]['sequence'] = g[chr].seq | 74 genome[chr]['sequence'] = str(g[chr].seq) |
85 genome[chr]['length'] = len(g[chr].seq) | 75 genome[chr]['length'] = len(g[chr].seq) |
86 | 76 |
87 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter |
88 repeat_elements = set() | 78 repeat_elements = set() |
89 rep_coords = defaultdict(list) # Merged dictionary for coordinates | 79 rep_coords = defaultdict(list) # Merged dictionary for coordinates |
108 # generate spacer for pseudogenomes | 98 # generate spacer for pseudogenomes |
109 spacer = ''.join(['N' for i in range(gapl)]) | 99 spacer = ''.join(['N' for i in range(gapl)]) |
110 | 100 |
111 # generate metagenomes and save them to FASTA files for bowtie build | 101 # generate metagenomes and save them to FASTA files for bowtie build |
112 for repname in rep_coords: | 102 for repname in rep_coords: |
113 metagenome = '' | 103 genomes_list = [] |
114 # iterating coordinate list by block of 3 (chr, start, end) | 104 # iterating coordinate list by block of 3 (chr, start, end) |
115 block = 3 | 105 block = 3 |
116 for i in range(0, len(rep_coords[repname]) - block + 1, block): | 106 for i in range(0, len(rep_coords[repname]) - block + 1, block): |
117 batch = rep_coords[repname][i:i+block] | 107 batch = rep_coords[repname][i:i+block] |
118 chromosome = batch[0] | 108 chromosome = batch[0] |
119 start = max(int(batch[1]) - flankingl, 0) | 109 start = max(int(batch[1]) - flankingl, 0) |
120 end = min(int(batch[2]) + flankingl, | 110 end = min(int(batch[2]) + flankingl, |
121 int(genome[chromosome]['length'])-1) + 1 | 111 int(genome[chromosome]['length'])-1) + 1 |
122 metagenome = ( | 112 genomes_list.append(genome[chromosome]['sequence'][start:end]) |
123 f"{metagenome}{spacer}" | 113 metagenome = spacer.join(genomes_list) |
124 f"{genome[chromosome]['sequence'][start:end]}" | |
125 ) | |
126 | |
127 # Create Fasta of repeat pseudogenome | 114 # Create Fasta of repeat pseudogenome |
128 fastafilename = f"{repname}.fa" | 115 fastafilename = f"{repname}.fa" |
129 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | 116 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') |
130 SeqIO.write(record, fastafilename, "fasta") | 117 SeqIO.write(record, fastafilename, "fasta") |
131 | 118 |