comparison RepEnrich_setup.py @ 15:2e3d976e7d5d draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 03183e29f807ec33548016a7c4144f52720b7b9e
author artbio
date Sun, 21 Apr 2024 09:44:51 +0000
parents bf866bedd4b4
children
comparison
equal deleted inserted replaced
14:bf866bedd4b4 15:2e3d976e7d5d
69 # load genome into dictionary and compute length 69 # load genome into dictionary and compute length
70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) 70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
71 genome = defaultdict(dict) 71 genome = defaultdict(dict)
72 72
73 for chr in g.keys(): 73 for chr in g.keys():
74 genome[chr]['sequence'] = g[chr].seq 74 genome[chr]['sequence'] = str(g[chr].seq)
75 genome[chr]['length'] = len(g[chr].seq) 75 genome[chr]['length'] = len(g[chr].seq)
76 76
77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter 77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
78 repeat_elements = set() 78 repeat_elements = set()
79 rep_coords = defaultdict(list) # Merged dictionary for coordinates 79 rep_coords = defaultdict(list) # Merged dictionary for coordinates
98 # generate spacer for pseudogenomes 98 # generate spacer for pseudogenomes
99 spacer = ''.join(['N' for i in range(gapl)]) 99 spacer = ''.join(['N' for i in range(gapl)])
100 100
101 # generate metagenomes and save them to FASTA files for bowtie build 101 # generate metagenomes and save them to FASTA files for bowtie build
102 for repname in rep_coords: 102 for repname in rep_coords:
103 metagenome = '' 103 genomes_list = []
104 # iterating coordinate list by block of 3 (chr, start, end) 104 # iterating coordinate list by block of 3 (chr, start, end)
105 block = 3 105 block = 3
106 for i in range(0, len(rep_coords[repname]) - block + 1, block): 106 for i in range(0, len(rep_coords[repname]) - block + 1, block):
107 batch = rep_coords[repname][i:i+block] 107 batch = rep_coords[repname][i:i+block]
108 chromosome = batch[0] 108 chromosome = batch[0]
109 start = max(int(batch[1]) - flankingl, 0) 109 start = max(int(batch[1]) - flankingl, 0)
110 end = min(int(batch[2]) + flankingl, 110 end = min(int(batch[2]) + flankingl,
111 int(genome[chromosome]['length'])-1) + 1 111 int(genome[chromosome]['length'])-1) + 1
112 metagenome = ( 112 genomes_list.append(genome[chromosome]['sequence'][start:end])
113 f"{metagenome}{spacer}" 113 metagenome = spacer.join(genomes_list)
114 f"{genome[chromosome]['sequence'][start:end]}"
115 )
116
117 # Create Fasta of repeat pseudogenome 114 # Create Fasta of repeat pseudogenome
118 fastafilename = f"{repname}.fa" 115 fastafilename = f"{repname}.fa"
119 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') 116 record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
120 SeqIO.write(record, fastafilename, "fasta") 117 SeqIO.write(record, fastafilename, "fasta")
121 118