comparison syndiva.py @ 0:0254731f047b draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/SynDivA commit 90c5ec603e2c6b8c49d2dc7ec1b1e97f9d8fb92c
author iuc
date Thu, 23 Jun 2022 22:32:13 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0254731f047b
1 #!/usr/bin/env python
2 # title : syndiva.py
3 # description : This script will analyze fasta files, look for restriction sites,
4 # cut the sequences around the restriction sites,
5 # translate the nucleic sequences into amino acids sequences.
6 # author : Fabienne Wong Jun Tai and Benjamin Dartigues
7 # creation date : 20121107
8 # version : 1.0 - revised November 2012
9 # version : 1.1 - revised March 2022
10 # usage : python syndiva.py -i file.fasta -o /output/dir/ -p pattern -5 seq_restric_5'-3 seq_restric_3'
11 # notes :
12 # # python_version :3.7.11
13 # # biopython_max_version :1.72
14 # ==============================================================================
15 import math
16 import re
17 import subprocess
18 import sys
19
20 import matplotlib
21 import numpy
22 from args import Args
23 from args import get_os_path_join, get_os_path_name
24 from Bio import pairwise2
25 from Bio import SeqIO
26 from Bio.Seq import Seq
27 from Bio.Seq import translate
28 from Bio.SubsMat import MatrixInfo
29
30 matplotlib.use('Agg')
31 from matplotlib import pyplot as plot # noqa: I202,E402
32
33
34 args = Args()
35 # Variables initialization
36 directory = args.output_dir
37 mcl_file = get_os_path_join(directory, "mcl.in")
38 mcl_output = get_os_path_join(directory, "mcl.out")
39 html_file = get_os_path_join(directory, "syndiva_report.html")
40 graph_pic = get_os_path_join(directory, "distri.png")
41 input_file = get_os_path_name(args.input)
42 site_res_5 = args.site_res_5
43 site_res_3 = args.site_res_3
44 tag = {'mut': [], 'ok_stop_ext': [], 'stop': [], 'no_restric': [], 'no_multiple': [], 'amber': []}
45 all_seq = []
46 all_seq_fasta = {} # dictionnary that will store information about all the sequences
47 good_seq = {} # dictionnary that will store information about the valid sequences
48 identical_clones = {}
49 var_seq_common = {} # dictionnary that will store the number of sequences that share the same variable parts
50 align_scores = []
51 nb_var_part = 0
52
53
54 def get_identity(str1, str2):
55 if len(str2) > len(str1):
56 return (len(str2) - len([i for i in range(len(str1)) if str1[i] != str2[i]])) / len(str2)
57 else:
58 return (len(str1) - len([i for i in range(len(str1)) if str1[i] != str2[i]])) / len(str1)
59
60
61 def reverse_complement(_seq):
62 return str(Seq(_seq).reverse_complement())
63
64
65 def generate_aln(seq_dic, ids): # sourcery skip: use-join
66 # Multiple Sequence Alignment via ClustalO
67 _input = ''
68 for sequence_id in ids:
69 _input += '>%s\n%s\n' % (sequence_id, re.sub("(.{80})", "\\1\n", seq_dic[sequence_id]['prot'], re.DOTALL))
70 p = subprocess.Popen(["clustalo", "-i", "-", "--outfmt", "clu"], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, universal_newlines=True)
71 aln_out, aln_err = p.communicate(input=_input)
72 return aln_out
73
74
75 def report_html(_html_file, _tag, _all_seq, _good_seq, _all_seq_fasta, _identical_clones, _nb_var_part, _var_seq_common, _align_scores, _args):
76 # Generate the html file for the report
77 _all_seq.sort()
78 for key in _tag.keys():
79 _tag[key].sort()
80 _good_seq = dict(sorted(_good_seq.items()))
81 good_ids = _good_seq.keys()
82 w = open(_html_file, 'w')
83 w.write(
84 '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN""http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" '
85 'lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><title>SynDivA Report</title><link '
86 'href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" rel="stylesheet" /><style type="text/css">body {padding-top: 40px;}.subhead {padding: 40px '
87 '0;}.subhead h1 {font-size: 60px;}.fasta { font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px;}code.grey{color: '
88 '#636D71;}</style></head><body><a id="top"></a><div class="navbar navbar-fixed-top"><div class="navbar-inner"><div class="container"><a class="brand" href="#top">SynDivA '
89 'Report</a><div class="nav-collapse collapse"><ul class="nav"><li><a href="#input">Input data</a></li><li><a href="#analysis">Sequences analysis</a></li><li><a '
90 'href="#variable">Variable regions analysis</a></li><li><a href="#cluster">Clustering</a></li><li><a href="#stat">Statistics</a></li><li><a '
91 'href="#annex">Annex</a></li></ul></div></div></div></div><div class="container-fluid"><header class="subhead"><h1>SynDivA Report</h1></header><div '
92 'class="page-header"><a id="input"></a><h2>Input data</h2></div>')
93
94 # Input data
95 w.write(
96 '<p>Input file:<br/><code class="grey">%s</code></p><p>Number of sequences in input file:<br/><code class="grey">%d</code></p><p>Pattern of the sequence bank:<br/><code '
97 'class="grey">%s</code></p><p>5\' restriction site:<br/><code class="grey">%s</code></p><p>3\' restriction site:<br/><code class="grey">%s</code></p>' % (
98 input_file, len(_all_seq), _args.pattern, _args.site_res_5, _args.site_res_3))
99
100 # Sequence analysis
101 w.write(
102 '<div class="page-header"><a id="analysis"></a><h2>Sequences analysis</h2></div><p>Caption:</p><ul><li class="text-success">Valid sequences that will be part of the next '
103 'analysis </li><li class="text-warning">Good sequences but will not be part of the next analysis</li><li class="text-error">Rejected sequences</li></ul><table '
104 'class="table table-striped table-bordered"><tr><th class="text-error">Absence of restriction sites</th><th class="text-error">Incorrect number of nucleotides between '
105 'the restriction sites</th><th class="text-error">Stop codon <u>inside</u> the area of interest</th><th class="text-warning">Mutation in the conserved regions</th><th '
106 'class="text-success">Valid sequences</th><th>Amber codon in the sequence (<u>inside</u> the area of interest)</th></tr>')
107 w.write(
108 '<tr><td class="text-error">%d sequence(s) (%.2f%%)</td><td class="text-error">%d sequence(s) (%.2f%%)</td><td class="text-error">%d sequence(s) (%.2f%%)</td><td '
109 'class="text-warning">%d sequence(s) (%.2f%%)</td><td class="text-success">%d sequence(s) (%.2f%%)</td><td>%d sequence(s)</td></tr>' % (
110 len(_tag['no_restric']), float(len(_tag['no_restric'])) / float(len(_all_seq)) * 100, len(_tag['no_multiple']), float(len(_tag['no_multiple'])) / float(len(_all_seq)) * 100, len(_tag['stop']),
111 float(len(_tag['stop'])) / float(len(_all_seq)) * 100, len(_tag['mut']), float(len(_tag['mut'])) / float(len(_all_seq)) * 100, len(good_ids),
112 float(len(good_ids)) / float(len(_all_seq)) * 100,
113 len(_tag['amber'])))
114 w.write(
115 '<tr><td class="text-error">%s</td><td class="text-error">%s</td><td class="text-error">%s</td><td class="text-warning">%s</td><td '
116 'class="text-success">%s</td><td>%s</td></tr></table>' % (
117 '<br/>'.join(_tag['no_restric']), '<br/>'.join(_tag['no_multiple']), '<br/>'.join(_tag['stop']), '<br/>'.join(_tag['mut']), '<br/>'.join(good_ids), '<br/>'.join(_tag['amber'])))
118 # Variable regions analysis
119 w.write(
120 '<div class="page-header"><a id="variable"></a><h2>Variable regions analysis</h2></div><p>The following group of sequences are identical clones on the variable '
121 'regions:</p>')
122 identical_clones_seq = _identical_clones.keys()
123 if identical_clones_seq:
124 for seq in identical_clones_seq:
125 ids = list(set(_identical_clones[seq])) # return only one occurrence of each item in the list
126 w.write('<div class="row-fluid"><div class="span5"><pre>%d sequences (%.2f%% of valid sequences)<br/>%s</pre></div>' % (
127 len(ids), float(len(ids)) / float(len(good_ids)) * 100, '<br/>'.join(ids)))
128 w.write('<div class="span3"><table class="table table-striped table-bordered"><thead><tr><th>Variable region</th><th>Repeated sequence</th></tr></thead><tbody>')
129 for z in range(len(_good_seq[ids[0]]['var'])):
130 w.write('<td>%d</td><td>%s</td></tr>' % (z + 1, _good_seq[ids[0]]['var'][z]))
131 w.write('</tbody></table></div></div>')
132 else:
133 w.write('<p>No clone was found.</p>')
134
135 first = True
136 for i in range(_nb_var_part):
137 keys = []
138 for k in _var_seq_common[str(i + 1)].keys():
139 nb = _var_seq_common[str(i + 1)][k]
140 if nb > 1:
141 if first:
142 w.write(
143 '<p>Here\'s the distribution of the repeated sequences in variable regions:</p><table class="table table-striped table-bordered"><thead><tr><th>Variable '
144 'region</th><th>Repeated sequence</th><th>Number of occurrences (percentage of valid sequences)</th></tr></thead><tbody>')
145 first = False
146 keys.append(k)
147 else:
148 keys.append(k)
149 nb = len(keys)
150 if nb != 0:
151 w.write('<tr>')
152 for z in range(nb):
153 if z == 0:
154 w.write('<td rowspan="%d">%d</td>' % (nb, i + 1))
155 w.write('<td>%s</td><td>%d (%.2f%%)</td></tr>' % (
156 keys[z], _var_seq_common[str(i + 1)][keys[z]], float(_var_seq_common[str(i + 1)][keys[z]]) / float(len(good_ids)) * 100))
157 w.write('</tbody></table>')
158 # Clustering
159 w.write('<div class="page-header"><a id="cluster"></a><h2>Clustering</h2></div><p>The following clusters were generated by MCL:</p>')
160 for line in open(mcl_output, 'r'):
161 w.write('<div class="row-fluid"><div class="span6"><pre>%d sequences (%.2f%% of valid sequences)<br/>%s</pre></div></div>' % (
162 len(line.split("\t")), float(len(line.split("\t"))) / float(len(good_ids)) * 100, '<br/>'.join(line.split("\t"))))
163 # Statistics
164 w.write('<div class="page-header"><a id="stat"></a><h2>Statistics</h2></div>')
165 w.write('<p>Here\'s some statistics about the valid sequences:</p><p>Mean for the pairwise alignement scores: %.2f<br/>Standard deviation: %.2f</p>' % (
166 float(numpy.mean(_align_scores)), float(numpy.std(_align_scores))))
167 w.write('<div class="row-fluid"><div class="span6"><img src="%s" alt="Distribution of the pairwise alignment score"></div>' % get_os_path_name(graph_pic))
168 w.write('<div class="span6"><table class="table table-striped table-bordered"><thead><tr><th>Pairwise Alignment Score</th><th>Number of occurrences</th></tr></thead><tbody>')
169 uniq_scores = sorted(list(set(_align_scores)))
170 scores_dic = {}
171 for _score in uniq_scores:
172 scores_dic[_score] = _align_scores.count(_score)
173 scores_dic = dict(sorted(scores_dic.items()))
174 scores = scores_dic.items()
175 # scores.sort()
176 for el in scores:
177 w.write('<tr><td>%.2f</td><td>%d</td></tr>' % (el[0], el[1]))
178 w.write('</tbody></table></div></div>')
179 # Annex
180 w.write('<div class="page-header"><a id="annex"></a><h2>Annex</h2></div>')
181 w.write('<p><strong>Valid protein sequences</strong> in FASTA format:</p><textarea class="span8 fasta" type="text" rows="20" readonly="readonly">')
182 for _id in good_ids:
183 w.write('>%s\n%s\n' % (_id, re.sub("(.{80})", "\\1\n", _good_seq[_id]['prot'], re.DOTALL)))
184 w.write('</textarea>')
185 aln_out = generate_aln(_good_seq, good_ids)
186 w.write(
187 '<p>Multiple sequence alignment of the <strong>valid sequences</strong> generated by Clustal Omega:</p><textarea class="span8 fasta" type="text" rows="20" '
188 'readonly="readonly">%s</textarea>' % str(
189 aln_out))
190
191 if _tag['no_multiple']:
192 w.write(
193 '<p><strong>Protein sequences with an incorrect number of nucleotides between the restriction sites</strong> in FASTA format:</p><textarea class="span8 fasta" '
194 'type="text" rows="20" readonly="readonly">')
195 for _id in _tag['no_multiple']:
196 w.write('>%s\n%s\n' % (_id, re.sub("(.{80})", "\\1\n", _all_seq_fasta[_id]['prot'], re.DOTALL)))
197 w.write('</textarea>')
198
199 if _tag['mut']:
200 w.write('<p><strong>Mutated protein sequences</strong> in FASTA format:</p><textarea class="span8 fasta" type="text" rows="20" readonly="readonly">')
201 for _id in _tag['mut']:
202 w.write('>%s\n%s\n' % (_id, re.sub("(.{80})", "\\1\n", _all_seq_fasta[_id]['prot'], re.DOTALL)))
203 w.write('</textarea>')
204 aln_out = generate_aln(_all_seq_fasta, _tag['mut'])
205
206 w.write(
207 '<p>Multiple sequence alignment of the <strong>mutated sequences</strong> generated by Clustal Omega:</p><textarea class="span8 fasta" type="text" rows="20" '
208 'readonly="readonly">%s</textarea>' % str(
209 aln_out))
210
211 if _tag['stop']:
212 w.write('<p><strong>Protein sequences with a stop codon</strong> in FASTA format:</p><textarea class="span8 fasta" type="text" rows="20" readonly="readonly">')
213 for _id in _tag['stop']:
214 w.write('>%s\n%s\n' % (_id, re.sub("(.{80})", "\\1\n", _all_seq_fasta[_id]['prot'], re.DOTALL)))
215 w.write('</textarea>')
216
217 if _tag['amber']:
218 w.write('<p><strong>Protein sequences with an amber codon</strong> in FASTA format:</p><textarea class="span8 fasta" type="text" rows="20" readonly="readonly">')
219 for _id in _tag['amber']:
220 w.write('>%s\n%s\n' % (_id, re.sub("(.{80})", "\\1\n", _all_seq_fasta[_id]['prot'], re.DOTALL)))
221 w.write('</textarea>')
222
223 w.write('</div></body></html>')
224 w.close()
225
226
227 nb_seq = len(list(SeqIO.parse(args.input, "fasta")))
228
229 for seq_record in SeqIO.parse(args.input, "fasta"):
230 seq_id = seq_record.id
231 seq = str(seq_record.seq)
232 seq = seq.upper()
233 all_seq.append(seq_id)
234 # Checking if both restriction sites are present in the sequence
235 if site_res_5 in seq and site_res_3 in seq:
236 valid = True
237 else:
238 valid = False
239 tag['no_restric'].append(seq_id)
240 # If sequence has both restriction sites, checking if it is necessary to take the reverse complement strand
241 if valid:
242 site_res_5_pos = seq.index(site_res_5)
243 site_res_3_pos = seq.index(site_res_3)
244 # If site_res_5_pos > site_res_3_pos, reverse complement strand has to be calculated
245 if site_res_5_pos > site_res_3_pos:
246 # Checking if the number of nucleic acids between the restriction sites is a multiple of 3
247 length = math.fabs((site_res_5_pos + len(site_res_5)) - site_res_3_pos)
248 valid = length % 3 == 0
249 cut_seq = seq[:site_res_5_pos + len(site_res_5)]
250 cut_seq = reverse_complement(cut_seq)
251
252 # Else if site_res_5_pos < site_res_3_pos, use the sequence as it is
253 else:
254 # Checking if the number of nucleic acids between the restriction sites is a multiple of 3
255 length = math.fabs((site_res_3_pos + len(site_res_3)) - site_res_5_pos)
256 valid = length % 3 == 0
257 cut_seq = seq[site_res_5_pos:]
258 # If the number of nucleic acids between the restriction sites isn't a multiple of 3, put the sequence away
259 if not valid:
260 tag['no_multiple'].append(seq_id)
261 prot_seq = translate(cut_seq)
262 all_seq_fasta[seq_id] = {}
263 all_seq_fasta[seq_id]['prot'] = prot_seq
264 else:
265 # Translate nucleic sequence into amino acid sequence
266 prot_seq = translate(cut_seq)
267 all_seq_fasta[seq_id] = {}
268 all_seq_fasta[seq_id]['prot'] = prot_seq
269
270 # Looking for stop codon in the sequence and getting their position in the sequence
271 if '*' in prot_seq:
272 pos_stop = [m.start() for m in re.finditer(r"\*", prot_seq)]
273 stop = False
274 # Checking if stop codon is between the restriction sites, also checking if it is an amber codon. if stop codon other than amber codon -> tag stop
275 for i in range(len(pos_stop)):
276 if pos_stop[i] < length / 3:
277 stop_codon_nuc = cut_seq[pos_stop[i] * 3:pos_stop[i] * 3 + 3]
278 if stop_codon_nuc != "TAG":
279 tag['stop'].append(seq_id)
280 stop = True
281 break
282 else:
283 if seq_id not in tag['amber']:
284 tag['amber'].append(seq_id)
285 # If stop codon wasn't found between the restriction sites
286 if not stop:
287 """
288 # Checking if there is a stop codon outside the restriction sites. If yes -> tag ok_stop_ext
289 for i in range(len(pos_stop)):
290 if (pos_stop[i] > length/3):
291 stop_codon_nuc = cut_seq[pos_stop[i]*3:pos_stop[i]*3+3]
292 if stop_codon_nuc != "TAG":
293 tag['ok_stop_ext'].append(seq_id)
294 stop = True
295 break
296 else:
297 if (seq_id not in tag['amber']):
298 tag['amber'].append(seq_id)
299 """
300 # Checking if there was a mutation in the fix part, if yes -> tag mut else retrieve variable parts
301 mut = False
302 pattern_part = args.pattern.split(":")
303 tmp_prot_seq = prot_seq
304 var_parts = []
305 for i in range(len(pattern_part) - 1): # not checking the latest fix part
306 part = pattern_part[i]
307 # If part is fix
308 if not part[0].isdigit():
309 # If part not in prot_seq -> mutation, flag then break
310 if part not in tmp_prot_seq:
311 mut = True
312 tag['mut'].append(seq_id)
313 break
314 # Else, store the variable part if exist then remove the fix part + variable part (tmp_prot_seq starts at the end of part)
315 else:
316 pos_fix = tmp_prot_seq.index(part)
317 if pos_fix != 0:
318 var_parts.append(tmp_prot_seq[0:pos_fix])
319 tmp_prot_seq = tmp_prot_seq[pos_fix + len(part):]
320 # Else part is variable
321 else:
322 nb_var_part += 1
323 # Treating latest fix part if no mutation before
324 if not mut:
325 last_part = pattern_part[-1]
326 last_var = pattern_part[-2]
327 if '-' in last_var:
328 var_max = int(last_var.split('-')[1])
329 else:
330 var_max = int(last_var)
331 last_part = last_part[0:var_max + 1]
332 if last_part not in tmp_prot_seq:
333 mut = True
334 tag['mut'].append(seq_id)
335 else:
336 pos_fix = tmp_prot_seq.index(last_part)
337 if pos_fix != 0:
338 var_parts.append(tmp_prot_seq[0:pos_fix])
339 # If no mutation the sequence is validated and all the info are stored
340 if not mut:
341 good_seq[seq_id] = {}
342 good_seq[seq_id]['dna'] = cut_seq
343 good_seq[seq_id]['prot'] = prot_seq
344 good_seq[seq_id]['var'] = var_parts
345
346 # If all sequences are invalid, the program will exit as there is no data to continue
347 if not good_seq:
348 sys.exit("There is only one valid sequence among the input data. At least 2 valid sequences are necessary to proceed to the next step. The program will now exit")
349 elif len(good_seq.keys()) == 1:
350
351 sys.exit("There is only one valid sequence among the input data. At least 2 valid sequences are necessary to proceed to the next step. The program will now exit")
352
353 # Initialization of dict var_seq_common
354 for n in range(nb_var_part):
355 var_seq_common[str(n + 1)] = {}
356
357 # Opening the file where the mcl input will be written
358 with open(mcl_file, 'w+') as mcl:
359 seq_keys = good_seq.keys()
360 for i in range(len(seq_keys)):
361 var_1 = good_seq[list(seq_keys)[i]]['var']
362
363 # Classifying variable sequences
364 for k in range(len(var_1)):
365 try:
366 var_seq_common[str(k + 1)][var_1[k]] += 1
367 except KeyError:
368 var_seq_common[str(k + 1)][var_1[k]] = 1
369
370 for j in range(i + 1, len(seq_keys)):
371 var_2 = good_seq[list(seq_keys)[j]]['var']
372 score = 0.0
373 # Comparing the sequences' variable parts to find identical clones
374 if var_1 == var_2:
375 try:
376 clone_seq = "".join(var_1)
377 identical_clones[clone_seq].extend([seq_keys[i], seq_keys[j]])
378 except KeyError:
379 identical_clones[clone_seq] = [seq_keys[i], seq_keys[j]]
380 # Align the 2 sequences using NWalign_PAM30 => replace by pairwise2
381 seq_1 = ''.join(var_1)
382 seq_2 = ''.join(var_2)
383 matrix = MatrixInfo.pam30
384 if len(seq_2) > len(seq_1):
385 score = get_identity(pairwise2.align.globalds(seq_1, seq_2, matrix, -11, -1)[0][0], pairwise2.align.globalds(seq_1, seq_2, matrix, -11, -1)[0][1]) * 100
386 else:
387 score = get_identity(pairwise2.align.globalds(seq_2, seq_1, matrix, -11, -1)[0][0], pairwise2.align.globalds(seq_2, seq_1, matrix, -11, -1)[0][1]) * 100
388 align_scores.append(score)
389 mcl.write('%s\t%s\t%0.2f\n' % (list(seq_keys)[i], list(seq_keys)[j], score))
390
391 # Clusters formation
392 subprocess.call(["mcl", mcl_file, "--abc", "-I", "6.0", "-o", mcl_output], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
393
394 # Producing distribution graph
395 plot.hist(align_scores, bins=numpy.arange(0, 101, 2))
396 plot.xlabel('Pairwise Alignment Score')
397 plot.ylabel('Number of occurrences')
398 plot.title('Distribution of the pairwise alignment score')
399 plot.grid(True)
400 plot.savefig(graph_pic)
401
402 # Generating html report
403 report_html(html_file, tag, all_seq, good_seq, all_seq_fasta, identical_clones, nb_var_part, var_seq_common, align_scores, args)
404
405 # Removing intermediate files
406 subprocess.call(["rm", mcl_file, mcl_output], shell=False)
407 print("HTML report has been generated in the output directory. The program will now exit.")