annotate scripts/modules/typing.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
1 import os.path
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
2 import functools
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
3
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
4 try:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
5 import modules.utils as utils
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
6 except ImportError:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
7 from pathotyping.modules import utils as utils
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
8
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
9
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
10 # {1: {'gene_identity': 0, 'gene_mean_read_coverage': 0.0, 'gene_number_positions_multiple_alleles': 0, 'header': 'fyuA', 'gene_coverage': 0.0, 'gene_low_coverage': 100.0}}
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
11
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
12
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
13 def simplify_data_by_gene(data_by_gene):
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
14 cleaned_data_by_gene = {}
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
15 for counter, data in list(data_by_gene.items()):
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
16 cleaned_data_by_gene[data['header'].lower()] = {'gene_identity': data['gene_identity'],
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
17 'gene_coverage': data['gene_coverage'],
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
18 'gene_depth': data['gene_mean_read_coverage']}
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
19 return cleaned_data_by_gene
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
20
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
21
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
22 def possible_types(data_by_gene, typing_rules_file, min_gene_coverage, min_gene_identity, min_gene_depth):
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
23 data_by_gene = simplify_data_by_gene(data_by_gene)
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
24
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
25 possible_pathotypes = []
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
26 genes_present = []
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
27 with open(typing_rules_file, 'rtU') as reader:
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
28 genes = []
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
29 for line in reader:
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
30 line = line.splitlines()[0]
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
31 if len(line) > 0:
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
32 line = line.split('\t')
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
33 if not line[0].startswith('##'):
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
34 if line[0].startswith('#'):
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
35 genes = line[1:]
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
36 else:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
37 profile = line[1:]
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
38 congruence = []
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
39 for x, gene_requirement in enumerate(profile):
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
40 if data_by_gene[genes[x].lower()]['gene_coverage'] >= min_gene_coverage and \
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
41 data_by_gene[genes[x].lower()]['gene_identity'] >= min_gene_identity and \
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
42 data_by_gene[genes[x].lower()]['gene_depth'] >= min_gene_depth:
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
43 gene_present = True
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
44 genes_present.append(genes[x])
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
45 else:
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
46 gene_present = False
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
47
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
48 gene_requirement = \
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
49 True if gene_requirement == '1' else False if gene_requirement == '0' else None
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
50 if gene_requirement is not None:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
51 if gene_present == gene_requirement:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
52 congruence.append(True)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
53 else:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
54 congruence.append(False)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
55 else:
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
56 congruence.append(True)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
57 if all(congruence):
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
58 possible_pathotypes.append(line[0])
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
59 return possible_pathotypes, list(set(genes_present))
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
60
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
61
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
62 module_timer = functools.partial(utils.timer, name='Module Typing')
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
63
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
64
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
65 @module_timer
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
66 def typing(data_by_gene, typing_rules_file, min_gene_coverage, min_gene_identity, min_gene_depth, outdir):
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
67 possible_pathotypes, genes_present = possible_types(data_by_gene, typing_rules_file, min_gene_coverage,
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
68 min_gene_identity, min_gene_depth)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
69 with open(os.path.join(outdir, 'patho_typing.report.txt'), 'wt') as writer_report:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
70 with open(os.path.join(outdir, 'patho_typing.extended_report.txt'), 'wt') as writer_extended_report:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
71 writer_extended_report.write('#Pathotypes_found' + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
72 if len(possible_pathotypes) > 0:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
73 writer_report.write('\n'.join(possible_pathotypes) + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
74 writer_extended_report.write('\n'.join(possible_pathotypes) + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
75 print('\n' + 'Pathotypes found:' + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
76 print('\n'.join(possible_pathotypes) + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
77 else:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
78 writer_report.write('TND' + '\n') # Type Not Determined
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
79 writer_extended_report.write('TND' + '\n') # Type Not Determined
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
80 print('\n' + 'It was not possible to identify any possible pathotype match' + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
81 writer_extended_report.write('\n' + '#Genes_present' + '\n')
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
82 writer_extended_report.write('\n'.join(genes_present) + '\n')
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
83
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
84 return None, None