annotate ecoli_serotyping/speciesIdentification.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
1 import logging
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
2 import multiprocessing
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
3 import os
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
4 import tempfile
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
5 from Bio import SeqIO
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
6
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
7 from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
8
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
9 LOG = logging.getLogger(__name__)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
10
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
11 def is_ecoli_genome(iden_file, genome_file=None, mash=False):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
12 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
13 Return True if file is classified as ecoli by ecoli markers, otherwise False
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
14
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
15 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
16 iden_file (str): path to valid fasta genome file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
17 genome_file (str): Optional path to valid fastq file for reads
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
18 mash (bool): Optional input to decide whether to use mash if genome is
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
19 identified as non-ecoli
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
20
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
21 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
22 bool: True if iden_file is ecoli, False otherwise
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
23 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
24 if genome_file is None:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
25 genome_file = iden_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
26 num_hit = get_num_hits(iden_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
27 if num_hit < 3:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
28 LOG.info(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
29 "{0} is identified as "
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
30 "an invalid e.coli genome file "
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
31 "by marker approach".format(os.path.basename(iden_file)))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
32 if mash:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
33 species = get_species(genome_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
34 LOG.info(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
35 "{0} is identified as genome of "
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
36 "{1} by mash approach".format(os.path.basename(iden_file), species))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
37 return False
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
38 LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file)))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
39 return True
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
40
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
41 def get_num_hits(target):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
42 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
43 Return number of matching hits when query the reference genome
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
44 on the target genome
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
45
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
46 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
47 target (str): target genome
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
48
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
49 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
50 int: number of hits found
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
51 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
52 num_hit = 0
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
53 try:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
54 with tempfile.TemporaryDirectory() as temp_dir:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
55 blast_db = blastFunctions.create_blast_db([target], temp_dir)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
56 result = blastFunctions.run_blast_for_identification(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
57 definitions.ECOLI_MARKERS,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
58 blast_db
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
59 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
60 with open(result) as handler:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
61 LOG.debug("get_num_hits() output:")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
62 for line in handler:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
63 LOG.debug(line)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
64 num_hit += 1
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
65 LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
66 except SystemExit:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
67 pass
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
68 return num_hit
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
69
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
70 def get_species(file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
71 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
72 Given a fasta/fastq file, return the most likely species identification
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
73
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
74 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
75 file (str): fasta/fastq file input
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
76
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
77 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
78 str: name of estimated species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
79 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
80 LOG.info("Identifying species for {0}".format(file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
81 if not os.path.isfile(definitions.REFSEQ_SKETCH):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
82 LOG.warning("No refseq found.")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
83 return None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
84 species = 'unknown'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
85 if genomeFunctions.get_valid_format(file) == 'fasta':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
86 tmp_file = tempfile.NamedTemporaryFile().name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
87 basename = os.path.basename(file).replace(' ', '_')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
88 with open(tmp_file, 'w') as new_fh:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
89 header = '> {0}\n'.format(basename)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
90 new_fh.write(header)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
91 for record in SeqIO.parse(file, 'fasta'):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
92 new_fh.write(str(record.seq))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
93 new_fh.write('nnnnnnnnnnnnnnnnnnnn')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
94 try:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
95 species = get_species_helper(tmp_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
96 except Exception:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
97 pass
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
98 if genomeFunctions.get_valid_format(file) == 'fastq':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
99 species = get_species_helper(file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
100 return species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
101
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
102 def get_species_helper(file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
103 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
104 Given a fasta/fastq file with one sequence, return the most likely species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
105 identification
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
106
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
107 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
108 file (str): fasta/fastq file input
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
109
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
110 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
111 str: name of estimated species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
112 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
113 species = 'unknown'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
114 cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
115 'mash', 'dist',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
116 file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
117 definitions.REFSEQ_SKETCH,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
118 '|',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
119 'sort -gk3 -',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
120 '|',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
121 'head -1 -'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
122 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
123 try:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
124 mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
125 ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2])
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
126 cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
127 'grep -E',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
128 ass_acc_num,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
129 definitions.REFSEQ_SUMMARY
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
130 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
131 summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
132 species = summary_output.split('\t')[7]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
133 return species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
134 except Exception as err:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
135 LOG.warning('No matching species found with distance estimation:{0}'.format(err))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
136 try:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
137 cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
138 'mash screen',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
139 '-w',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
140 '-p', str(multiprocessing.cpu_count()//2),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
141 definitions.REFSEQ_SKETCH,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
142 file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
143 '| sort -gr - | head -1 -'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
144 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
145 screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
146 LOG.debug(screen_output.split('\t'))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
147 species = screen_output.split('\t')[5].split('\n')[0]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
148 except Exception as err2:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
149 LOG.warning(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
150 'No matching species found with distance screening either:{}'.format(err2)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
151 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
152 return species