6
|
1 import logging
|
|
2 import multiprocessing
|
|
3 import os
|
|
4 import tempfile
|
|
5 from Bio import SeqIO
|
|
6
|
|
7 from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util
|
|
8
|
|
9 LOG = logging.getLogger(__name__)
|
|
10
|
|
11 def is_ecoli_genome(iden_file, genome_file=None, mash=False):
|
|
12 '''
|
|
13 Return True if file is classified as ecoli by ecoli markers, otherwise False
|
|
14
|
|
15 Args:
|
|
16 iden_file (str): path to valid fasta genome file
|
|
17 genome_file (str): Optional path to valid fastq file for reads
|
|
18 mash (bool): Optional input to decide whether to use mash if genome is
|
|
19 identified as non-ecoli
|
|
20
|
|
21 Returns:
|
|
22 bool: True if iden_file is ecoli, False otherwise
|
|
23 '''
|
|
24 if genome_file is None:
|
|
25 genome_file = iden_file
|
|
26 num_hit = get_num_hits(iden_file)
|
|
27 if num_hit < 3:
|
|
28 LOG.info(
|
|
29 "{0} is identified as "
|
|
30 "an invalid e.coli genome file "
|
|
31 "by marker approach".format(os.path.basename(iden_file)))
|
|
32 if mash:
|
|
33 species = get_species(genome_file)
|
|
34 LOG.info(
|
|
35 "{0} is identified as genome of "
|
|
36 "{1} by mash approach".format(os.path.basename(iden_file), species))
|
|
37 return False
|
|
38 LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file)))
|
|
39 return True
|
|
40
|
|
41 def get_num_hits(target):
|
|
42 '''
|
|
43 Return number of matching hits when query the reference genome
|
|
44 on the target genome
|
|
45
|
|
46 Args:
|
|
47 target (str): target genome
|
|
48
|
|
49 Returns:
|
|
50 int: number of hits found
|
|
51 '''
|
|
52 num_hit = 0
|
|
53 try:
|
|
54 with tempfile.TemporaryDirectory() as temp_dir:
|
|
55 blast_db = blastFunctions.create_blast_db([target], temp_dir)
|
|
56 result = blastFunctions.run_blast_for_identification(
|
|
57 definitions.ECOLI_MARKERS,
|
|
58 blast_db
|
|
59 )
|
|
60 with open(result) as handler:
|
|
61 LOG.debug("get_num_hits() output:")
|
|
62 for line in handler:
|
|
63 LOG.debug(line)
|
|
64 num_hit += 1
|
|
65 LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit))
|
|
66 except SystemExit:
|
|
67 pass
|
|
68 return num_hit
|
|
69
|
|
70 def get_species(file):
|
|
71 '''
|
|
72 Given a fasta/fastq file, return the most likely species identification
|
|
73
|
|
74 Args:
|
|
75 file (str): fasta/fastq file input
|
|
76
|
|
77 Returns:
|
|
78 str: name of estimated species
|
|
79 '''
|
|
80 LOG.info("Identifying species for {0}".format(file))
|
|
81 if not os.path.isfile(definitions.REFSEQ_SKETCH):
|
|
82 LOG.warning("No refseq found.")
|
|
83 return None
|
|
84 species = 'unknown'
|
|
85 if genomeFunctions.get_valid_format(file) == 'fasta':
|
|
86 tmp_file = tempfile.NamedTemporaryFile().name
|
|
87 basename = os.path.basename(file).replace(' ', '_')
|
|
88 with open(tmp_file, 'w') as new_fh:
|
|
89 header = '> {0}\n'.format(basename)
|
|
90 new_fh.write(header)
|
|
91 for record in SeqIO.parse(file, 'fasta'):
|
|
92 new_fh.write(str(record.seq))
|
|
93 new_fh.write('nnnnnnnnnnnnnnnnnnnn')
|
|
94 try:
|
|
95 species = get_species_helper(tmp_file)
|
|
96 except Exception:
|
|
97 pass
|
|
98 if genomeFunctions.get_valid_format(file) == 'fastq':
|
|
99 species = get_species_helper(file)
|
|
100 return species
|
|
101
|
|
102 def get_species_helper(file):
|
|
103 '''
|
|
104 Given a fasta/fastq file with one sequence, return the most likely species
|
|
105 identification
|
|
106
|
|
107 Args:
|
|
108 file (str): fasta/fastq file input
|
|
109
|
|
110 Returns:
|
|
111 str: name of estimated species
|
|
112 '''
|
|
113 species = 'unknown'
|
|
114 cmd = [
|
|
115 'mash', 'dist',
|
|
116 file,
|
|
117 definitions.REFSEQ_SKETCH,
|
|
118 '|',
|
|
119 'sort -gk3 -',
|
|
120 '|',
|
|
121 'head -1 -'
|
|
122 ]
|
|
123 try:
|
|
124 mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
|
|
125 ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2])
|
|
126 cmd = [
|
|
127 'grep -E',
|
|
128 ass_acc_num,
|
|
129 definitions.REFSEQ_SUMMARY
|
|
130 ]
|
|
131 summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
|
|
132 species = summary_output.split('\t')[7]
|
|
133 return species
|
|
134 except Exception as err:
|
|
135 LOG.warning('No matching species found with distance estimation:{0}'.format(err))
|
|
136 try:
|
|
137 cmd = [
|
|
138 'mash screen',
|
|
139 '-w',
|
|
140 '-p', str(multiprocessing.cpu_count()//2),
|
|
141 definitions.REFSEQ_SKETCH,
|
|
142 file,
|
|
143 '| sort -gr - | head -1 -'
|
|
144 ]
|
|
145 screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
|
|
146 LOG.debug(screen_output.split('\t'))
|
|
147 species = screen_output.split('\t')[5].split('\n')[0]
|
|
148 except Exception as err2:
|
|
149 LOG.warning(
|
|
150 'No matching species found with distance screening either:{}'.format(err2)
|
|
151 )
|
|
152 return species
|