comparison ecoli_serotyping/speciesIdentification.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:a202cc394af8 6:fe3ceb5c4214
1 import logging
2 import multiprocessing
3 import os
4 import tempfile
5 from Bio import SeqIO
6
7 from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util
8
9 LOG = logging.getLogger(__name__)
10
11 def is_ecoli_genome(iden_file, genome_file=None, mash=False):
12 '''
13 Return True if file is classified as ecoli by ecoli markers, otherwise False
14
15 Args:
16 iden_file (str): path to valid fasta genome file
17 genome_file (str): Optional path to valid fastq file for reads
18 mash (bool): Optional input to decide whether to use mash if genome is
19 identified as non-ecoli
20
21 Returns:
22 bool: True if iden_file is ecoli, False otherwise
23 '''
24 if genome_file is None:
25 genome_file = iden_file
26 num_hit = get_num_hits(iden_file)
27 if num_hit < 3:
28 LOG.info(
29 "{0} is identified as "
30 "an invalid e.coli genome file "
31 "by marker approach".format(os.path.basename(iden_file)))
32 if mash:
33 species = get_species(genome_file)
34 LOG.info(
35 "{0} is identified as genome of "
36 "{1} by mash approach".format(os.path.basename(iden_file), species))
37 return False
38 LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file)))
39 return True
40
41 def get_num_hits(target):
42 '''
43 Return number of matching hits when query the reference genome
44 on the target genome
45
46 Args:
47 target (str): target genome
48
49 Returns:
50 int: number of hits found
51 '''
52 num_hit = 0
53 try:
54 with tempfile.TemporaryDirectory() as temp_dir:
55 blast_db = blastFunctions.create_blast_db([target], temp_dir)
56 result = blastFunctions.run_blast_for_identification(
57 definitions.ECOLI_MARKERS,
58 blast_db
59 )
60 with open(result) as handler:
61 LOG.debug("get_num_hits() output:")
62 for line in handler:
63 LOG.debug(line)
64 num_hit += 1
65 LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit))
66 except SystemExit:
67 pass
68 return num_hit
69
70 def get_species(file):
71 '''
72 Given a fasta/fastq file, return the most likely species identification
73
74 Args:
75 file (str): fasta/fastq file input
76
77 Returns:
78 str: name of estimated species
79 '''
80 LOG.info("Identifying species for {0}".format(file))
81 if not os.path.isfile(definitions.REFSEQ_SKETCH):
82 LOG.warning("No refseq found.")
83 return None
84 species = 'unknown'
85 if genomeFunctions.get_valid_format(file) == 'fasta':
86 tmp_file = tempfile.NamedTemporaryFile().name
87 basename = os.path.basename(file).replace(' ', '_')
88 with open(tmp_file, 'w') as new_fh:
89 header = '> {0}\n'.format(basename)
90 new_fh.write(header)
91 for record in SeqIO.parse(file, 'fasta'):
92 new_fh.write(str(record.seq))
93 new_fh.write('nnnnnnnnnnnnnnnnnnnn')
94 try:
95 species = get_species_helper(tmp_file)
96 except Exception:
97 pass
98 if genomeFunctions.get_valid_format(file) == 'fastq':
99 species = get_species_helper(file)
100 return species
101
102 def get_species_helper(file):
103 '''
104 Given a fasta/fastq file with one sequence, return the most likely species
105 identification
106
107 Args:
108 file (str): fasta/fastq file input
109
110 Returns:
111 str: name of estimated species
112 '''
113 species = 'unknown'
114 cmd = [
115 'mash', 'dist',
116 file,
117 definitions.REFSEQ_SKETCH,
118 '|',
119 'sort -gk3 -',
120 '|',
121 'head -1 -'
122 ]
123 try:
124 mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
125 ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2])
126 cmd = [
127 'grep -E',
128 ass_acc_num,
129 definitions.REFSEQ_SUMMARY
130 ]
131 summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
132 species = summary_output.split('\t')[7]
133 return species
134 except Exception as err:
135 LOG.warning('No matching species found with distance estimation:{0}'.format(err))
136 try:
137 cmd = [
138 'mash screen',
139 '-w',
140 '-p', str(multiprocessing.cpu_count()//2),
141 definitions.REFSEQ_SKETCH,
142 file,
143 '| sort -gr - | head -1 -'
144 ]
145 screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
146 LOG.debug(screen_output.split('\t'))
147 species = screen_output.split('\t')[5].split('\n')[0]
148 except Exception as err2:
149 LOG.warning(
150 'No matching species found with distance screening either:{}'.format(err2)
151 )
152 return species