Mercurial > repos > jpetteng > ectyper
comparison ecoli_serotyping/speciesIdentification.py @ 6:fe3ceb5c4214 draft
Uploaded
author | jpetteng |
---|---|
date | Fri, 05 Jan 2018 15:43:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:a202cc394af8 | 6:fe3ceb5c4214 |
---|---|
1 import logging | |
2 import multiprocessing | |
3 import os | |
4 import tempfile | |
5 from Bio import SeqIO | |
6 | |
7 from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util | |
8 | |
9 LOG = logging.getLogger(__name__) | |
10 | |
11 def is_ecoli_genome(iden_file, genome_file=None, mash=False): | |
12 ''' | |
13 Return True if file is classified as ecoli by ecoli markers, otherwise False | |
14 | |
15 Args: | |
16 iden_file (str): path to valid fasta genome file | |
17 genome_file (str): Optional path to valid fastq file for reads | |
18 mash (bool): Optional input to decide whether to use mash if genome is | |
19 identified as non-ecoli | |
20 | |
21 Returns: | |
22 bool: True if iden_file is ecoli, False otherwise | |
23 ''' | |
24 if genome_file is None: | |
25 genome_file = iden_file | |
26 num_hit = get_num_hits(iden_file) | |
27 if num_hit < 3: | |
28 LOG.info( | |
29 "{0} is identified as " | |
30 "an invalid e.coli genome file " | |
31 "by marker approach".format(os.path.basename(iden_file))) | |
32 if mash: | |
33 species = get_species(genome_file) | |
34 LOG.info( | |
35 "{0} is identified as genome of " | |
36 "{1} by mash approach".format(os.path.basename(iden_file), species)) | |
37 return False | |
38 LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file))) | |
39 return True | |
40 | |
41 def get_num_hits(target): | |
42 ''' | |
43 Return number of matching hits when query the reference genome | |
44 on the target genome | |
45 | |
46 Args: | |
47 target (str): target genome | |
48 | |
49 Returns: | |
50 int: number of hits found | |
51 ''' | |
52 num_hit = 0 | |
53 try: | |
54 with tempfile.TemporaryDirectory() as temp_dir: | |
55 blast_db = blastFunctions.create_blast_db([target], temp_dir) | |
56 result = blastFunctions.run_blast_for_identification( | |
57 definitions.ECOLI_MARKERS, | |
58 blast_db | |
59 ) | |
60 with open(result) as handler: | |
61 LOG.debug("get_num_hits() output:") | |
62 for line in handler: | |
63 LOG.debug(line) | |
64 num_hit += 1 | |
65 LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit)) | |
66 except SystemExit: | |
67 pass | |
68 return num_hit | |
69 | |
70 def get_species(file): | |
71 ''' | |
72 Given a fasta/fastq file, return the most likely species identification | |
73 | |
74 Args: | |
75 file (str): fasta/fastq file input | |
76 | |
77 Returns: | |
78 str: name of estimated species | |
79 ''' | |
80 LOG.info("Identifying species for {0}".format(file)) | |
81 if not os.path.isfile(definitions.REFSEQ_SKETCH): | |
82 LOG.warning("No refseq found.") | |
83 return None | |
84 species = 'unknown' | |
85 if genomeFunctions.get_valid_format(file) == 'fasta': | |
86 tmp_file = tempfile.NamedTemporaryFile().name | |
87 basename = os.path.basename(file).replace(' ', '_') | |
88 with open(tmp_file, 'w') as new_fh: | |
89 header = '> {0}\n'.format(basename) | |
90 new_fh.write(header) | |
91 for record in SeqIO.parse(file, 'fasta'): | |
92 new_fh.write(str(record.seq)) | |
93 new_fh.write('nnnnnnnnnnnnnnnnnnnn') | |
94 try: | |
95 species = get_species_helper(tmp_file) | |
96 except Exception: | |
97 pass | |
98 if genomeFunctions.get_valid_format(file) == 'fastq': | |
99 species = get_species_helper(file) | |
100 return species | |
101 | |
102 def get_species_helper(file): | |
103 ''' | |
104 Given a fasta/fastq file with one sequence, return the most likely species | |
105 identification | |
106 | |
107 Args: | |
108 file (str): fasta/fastq file input | |
109 | |
110 Returns: | |
111 str: name of estimated species | |
112 ''' | |
113 species = 'unknown' | |
114 cmd = [ | |
115 'mash', 'dist', | |
116 file, | |
117 definitions.REFSEQ_SKETCH, | |
118 '|', | |
119 'sort -gk3 -', | |
120 '|', | |
121 'head -1 -' | |
122 ] | |
123 try: | |
124 mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) | |
125 ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2]) | |
126 cmd = [ | |
127 'grep -E', | |
128 ass_acc_num, | |
129 definitions.REFSEQ_SUMMARY | |
130 ] | |
131 summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) | |
132 species = summary_output.split('\t')[7] | |
133 return species | |
134 except Exception as err: | |
135 LOG.warning('No matching species found with distance estimation:{0}'.format(err)) | |
136 try: | |
137 cmd = [ | |
138 'mash screen', | |
139 '-w', | |
140 '-p', str(multiprocessing.cpu_count()//2), | |
141 definitions.REFSEQ_SKETCH, | |
142 file, | |
143 '| sort -gr - | head -1 -' | |
144 ] | |
145 screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) | |
146 LOG.debug(screen_output.split('\t')) | |
147 species = screen_output.split('\t')[5].split('\n')[0] | |
148 except Exception as err2: | |
149 LOG.warning( | |
150 'No matching species found with distance screening either:{}'.format(err2) | |
151 ) | |
152 return species |