annotate scripts/ReMatCh/modules/checkMLST.py @ 0:965517909457 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Wed, 22 Jan 2020 08:41:44 -0500
parents
children 0cbed1c0a762
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
1 import sys
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
2 import os
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
3 import urllib2
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
4 import urllib
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
5 import csv
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
6 from glob import glob
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
7 import re
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
8 import functools
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
9 try:
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
10 import xml.etree.cElementTree as ET
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
11 except ImportError:
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
12 import xml.etree.ElementTree as ET
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
13 import utils
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
14 import rematch_module
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
15
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
16
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def determine_species(species):
    """Normalise a species name into its lowercase genus/species tokens.

    The name is lowercased and split on runs of whitespace, so repeated,
    leading or trailing blanks never produce empty tokens. Only the first
    two tokens are kept. When the second token is a placeholder ('spp',
    'spp.' or 'complex') only the genus is returned.

    Parameters
    ----------
    species : str
        Species name, e.g. 'Escherichia coli' or 'Acinetobacter spp.'.

    Returns
    -------
    list of str
        One or two lowercase tokens. An empty or all-whitespace name gives
        [''], which is what an empty string has always produced here.
    """
    tokens = species.lower().split()
    if not tokens:
        return ['']

    if len(tokens) >= 2:
        tokens = tokens[:2]
        # Genus-level names carry a placeholder instead of a species epithet.
        if tokens[1] in ('spp', 'spp.', 'complex'):
            tokens = [tokens[0]]

    return tokens
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
26
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
27
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def check_existing_schema(species, schema_number, script_path):
    """Look for a bundled MLST schema FASTA file for a species.

    Files are searched in '<dirname(script_path)>/modules/mlst_schemas/'.
    A file is a candidate when its name starts with the underscore-joined
    species tokens (followed by '#<schema_number>' when a number is given)
    and ends with '.fasta'.

    Parameters
    ----------
    species : str
        Species name; normalised with determine_species().
    schema_number : int or str or None
        Requested schema number. None means "not specified".
    script_path : str
        Path of the main script; the schemas folder is resolved relative to
        its directory.

    Returns
    -------
    str or None
        Path to the selected schema file, or None when no file matches.
        With several candidates, the one whose name (without extension)
        ends with '#<schema_number>' is chosen ('#1' when no number was
        requested). If none has that suffix, the first candidate in sorted
        order is returned so the result is still a single, deterministic
        path.
    """
    species = determine_species(species)

    if schema_number is None:
        schema_number = ''
    else:
        schema_number = '#' + str(schema_number)

    mlst_schemas_folder = os.path.join(os.path.dirname(script_path), 'modules', 'mlst_schemas', '')
    prefix = '_'.join(species) + schema_number

    # Collect every matching file; the selection by schema number below
    # needs the full list of candidates, not just the last one seen.
    candidates = []
    for file_found in sorted(os.listdir(mlst_schemas_folder)):
        if file_found.startswith('.'):
            continue
        file_path = os.path.join(mlst_schemas_folder, file_found)
        if not os.path.isfile(file_path):
            continue
        if file_found.startswith(prefix) and file_found.endswith('.fasta'):
            candidates.append(file_path)

    if not candidates:
        return None

    if len(candidates) > 1:
        if schema_number == '':
            schema_number = '#1'
        for scheme in candidates:
            if os.path.splitext(scheme)[0].endswith(schema_number):
                return scheme

    return candidates[0]
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
56
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
57
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def write_mlst_reference(species, mlst_sequences, outdir, time_str):
    """Write the MLST reference sequences to a FASTA file inside outdir.

    The file is named '<species>.<time_str>.fasta', with '/' and ' ' in the
    species name replaced by '_'. Each entry of mlst_sequences becomes one
    record: a '>header' line followed by the pieces returned by
    rematch_module.chunkstring(sequence, 80), one per line (presumably
    80-character slices; confirm against rematch_module).

    Parameters
    ----------
    species : str
        Species name used to build the file name.
    mlst_sequences : dict
        Maps record header to sequence string.
    outdir : str
        Directory where the file is created.
    time_str : str
        Tag inserted between the species name and the extension.

    Returns
    -------
    str
        Path of the file that was written.
    """
    print('Writing MLST alleles as reference_sequences' + '\n')

    safe_species = species.replace('/', '_').replace(' ', '_')
    file_name = str(safe_species + '.' + time_str + '.fasta')
    reference_file = os.path.join(outdir, file_name)

    with open(reference_file, 'wt') as fasta_handle:
        for record_header, record_sequence in mlst_sequences.items():
            fasta_handle.write('>' + record_header + '\n')
            for piece in rematch_module.chunkstring(record_sequence, 80):
                fasta_handle.write(piece + '\n')

    return reference_file
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
68
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
69
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def getST(mlst_dicts, dict_sequences):
    """Determine the allelic profile and sequence type (ST) of a sample.

    Parameters
    ----------
    mlst_dicts : sequence
        Three items, in this order:
        [0] dict mapping locus name -> {allele sequence: allele number},
        [1] dict mapping a comma-joined allele profile -> ST,
        [2] list with the loci in profile order.
    dict_sequences : dict
        Values are dicts with a 'header' (locus name) and a 'sequence'
        entry; the keys are not used.

    Returns
    -------
    tuple
        (st, alleles_profile). alleles_profile is the comma-joined allele
        numbers in loci order, with '-' for loci without an assigned
        allele. st is the matching ST, or '-' when the profile is unknown.
    """
    SequenceDict = mlst_dicts[0]
    STdict = mlst_dicts[1]
    lociOrder = mlst_dicts[2]

    alleles_profile = ['-'] * len(lociOrder)
    for sequence_data in dict_sequences.values():
        locus = sequence_data['header']
        sequence = sequence_data['sequence']

        if locus not in SequenceDict:
            print(locus + ' not found between consensus sequences!')
            # NOTE(review): this stops processing of every remaining
            # sequence, not only this one - confirm that is intended.
            break

        known_alleles = SequenceDict[locus]
        # Direct dict membership is a hash lookup; going through .keys()
        # would build and scan a list on Python 2.
        if sequence in known_alleles:
            alleles_profile[lociOrder.index(locus)] = known_alleles[sequence]
        else:
            # No exact match: accept any known allele contained in the
            # sample sequence. If several are contained, the last one
            # visited wins.
            for sequence_st, allele_number in known_alleles.items():
                if sequence_st in sequence:
                    alleles_profile[lociOrder.index(locus)] = allele_number

    alleles_profile = ','.join(alleles_profile)
    st = STdict.get(alleles_profile, '-')

    return st, alleles_profile
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
94
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
95
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
# Decorator wrapping the PubMLST download step with utils.timer.
downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module')


@downloadPubMLST
def downloadPubMLSTxml(originalSpecies, schema_number, outdir):
    """Fetch (or load from a local cache) the PubMLST schema of a species.

    The PubMLST database index is downloaded and searched for the scheme
    whose species matches originalSpecies (compared through
    determine_species()) and whose number matches schema_number. Schema
    data is kept under '<outdir>/pubmlst/<species>_<retrieval date>/'; when
    a pickle for that exact retrieval date already exists it is loaded
    instead of downloading again, and directories of older retrievals are
    removed.

    Parameters
    ----------
    originalSpecies : str
        Species name as given by the user.
    schema_number : int or str or None
        Scheme number (the part after '#' in the PubMLST scheme name).
        None selects scheme 1; entries without '#' count as scheme 1.
    outdir : str
        Output directory; a 'pubmlst' sub-directory is created in it.

    Returns
    -------
    tuple
        (mlst_dicts, mlst_sequences) where mlst_dicts is
        [SequenceDict, STdict, lociOrder]:
        SequenceDict maps locus -> {uppercase allele sequence: allele
        number (digits of the FASTA header)}, STdict maps a comma-joined
        allele profile -> ST, lociOrder lists the loci in profile order.
        mlst_sequences maps each locus to one of its allele sequences.

    Raises
    ------
    SystemExit
        When no scheme is found for the species.
    Exception
        Any error raised while downloading or parsing the database index
        is re-raised after printing a hint.
    """
    print('Searching MLST database for ' + originalSpecies)

    xmlURL = 'http://pubmlst.org/data/dbases.xml'
    try:
        content = urllib2.urlopen(xmlURL)
        xml = content.read()
        tree = ET.fromstring(xml)
    except Exception:
        print("Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL)
        raise

    xmlData = {}

    if schema_number is None:
        schema_number = 1
    # The number parsed from the XML is text while the requested one is
    # usually an int; compare both as text so 'Name#2' matches 2.
    wanted_number = str(schema_number).strip()
    wanted_species = determine_species(originalSpecies)

    success = 0
    for scheme in tree.findall('species'):
        species_scheme = scheme.text.splitlines()[0].rsplit('#', 1)
        number_scheme = species_scheme[1].strip() if len(species_scheme) == 2 else '1'
        species_scheme = species_scheme[0]
        if determine_species(species_scheme) != wanted_species:
            continue
        if wanted_number != number_scheme:
            continue

        success += 1
        scheme_name = scheme.text.strip()
        xmlData[scheme_name] = {}
        for info in scheme:  # mlst
            for database in info:  # database
                for retrievedDate in database.findall('retrieved'):
                    retrieved = retrievedDate.text
                    xmlData[scheme_name][retrieved] = []
                for profile in database.findall('profiles'):
                    profileURl = profile.find('url').text
                    xmlData[scheme_name][retrieved].append(profileURl)
                for lociScheme in database.findall('loci'):
                    loci = {}
                    for locus in lociScheme:
                        locusID = locus.text
                        for locusInfo in locus:
                            locusUrl = locusInfo.text
                            loci[locusID.strip()] = locusUrl
                    xmlData[scheme_name][retrieved].append(loci)

    if success == 0:
        sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies))
    elif success > 1:
        keys = sorted(xmlData.keys())
        print("\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0]))
        for key in keys[1:]:
            del xmlData[key]

    pubmlst_dir = os.path.join(outdir, 'pubmlst', '')
    if not os.path.isdir(pubmlst_dir):
        os.makedirs(pubmlst_dir)

    for SchemaName, info in xmlData.items():
        STdict = {}
        SequenceDict = {}
        mlst_sequences = {}

        species_name = '_'.join(determine_species(SchemaName)).replace('/', '_')

        # Each value is [profiles URL, {locus name: alleles FASTA URL}].
        for RetrievalDate, URL in info.items():
            schema_date = species_name + '_' + RetrievalDate
            outDit = os.path.join(pubmlst_dir, schema_date)  # compatible with windows? See if it already exists, if so, break

            if os.path.isdir(outDit):
                pickle_file = os.path.join(outDit, str(schema_date + '.pkl'))
                if os.path.isfile(pickle_file):
                    print("\tschema files already exist for %s" % (SchemaName))
                    # NOTE(review): presumably unpickles this file; only
                    # safe while outdir is trusted - confirm in utils.
                    mlst_dicts = utils.extractVariableFromPickle(pickle_file)
                    SequenceDict = mlst_dicts[0]
                    # Keep a single allele sequence per locus.
                    for lociName, alleleSequences in SequenceDict.items():
                        for sequence in alleleSequences:
                            if lociName not in mlst_sequences:
                                mlst_sequences[lociName] = sequence
                            else:
                                break
                    return mlst_dicts, mlst_sequences
            else:
                if any(species_name in x for x in os.listdir(pubmlst_dir)):
                    print("Older version of %s's scheme found! Deleting..." % (SchemaName))
                    for directory in glob(str(pubmlst_dir + str(species_name + '_*'))):
                        utils.removeDirectory(directory)
                # Created exactly once, after any stale directories are
                # gone, whether or not the glob matched anything.
                os.makedirs(outDit)

            contentProfile = urllib2.urlopen(URL[0])
            try:
                profileFile = csv.reader(contentProfile, delimiter='\t')
                profile_header = next(profileFile)
                # Allele columns sit between the ST column and the clonal
                # complex column; without the latter they run to the end.
                if 'clonal_complex' in profile_header:
                    indexCC = profile_header.index('clonal_complex')
                elif 'CC' in profile_header:
                    indexCC = profile_header.index('CC')
                else:
                    indexCC = len(profile_header)
                lociOrder = profile_header[1:indexCC]
                for row in profileFile:
                    if not row:
                        # Blank line in the profiles table.
                        continue
                    ST = row[0]
                    alleles = ','.join(row[1:indexCC])
                    STdict[alleles] = ST
            finally:
                contentProfile.close()

            for lociName, lociURL in URL[1].items():
                if lociName not in SequenceDict:
                    SequenceDict[lociName] = {}
                url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1])
                urllib.urlretrieve(lociURL, url_file)
                sequences, _, _ = rematch_module.get_sequence_information(url_file, 0)
                for key in sequences:
                    # Allele number = the digits of the FASTA header.
                    allele_number = re.sub(r"\D", "", sequences[key]['header'])
                    sequence = sequences[key]['sequence'].upper()
                    SequenceDict[lociName][sequence] = allele_number
                    if lociName not in mlst_sequences:
                        mlst_sequences[lociName] = sequence
                os.remove(url_file)

            mlst_dicts = [SequenceDict, STdict, lociOrder]
            utils.saveVariableToPickle(mlst_dicts, outDit, schema_date)

    return mlst_dicts, mlst_sequences