annotate data_manager_amrfinderplus/data_manager/data_manager_amrfinderplus.py @ 9:398a6357e524 draft

Uploaded
author estrain
date Mon, 23 May 2022 10:31:03 +0000
parents d6c3cee5fe48
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
36ba79e745fd Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/env python
36ba79e745fd Uploaded
estrain
parents:
diff changeset
2 # Errol Strain, estrain@gmail.com
36ba79e745fd Uploaded
estrain
parents:
diff changeset
3 # Database downloads for NCBI AMRFinderPlus
36ba79e745fd Uploaded
estrain
parents:
diff changeset
4
36ba79e745fd Uploaded
estrain
parents:
diff changeset
5 import sys
36ba79e745fd Uploaded
estrain
parents:
diff changeset
6 import os
36ba79e745fd Uploaded
estrain
parents:
diff changeset
7 import tempfile
36ba79e745fd Uploaded
estrain
parents:
diff changeset
8 import shutil
36ba79e745fd Uploaded
estrain
parents:
diff changeset
9 import json
36ba79e745fd Uploaded
estrain
parents:
diff changeset
10 import re
36ba79e745fd Uploaded
estrain
parents:
diff changeset
11 import argparse
36ba79e745fd Uploaded
estrain
parents:
diff changeset
12 from ftplib import FTP
36ba79e745fd Uploaded
estrain
parents:
diff changeset
13
36ba79e745fd Uploaded
estrain
parents:
diff changeset
14
5
d6c3cee5fe48 Uploaded
estrain
parents: 4
diff changeset
def download_from_ncbi(output_directory):
    """Download the latest AMRFinderPlus database and index it in place.

    Every entry listed in the ``latest`` database directory on the NCBI FTP
    server is fetched into *output_directory*, except entries whose name
    contains ``allele``.  BLAST databases are then built for ``AMRProt``,
    ``AMR_CDS`` and every downloaded file whose name contains ``AMR_DNA-``
    but not ``tab``; finally ``AMR.LIB`` is pressed with ``hmmpress``.

    The process working directory is switched to *output_directory* while
    the function runs and is restored before returning, also on error.

    Args:
        output_directory: Path of an existing directory that receives the
            downloaded files and the generated indexes.

    Returns:
        The first line of the downloaded ``version.txt`` with trailing
        whitespace removed.

    Raises:
        OSError: If *output_directory* cannot be entered, a local file
            cannot be written, or ``version.txt`` was not downloaded.
        ftplib.Error: If the FTP server rejects a command (other ftplib
            and socket errors may propagate as well).
    """
    NCBI_FTP_SERVER = 'ftp.ncbi.nlm.nih.gov'
    FILENAME = 'version.txt'
    NCBI_DOWNLOAD_PATH = '/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/'

    email = 'anonymous@example.com'

    previous_dir = os.getcwd()
    os.chdir(output_directory)
    try:
        # The context manager closes the connection even when a transfer
        # fails, and closing it before the (slow) indexing step avoids a
        # server-side idle timeout surfacing later as an error.
        with FTP(NCBI_FTP_SERVER) as ftp:
            ftp.login('anonymous', email)
            ftp.cwd(NCBI_DOWNLOAD_PATH)

            # exclude the allele counts folder
            downloads = [name for name in ftp.nlst() if 'allele' not in name]

            for name in downloads:
                # Close each file explicitly so its content is flushed to
                # disk before the indexing tools read it.
                with open(name, 'wb') as handle:
                    ftp.retrbinary("RETR " + name, handle.write)

        pointmuts = [
            name for name in downloads
            if 'AMR_DNA-' in name and 'tab' not in name
        ]

        # Make blast databases
        _run_indexer(['makeblastdb', '-in', 'AMRProt', '-dbtype', 'prot',
                      '-logfile', os.devnull])
        _run_indexer(['makeblastdb', '-in', 'AMR_CDS', '-dbtype', 'nucl',
                      '-logfile', os.devnull])
        for name in pointmuts:
            _run_indexer(['makeblastdb', '-in', name, '-dbtype', 'nucl',
                          '-logfile', os.devnull])

        # Make HMM indexes
        _run_indexer(['hmmpress', '-f', 'AMR.LIB'], quiet=True)

        # Read in version
        with open(FILENAME) as handle:
            version = handle.readline().rstrip()
    finally:
        os.chdir(previous_dir)

    return version


def _run_indexer(argv, quiet=False):
    """Run an external indexing tool without a shell and return its exit code.

    Passing the arguments as a list keeps file names reported by the FTP
    server from being interpreted by a shell.

    Args:
        argv: Program name followed by its arguments.
        quiet: When true, the tool's stdout and stderr are discarded.

    Returns:
        The tool's exit status, or 127 if the program could not be started.
        NOTE(review): callers in this file ignore the value, as the earlier
        ``os.system`` calls did; consider failing the job on non-zero exits.
    """
    import subprocess  # local import: the file does not import it at top level

    sink = subprocess.DEVNULL if quiet else None
    try:
        return subprocess.call(argv, stdout=sink, stderr=sink)
    except OSError as err:
        # With os.system() a missing executable was reported by the shell
        # on stderr and the script carried on; keep that behaviour.
        if not quiet:
            sys.stderr.write('%s: %s\n' % (argv[0], err))
        return 127
36ba79e745fd Uploaded
estrain
parents:
diff changeset
65
36ba79e745fd Uploaded
estrain
parents:
diff changeset
def print_json(version, argspath, argsname, argsout):
    """Write a single ``amrfinderplus`` data table entry as JSON.

    Args:
        version: Stored under the entry's ``value`` key.
        argspath: Stored under the entry's ``path`` key.
        argsname: Stored under the entry's ``name`` key.
        argsout: Path of the file to write; existing content is replaced.
    """
    data_table_entry = {
        'data_tables': {
            'amrfinderplus': [
                {
                    "value": version,
                    "name": argsname,
                    "path": argspath,
                }
            ]
        }
    }

    with open(argsout, 'w') as fh:
        json.dump(data_table_entry, fh, indent=2, sort_keys=True)
36ba79e745fd Uploaded
estrain
parents:
diff changeset
82
36ba79e745fd Uploaded
estrain
parents:
diff changeset
def main():
    """Command-line entry point: fetch the database and write the table entry.

    The file given with ``--out`` is first read as JSON to obtain the target
    directory (``output_data[0].extra_files_path``).  After the download the
    same file is overwritten with the data table entry produced by
    ``print_json``.
    """
    parser = argparse.ArgumentParser(description='Download NCBI amrFinderPlus Databases')
    # --name is not read below (the table name is derived from the database
    # version); it stays required so existing command lines keep working.
    parser.add_argument('--name', type=str, required=True, nargs=1, help='Database name')
    parser.add_argument('--out', type=str, required=True, nargs=1, help='output file')

    args = parser.parse_args()
    out_path = args.out[0]

    with open(out_path) as fh:
        params = json.load(fh)

    output_directory = params['output_data'][0]['extra_files_path']
    # Tolerate a directory that already exists and create missing parents.
    os.makedirs(output_directory, exist_ok=True)

    # Fetch the files and build blast databases
    version = download_from_ncbi(output_directory)

    tablename = "AMRFinderPlus Database " + version

    print_json(version, output_directory, tablename, out_path)
0
36ba79e745fd Uploaded
estrain
parents:
diff changeset
105
36ba79e745fd Uploaded
estrain
parents:
diff changeset
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()