annotate data_manager/fetch_refseq.py @ 0:8b91891ae805 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
author iuc
date Mon, 01 Oct 2018 15:36:01 -0400
parents
children d58cad5baa70
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
1 #!/usr/bin/env python
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
2
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
3 from __future__ import division, print_function
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
4
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
5 import argparse
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
6 import functools
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
7 import gzip
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
8 import json
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
9 import os
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
10 import os.path
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
11 import sys
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
12 from datetime import date
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
13 from multiprocessing import Process, Queue
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
14
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
15 import requests
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
16
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
17 try:
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
18 from io import StringIO
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
19 except ImportError:
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
20 from StringIO import StringIO
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
21 # Refseq structure
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
22 # - Release number
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
23 # - Divisions
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
24 # 1. archaea
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
25 # 2. bacteria
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
26 # 3. fungi
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
27 # 4. invertebrate
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
28 # 5. mitochondrion
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
29 # 6. other
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
30 # 7. plant
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
31 # 8. plasmid
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
32 # 9. plastid
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
33 # 10. protozoa
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
34 # 11. vertebrate mammalian
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
35 # 12. vertebrate other
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
36 # 13. viral
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
37 # within each division
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
38 # DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
39 # where fna and faa are FASTA, gbff and gpff are Genbank
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
40
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
41
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
42 def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
43 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
44 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', [])
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
45 data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
46 return data_manager_dict
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
47
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
48
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False):
    """Merge the decompressed contents of queued gzip files into a single file.

    File names are pulled from ``conn`` with ``conn.get()`` until the sentinel
    string 'STOP' arrives. Every named file is read through gzip, its contents
    are appended to ``out_dir/output_filename``, and the input file is deleted.

    :param conn: object with a ``get()`` method yielding file names, then 'STOP'
    :param out_dir: directory in which the merged file is created
    :param output_filename: name of the merged file inside ``out_dir``
    :param chunk_size: number of decompressed bytes requested per read
    :param debug: when true, report each input file on stderr
    :param compress: when true, the merged file is written gzip-compressed
    """
    # The first name is fetched before the output file is created.
    pending = conn.get()
    destination = os.path.join(out_dir, output_filename)
    opener = gzip.open if compress else open
    with opener(destination, 'wb') as sink:
        while pending != 'STOP':
            if debug:
                print('Reading', pending, file=sys.stderr)
            with gzip.open(pending, 'rb') as source:
                block = source.read(chunk_size)
                # An empty bytes object marks end of file.
                while block != b'':
                    sink.write(block)
                    block = source.read(chunk_size)
            # The input is no longer needed once it has been merged.
            os.unlink(pending)
            pending = conn.get()
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
65
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
66
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
    """Download one RefSeq release division and merge its FASTA files per molecule type.

    The current release number is read from the RELEASE_NUMBER file on the NCBI
    server, the division's directory listing is scanned for files whose names
    end with the suffix of a requested molecule type, and each matching file is
    downloaded into ``output_directory``. One worker process per molecule type
    (running ``unzip_to``) concatenates the downloaded archives into
    ``<division_name>.<release>.<mol_type>.fasta`` (plus '.gz' when ``compress``
    is set). The function waits for the workers before returning, so the
    returned file names refer to finished files.

    :param division_name: RefSeq division to fetch (see ``valid_divisions`` below)
    :param mol_types: iterable of 'genomic', 'protein' and/or 'rna'
    :param output_directory: directory for downloads and merged output; created if missing
    :param debug: when true, progress messages are printed to stderr
    :param compress: when true, merged output files are gzip-compressed
    :return: ``[release_num, final_output_filenames]``; the file names are
        relative to ``output_directory``
    :raises AssertionError: on an unknown division or molecule type, or on a
        listing line that does not have the expected format
    :raises requests.HTTPError: when the server answers with an error status
    """
    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
    # NCBI publishes this division as 'archaea'. The misspelt 'archea' is still
    # accepted so existing callers keep working; it is mapped to the real
    # directory name when building the URL.
    division_aliases = {'archea': 'archaea'}
    valid_divisions = set(['archaea', 'archea', 'bacteria', 'complete', 'fungi', 'invertebrate',
                           'mitochondrion', 'other', 'plant', 'plasmid', 'plastid', 'protozoa',
                           'vertebrate_mammalian', 'vertebrate_other', 'viral'])
    ending_mappings = {
        'genomic': '.genomic.fna.gz',
        'protein': '.protein.faa.gz',
        'rna': '.rna.fna.gz',
    }
    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
    for mol_type in mol_types:
        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
    if not os.path.exists(output_directory):
        # makedirs also copes with a nested output path.
        os.makedirs(output_directory)

    # Fetch everything needed up front, before any worker process exists, so
    # that a failure here cannot leave workers waiting on their queues.
    r = requests.get(base_url + 'RELEASE_NUMBER')
    r.raise_for_status()
    release_num = r.text.strip()
    division_base_url = base_url + division_aliases.get(division_name, division_name)
    if debug:
        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
    r = requests.get(division_base_url)
    r.raise_for_status()
    listing_text = r.text

    unzip_queues = {}
    unzip_processes = []
    final_output_filenames = []
    for mol_type in mol_types:
        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
        if compress:
            output_filename += '.gz'
        final_output_filenames.append(output_filename)
        if mol_type in unzip_queues:
            # A repeated molecule type must not get a second worker writing to
            # the same output file.
            continue
        q = unzip_queues[mol_type] = Queue()
        worker = Process(target=unzip_to, args=(q, output_directory, output_filename),
                         kwargs=dict(debug=debug, compress=compress))
        worker.start()
        unzip_processes.append(worker)

    try:
        # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a> 2018-07-13 00:59 10M
        for line in StringIO(listing_text):
            if '.gz' not in line:
                continue
            parts = line.split('"')
            assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
            filename = parts[1]
            for mol_type in mol_types:
                ending = ending_mappings[mol_type]
                if not filename.endswith(ending):
                    continue
                if debug:
                    print('Downloading:', filename, ending, mol_type, file=sys.stderr)
                output_filename = os.path.join(output_directory, filename)
                # Stream the archive to disk instead of buffering it in memory.
                r = requests.get(division_base_url + '/' + filename, stream=True)
                try:
                    # Never hand an HTTP error page to the gzip reader.
                    r.raise_for_status()
                    with open(output_filename, 'wb') as output_file:
                        for chunk in r.iter_content(chunk_size=4096):
                            output_file.write(chunk)
                finally:
                    r.close()
                # Only completely downloaded files are handed to the worker.
                unzip_queues[mol_type].put(output_filename)
                # The suffixes are mutually exclusive: one file, one molecule type.
                break
    finally:
        # Always release the workers, even when a download failed, and wait
        # until they have finished writing the merged files.
        for q in unzip_queues.values():
            q.put('STOP')
        for worker in unzip_processes:
            worker.join()

    return [release_num, final_output_filenames]
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
128
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
129
8b91891ae805 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
iuc
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point: download the requested RefSeq divisions and,
    # when invoked as a Galaxy data manager, record one 'all_fasta' data
    # table entry per (division, molecule type) in the data manager JSON file.
    parser = argparse.ArgumentParser(description='Download RefSeq databases')
    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
    # Both list options are split on ',' below, so they must be present;
    # required=True yields a usage error instead of an AttributeError on None.
    parser.add_argument('--division_names', required=True, help='RefSeq divisions to download')
    parser.add_argument('--mol_types', required=True, help='Molecule types (genomic, rna, protein) to fetch')
    parser.add_argument('--pin_date', help='Force download date to this version string')
    args = parser.parse_args()

    division_names = args.division_names.split(',')
    mol_types = args.mol_types.split(',')

    use_data_manager = args.galaxy_datamanager_filename is not None
    if use_data_manager:
        # The same file is read here for its parameters and later overwritten
        # with the resulting data table entries.
        with open(args.galaxy_datamanager_filename) as dm_input:
            dm_opts = json.load(dm_input)
        output_directory = dm_opts['output_data'][0]['extra_files_path']  # take the extra_files_path of the first output parameter
        data_manager_dict = {}
    else:
        output_directory = args.output_directory

    # The date label does not depend on the division, so compute it once.
    if args.pin_date is not None:
        today_str = args.pin_date
    else:
        today_str = date.today().strftime('%Y-%m-%d')  # ISO 8601 date format

    for division_name in division_names:
        [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress)
        if not use_data_manager:
            continue

        for i, mol_type in enumerate(mol_types):
            fasta_file = fasta_files[i]
            # Explicit check rather than assert, so it still runs under `python -O`.
            if mol_type not in fasta_file:
                raise ValueError("Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_file))
            unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type  # note: this is now same as dbkey
            dbkey = unique_key
            desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
            path = os.path.join(output_directory, fasta_file)
            _add_data_table_entry(data_manager_dict=data_manager_dict,
                                  data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
                                  data_table_name='all_fasta')

        # Rewrite the JSON after every division so the file always lists the
        # entries for all divisions processed so far; the context manager
        # guarantees the data is flushed and the handle closed.
        with open(args.galaxy_datamanager_filename, 'w') as dm_output:
            dm_output.write(json.dumps(data_manager_dict, sort_keys=True))