Mercurial > repos > eric-rasche > gsaf_downloader
comparison gsaf_download.py @ 0:c892ae46653b draft
Uploaded
author | eric-rasche |
---|---|
date | Fri, 05 Dec 2014 14:24:41 -0500 |
parents | |
children | ba70ea57cef9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c892ae46653b |
---|---|
1 #!/usr/bin/env python | |
2 import urllib, urllib2 | |
3 import sys | |
4 import subprocess | |
5 import json | |
6 import hashlib | |
7 import os | |
8 import logging | |
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()

# Working-directory relative paths used by the script.
OUTPUT_DIR = 'output'
PAGE_PATH = 'file.html'
MANIFEST_PATH = 'galaxy.json'

# Markers delimiting the JSON payload embedded in an HTML comment line.
GSAF_PREFIX = '<!--gsafjson'
GSAF_SUFFIX = '-->'


def ensure_dir(path):
    """Create directory *path* if it does not exist yet.

    Raises OSError when the directory is missing and cannot be created
    (an already-existing directory is not an error).
    """
    try:
        os.makedirs(path)
    except OSError:
        # Only "already exists as a directory" is acceptable here.
        if not os.path.isdir(path):
            raise


def strip_suffix(text, suffix):
    """Return *text* without a trailing *suffix*; unchanged if absent.

    Unlike ``str.strip(chars)``, which removes any of the given characters
    from both ends, this removes exactly one literal suffix.
    """
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def extract_gsaf_records(page):
    """Yield the decoded JSON payload of each ``<!--gsafjson ... -->`` line.

    Only lines that (after trimming whitespace) start with the gsafjson
    comment marker and end with the comment terminator are considered.
    Payloads are decoded lazily, one per iteration; ``json.loads`` raises
    ValueError on a malformed payload.
    """
    for raw_line in page.split('\n'):
        line = raw_line.strip()
        if line.startswith(GSAF_PREFIX) and line.endswith(GSAF_SUFFIX):
            payload = line[len(GSAF_PREFIX):-len(GSAF_SUFFIX)].strip()
            yield json.loads(payload)


def count_lines(path):
    """Return the line count of *path* as reported by ``wc -l``.

    Raises subprocess.CalledProcessError if ``wc`` fails (e.g. the file
    does not exist).
    """
    output = subprocess.check_output(['wc', '-l', path])
    # ``wc`` prints "<count> <path>", possibly with leading padding;
    # split() copes with both and with bytes or text output.
    return int(output.split()[0])


def build_galaxy_entry(data, line_count, dataset_id, stderr=''):
    """Build one galaxy.json record for a GSAF file description.

    data       -- decoded gsafjson dict; 'filename' and 'file_type' are read
    line_count -- integer line count to report
    dataset_id -- dataset identifier, passed through unchanged
    stderr     -- error text to report (empty when there is none)
    """
    file_type = data['file_type']
    return {
        # Drop the ".<file_type>" extension from the display name.
        'name': strip_suffix(data['filename'], '.' + file_type),
        'stdout': None,
        'stderr': stderr,
        'line_count': int(line_count),
        # TODO, check that data is really .gz
        'ext': strip_suffix(file_type, '.gz'),
        'dataset_id': dataset_id,
        'type': 'dataset',
    }


def main(argv=None):
    """Write one galaxy.json line per gsafjson record found in the page.

    argv -- [URL, DATASET_ID]; defaults to ``sys.argv[1:]``. URL is accepted
            but not used while the page download below is disabled.
    """
    if argv is None:
        argv = sys.argv[1:]
    if len(argv) < 2:
        sys.exit('usage: gsaf_download.py URL DATASET_ID')
    dataset_id = argv[1]

    ensure_dir(OUTPUT_DIR)

    # TODO: page download is disabled; a local copy is read instead.
    #   page = urllib2.urlopen(argv[0]).read()
    with open(PAGE_PATH, 'r') as handle:
        page = handle.read()

    # Example gsafjson payload:
    # {"file_type":"fastq.gz",
    #  "JA":"****",
    #  "sample_name":"Sample10",
    #  "user_data":{"description":{} } ,
    #  "md5":"2ea00f4eef8f6a2344b80fbe12ab2eb7",
    #  "url":"http://gsaf.s3.amazonaws.com/****",
    #  "size_in_mb":45,
    #  "filename":"Sample10_S8_L001_R1_001.fastq.gz",
    #  "reads":["Sample10_S8_L001_R1_001.fastq.gz","Sample10_S8_L001_R2_001.fastq.gz"],
    #  "SA":"***"}
    #
    # Example galaxy.json line:
    # {"name": "lambda.fa", "stdout": "uploaded fasta file", "line_count": 811, "ext": "fasta", "dataset_id": 16220, "type": "dataset"}
    with open(MANIFEST_PATH, 'w') as gx_json:
        for data in extract_gsaf_records(page):
            log.info("Fetching %s", data['filename'])
            file_path = os.path.join(OUTPUT_DIR, data['filename'])
            # TODO: file download is disabled.
            #   urllib.urlretrieve(data['url'], file_path)

            log.info("Hashing file")
            stderr = ''
            # TODO: md5 verification is disabled.
            #   file_md5 = hashlib.md5(open(file_path).read()).hexdigest()
            #   if file_md5 != data['md5']:
            #       stderr = 'md5sum mismatch: %s != %s' % (file_md5, data['md5'])

            # NOTE(review): the count is taken before any decompression,
            # so for a .gz file it counts newline bytes of the compressed
            # data - confirm this is intended.
            line_count = count_lines(file_path)
            entry = build_galaxy_entry(data, line_count, dataset_id, stderr)

            # TODO: extraction is disabled.
            #   subprocess.check_call(['gunzip', file_path])

            gx_json.write(json.dumps(entry) + "\n")


if __name__ == '__main__':
    main()
76 |