comparison gsaf_download.py @ 0:c892ae46653b draft

Uploaded
author eric-rasche
date Fri, 05 Dec 2014 14:24:41 -0500
parents
children ba70ea57cef9
comparison
equal deleted inserted replaced
-1:000000000000 0:c892ae46653b
#!/usr/bin/env python
# gsaf_download.py -- Galaxy tool script.
#
# Parses "gsafjson" records embedded as HTML comments in a GSAF download
# page (file.html) and writes one galaxy.json metadata line per dataset.
# NOTE: this is a Python 2 script (urllib2, urllib.urlretrieve).
import urllib, urllib2
import sys
import subprocess
import json
import hashlib
import os
import logging
# DEBUG level so the tool's progress is visible in Galaxy job logs.
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()
11
# Ensure the download target directory exists.  Only the "already
# exists" failure is safe to ignore; any other OSError (permissions,
# read-only filesystem, ...) must surface instead of being swallowed
# by a bare except.
try:
    os.makedirs("output")
except OSError:
    if not os.path.isdir("output"):
        raise
17
18 URL = sys.argv[1]
19 DATASET_ID = sys.argv[2]
20
21 #req = urllib2.urlopen(URL)
22 #page = req.read()
23 with open('file.html', 'r') as handle:
24 page = handle.read()
25 comments = [x.strip() for x in page.split('\n') if x.strip().startswith('<!--')]
26
27 gx_json = open('galaxy.json', 'w')
28
29 prefix = '<!--gsafjson'
30 suffix = '-->'
31 for gsafjson in [x[len(prefix): -len(suffix)].strip() for x in comments if 'gsafjson' in x]:
32 #{"file_type":"fastq.gz",
33 # "JA":"****",
34 # "sample_name":"Sample10",
35 # "user_data":{"description":{} } ,
36 # "md5":"2ea00f4eef8f6a2344b80fbe12ab2eb7",
37 # "url":"http://gsaf.s3.amazonaws.com/****",
38 # "size_in_mb":45,
39 # "filename":"Sample10_S8_L001_R1_001.fastq.gz",
40 # "reads":["Sample10_S8_L001_R1_001.fastq.gz","Sample10_S8_L001_R2_001.fastq.gz"],
41 # "SA":"***"}
42 data = json.loads(gsafjson)
43 log.info("Fetching %s" % data['filename'])
44 file_path = os.path.join('output', data['filename'])
45 #urllib.urlretrieve(data['url'], file_path)
46 log.info("Hashing file")
47 #file_md5 = hashlib.md5(open(file_path).read()).hexdigest()
48 #log.debug("Hashed to %s" % file_md5)
49
50 stderr = ''
51 #if file_md5 != data['md5']:
52 #stderr = 'md5sum mismatch: %s != %s' % (file_md5, data['md5'])
53
54 # Galaxy.json
55 # {"name": "lambda.fa", "stdout": "uploaded fasta file", "line_count": 811, "ext": "fasta", "dataset_id": 16220, "type": "dataset"}
56 line_count = subprocess.check_output(['wc', '-l', file_path]).strip()
57 line_count = line_count[0:line_count.index(' ')]
58
59 galaxy_json = {
60 'name': data['filename'].strip(data['file_type']),
61 'stdout': None,
62 'stderr': stderr,
63 'line_count': int(line_count),
64 # TODO, check that data is really .gz
65 'ext': data['file_type'].strip('.gz'),
66 'dataset_id': DATASET_ID,
67 'type': 'dataset'
68 }
69
70 #try:
71 #subprocess.check_call(['gunzip', file_path])
72 #except:
73 #log.error("Couldn't extract %s" % data['filename'])
74
75 gx_json.write(json.dumps(galaxy_json) + "\n")
76