annotate gsaf_download.py @ 3:ba70ea57cef9 draft

Removed testing code
author eric-rasche
date Fri, 05 Dec 2014 14:34:49 -0500
parents c892ae46653b
children a90d7b00d727
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c892ae46653b Uploaded
eric-rasche
parents:
diff changeset
#!/usr/bin/env python
"""Download the files advertised on a GSAF listing page for Galaxy.

Usage: gsaf_download.py URL DATASET_ID

The page at URL is scanned for one-line HTML comments of the form
``<!--gsafjson {...} -->``.  Each comment holds one JSON record, e.g.::

    {"file_type":"fastq.gz",
     "JA":"****",
     "sample_name":"Sample10",
     "user_data":{"description":{} } ,
     "md5":"2ea00f4eef8f6a2344b80fbe12ab2eb7",
     "url":"http://gsaf.s3.amazonaws.com/****",
     "size_in_mb":45,
     "filename":"Sample10_S8_L001_R1_001.fastq.gz",
     "reads":["Sample10_S8_L001_R1_001.fastq.gz","Sample10_S8_L001_R2_001.fastq.gz"],
     "SA":"***"}

Only ``filename``, ``file_type``, ``url`` and ``md5`` are used.  Every file is
downloaded into ``output/``, hashed, and described by one JSON line written to
``galaxy.json`` in the current directory, e.g.::

    {"name": "lambda.fa", "stdout": "uploaded fasta file", "line_count": 811,
     "ext": "fasta", "dataset_id": 16220, "type": "dataset"}
"""
import hashlib
import json
import logging
import os
import subprocess  # noqa: F401 -- no longer used here; kept from the original import list
import sys
import urllib  # noqa: F401 -- kept from the original import list

try:  # Python 2
    import urllib2  # noqa: F401
    from urllib import urlretrieve
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen, urlretrieve

log = logging.getLogger()

OUTPUT_DIR = 'output'
GALAXY_JSON = 'galaxy.json'
PREFIX = '<!--gsafjson'
SUFFIX = '-->'
CHUNK_SIZE = 1024 * 1024


def remove_suffix(text, suffix):
    """Return *text* without a trailing *suffix*; unchanged if it is absent.

    ``str.strip(chars)`` must not be used for this: it removes any of the
    given characters from both ends (``"gff.gz".strip(".gz") == "ff"``).
    """
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def dataset_name(filename, file_type):
    """Return *filename* with its *file_type* extension (and the dot) removed."""
    return remove_suffix(filename, file_type).rstrip('.')


def extract_gsaf_records(page):
    """Return the JSON records found in ``<!--gsafjson ... -->`` lines of *page*.

    *page* is the page text.  Only lines that, once stripped, start with the
    exact prefix and end with the comment terminator are decoded; other
    comments are ignored instead of being sliced into invalid JSON.
    """
    records = []
    for raw_line in page.split('\n'):
        line = raw_line.strip()
        if line.startswith(PREFIX) and line.endswith(SUFFIX):
            records.append(json.loads(line[len(PREFIX):-len(SUFFIX)].strip()))
    return records


def hash_and_count(path):
    """Return ``(md5_hexdigest, newline_count)`` for the file at *path*.

    The file is read once, in binary chunks, so large downloads are never
    held in memory.  The newline count is the number ``wc -l`` would print.
    """
    digest = hashlib.md5()
    newlines = 0
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(CHUNK_SIZE), b''):
            digest.update(chunk)
            newlines += chunk.count(b'\n')
    return digest.hexdigest(), newlines


def build_galaxy_record(data, dataset_id, line_count, stderr=''):
    """Build the dict written as one line of ``galaxy.json`` for *data*."""
    return {
        'name': dataset_name(data['filename'], data['file_type']),
        'stdout': None,
        'stderr': stderr,
        'line_count': int(line_count),
        # TODO, check that data is really .gz
        'ext': remove_suffix(data['file_type'], '.gz'),
        'dataset_id': dataset_id,
        'type': 'dataset',
    }


def ensure_output_dir():
    """Create ``output/``, tolerating only the case where it already exists."""
    try:
        os.makedirs(OUTPUT_DIR)
    except OSError:
        if not os.path.isdir(OUTPUT_DIR):
            raise


def main(argv=None):
    """Fetch the listing page, download each file and write ``galaxy.json``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit("Usage: %s URL DATASET_ID" % argv[0])
    url, dataset_id = argv[1], argv[2]

    logging.basicConfig(level=logging.DEBUG)
    ensure_output_dir()

    response = urlopen(url)
    try:
        # read() yields bytes on Python 3; decode once at the I/O boundary.
        page = response.read().decode('utf-8', 'replace')
    finally:
        response.close()

    with open(GALAXY_JSON, 'w') as gx_json:
        for data in extract_gsaf_records(page):
            log.info("Fetching %s", data['filename'])
            # The filename comes from the remote page: keep only its last
            # component so it cannot point outside the output directory.
            file_path = os.path.join(OUTPUT_DIR,
                                     os.path.basename(data['filename']))
            urlretrieve(data['url'], file_path)

            log.info("Hashing file")
            file_md5, line_count = hash_and_count(file_path)
            log.debug("Hashed to %s", file_md5)

            stderr = ''
            if file_md5 != data['md5']:
                stderr = 'md5sum mismatch: %s != %s' % (file_md5, data['md5'])

            # NOTE(review): the original carried a disabled `gunzip` step and
            # it is still not performed here, so line_count is taken over the
            # file as downloaded while 'ext' is reported without '.gz' --
            # confirm what Galaxy expects before enabling decompression.
            record = build_galaxy_record(data, dataset_id, line_count, stderr)
            gx_json.write(json.dumps(record) + "\n")


if __name__ == '__main__':
    main()