#!/usr/bin/env python
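# Fetch the files listed on a GSAF delivery page and stage them for Galaxy.
#
# Usage (script name illustrative):
#   python get_gsaf_data.py <delivery-page-url> <galaxy-dataset-id>
#
# The page embeds one "gsafjson" HTML comment per file; each file is
# downloaded into output/, md5-verified, decompressed, and described by one
# line of upload metadata in galaxy.json.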
import urllib
import urllib2
import sys
import subprocess
import json
import hashlib
import os
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()

# The output directory may already exist from a previous run.
try:
    os.makedirs("output")
except OSError:
    pass

URL = sys.argv[1]
DATASET_ID = sys.argv[2]

# Fetch the delivery page and keep only the HTML comment lines; the
# gsafjson payloads are embedded in comments.
req = urllib2.urlopen(URL)
page = req.read()
comments = [x.strip() for x in page.split('\n') if x.strip().startswith('<!--')]

# One line of Galaxy upload metadata is written per downloaded file.
gx_json = open('galaxy.json', 'w')

# Strip the '<!--gsafjson ... -->' wrapper to get at the bare JSON.
prefix = '<!--gsafjson'
suffix = '-->'
for gsafjson in [x[len(prefix):-len(suffix)].strip() for x in comments if 'gsafjson' in x]:
    # Example payload:
    # {"file_type":"fastq.gz",
    #  "JA":"****",
    #  "sample_name":"Sample10",
    #  "user_data":{"description":{}},
    #  "md5":"2ea00f4eef8f6a2344b80fbe12ab2eb7",
    #  "url":"http://gsaf.s3.amazonaws.com/****",
    #  "size_in_mb":45,
    #  "filename":"Sample10_S8_L001_R1_001.fastq.gz",
    #  "reads":["Sample10_S8_L001_R1_001.fastq.gz","Sample10_S8_L001_R2_001.fastq.gz"],
    #  "SA":"***"}
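    # Each payload should now be bare JSON; json.loads() raises ValueError
    # if the page embeds something unexpected.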
    data = json.loads(gsafjson)
    log.info("Fetching %s" % data['filename'])
    file_path = os.path.join('output', data['filename'])
    urllib.urlretrieve(data['url'], file_path)

    # Hash the file exactly as delivered (still gzipped), in binary mode,
    # so the digest is comparable to the md5 in the payload.
    log.info("Hashing file")
    file_md5 = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
    log.debug("Hashed to %s" % file_md5)
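    # Note: this reads the whole file into memory; for very large FASTQs a
    # chunked digest is safer, e.g.:
    #   md5 = hashlib.md5()
    #   with open(file_path, 'rb') as fh:
    #       for chunk in iter(lambda: fh.read(1 << 20), ''):
    #           md5.update(chunk)
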
    # On a checksum mismatch, record the error for Galaxy instead of
    # aborting the remaining downloads.
    stderr = ''
    if file_md5 != data['md5']:
        stderr = 'md5sum mismatch: %s != %s' % (file_md5, data['md5'])

    # Decompress before counting lines so line_count describes the data
    # rather than the gzip container.
    # TODO: check that the file really is gzip-compressed
    try:
        subprocess.check_call(['gunzip', file_path])
        file_path = file_path[:-len('.gz')]
    except subprocess.CalledProcessError:
        log.error("Couldn't extract %s" % data['filename'])

    line_count = int(subprocess.check_output(['wc', '-l', file_path]).split()[0])

    # str.strip() removes a *character set*, not a suffix, so trim the
    # extension and the trailing '.gz' off the end explicitly.
    name = data['filename']
    if name.endswith(data['file_type']):
        name = name[:-len(data['file_type'])].rstrip('.')
    ext = data['file_type']
    if ext.endswith('.gz'):
        ext = ext[:-len('.gz')]

    # One metadata line per dataset, in Galaxy's galaxy.json format, e.g.:
    # {"name": "lambda.fa", "stdout": "uploaded fasta file", "line_count": 811, "ext": "fasta", "dataset_id": 16220, "type": "dataset"}
    galaxy_json = {
        'name': name,
        'stdout': None,
        'stderr': stderr,
        'line_count': line_count,
        'ext': ext,
        'dataset_id': DATASET_ID,
        'type': 'dataset'
    }
    gx_json.write(json.dumps(galaxy_json) + "\n")

gx_json.close()
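# Example run (URL and dataset id are placeholders):
#   python get_gsaf_data.py 'https://gsaf.example/delivery.html' 16220
# leaves the decompressed reads in output/ and one metadata line per file
# in galaxy.json.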