diff get_sequences.py @ 1:e5dd4bd78bbc draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/Ensembl-REST commit aaf8d501c3a92ed415fdf9293a65468c72aae984-dirty
author earlhaminst
date Mon, 12 Dec 2016 07:47:42 -0500
parents
children 4b7261f484bb
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_sequences.py	Mon Dec 12 07:47:42 2016 -0500
@@ -0,0 +1,83 @@
+# A simple tool to connect to the Ensembl server and retrieve sequences using
+# the Ensembl REST API.
+#
+# Reads Ensembl IDs (one per line) from the file given with -i/--input, POSTs
+# them to the sequence/id endpoint in batches and prints each response body to
+# standard output.
+from __future__ import print_function
+
+import json
+import optparse
+from itertools import islice
+
+import requests
+
+try:
+    from urllib.parse import urljoin  # Python 3
+except ImportError:
+    from urlparse import urljoin  # Python 2
+
+# Maximum number of IDs per POST request (limit imposed by Ensembl).
+MAX_IDS_PER_REQUEST = 50
+
+
+def parse_options():
+    """Parse the command line and return the optparse options object.
+
+    Exits with a usage error when -i/--input is not given.
+    """
+    parser = optparse.OptionParser()
+    parser.add_option('-i', '--input', help='List of Ensembl IDs')
+    parser.add_option('-s', '--species', type='choice',
+                      choices=['ensembl', 'ensemblgenomes'], default='ensembl',
+                      help='Specify the genome databases for vertebrates and other eukaryotic species')
+    parser.add_option('-t', '--type', type='choice',
+                      choices=['genomic', 'cds', 'cdna', 'protein'],
+                      default='genomic', help='Type of sequence')
+    parser.add_option('--expand_3prime', type='int', default=0,
+                      help='Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type')
+    parser.add_option('--expand_5prime', type='int', default=0,
+                      help='Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type')
+    options, _ = parser.parse_args()
+    if options.input is None:
+        parser.error('-i option must be specified')
+    return options
+
+
+def iter_id_chunks(lines, size):
+    """Yield lists of at most `size` stripped, non-empty IDs read from `lines`."""
+    # Blank lines are dropped before chunking so they are not sent to the
+    # server as empty IDs, and so that a run of blank lines cannot produce an
+    # empty chunk that would end the loop early.
+    stripped = (line.strip() for line in lines)
+    non_empty = (entry for entry in stripped if entry)
+    while True:
+        chunk = list(islice(non_empty, size))
+        if not chunk:
+            return
+        yield chunk
+
+
+def main():
+    """Fetch the sequences for all IDs in the input file and print them."""
+    options = parse_options()
+
+    url = urljoin('http://rest.%s.org' % options.species, 'sequence/id')
+    # NOTE(review): the request body is JSON, yet Content-Type is declared as
+    # 'text/x-fasta'. The Ensembl REST examples for POST sequence/id appear to
+    # use 'application/json' - confirm before changing the value sent here.
+    headers = {'Content-Type': 'text/x-fasta', 'Accept': 'text/x-fasta'}
+    params = {k: getattr(options, k)
+              for k in ('type', 'expand_3prime', 'expand_5prime')}
+
+    with open(options.input) as f:
+        for ids in iter_id_chunks(f, MAX_IDS_PER_REQUEST):
+            r = requests.post(url, params=params, headers=headers,
+                              data=json.dumps({'ids': ids}))
+            # Raises requests.HTTPError on a 4xx/5xx response.
+            r.raise_for_status()
+            print(r.text)
+
+
+if __name__ == '__main__':
+    main()