Mercurial > repos > galaxyp > retrieve_ensembl_bed
view ensembl_rest.py @ 1:9c4a48f5d4e7 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 6babd357845126292cb202aaea0f70ff68819525"
author | galaxyp |
---|---|
date | Mon, 07 Oct 2019 16:14:39 -0400 |
parents | da1b538b87e5 |
children |
line wrap: on
line source
#!/usr/bin/env python """ # #------------------------------------------------------------------------------ # University of Minnesota # Copyright 2017, Regents of the University of Minnesota #------------------------------------------------------------------------------ # Author: # # James E Johnson # #------------------------------------------------------------------------------ """ from __future__ import print_function from __future__ import unicode_literals import sys from time import sleep import requests server = "https://rest.ensembl.org" ext = "/info/assembly/homo_sapiens?" max_region = 4000000 debug = False def ensembl_rest(ext, headers): if debug: print("%s" % ext, file=sys.stderr) r = requests.get(server+ext, headers=headers) if r.status_code == 429: print("response headers: %s\n" % r.headers, file=sys.stderr) if 'Retry-After' in r.headers: sleep(r.headers['Retry-After']) r = requests.get(server+ext, headers=headers) if not r.ok: r.raise_for_status() return r def get_species(): results = dict() ext = "/info/species" req_header = {"Content-Type": "application/json"} r = ensembl_rest(ext, req_header) for species in r.json()['species']: results[species['name']] = species print("%s\t%s\t%s\t%s\t%s" % (species['name'], species['common_name'], species['display_name'], species['strain'], species['taxon_id']), file=sys.stdout) return results def get_biotypes(species): biotypes = [] ext = "/info/biotypes/%s?" % species req_header = {"Content-Type": "application/json"} r = ensembl_rest(ext, req_header) for entry in r.json(): if 'biotype' in entry: biotypes.append(entry['biotype']) return biotypes def get_toplevel(species): coord_systems = dict() ext = "/info/assembly/%s?" % species req_header = {"Content-Type": "application/json"} r = ensembl_rest(ext, req_header) toplevel = r.json() for seq in toplevel['top_level_region']: if seq['coord_system'] not in coord_systems: coord_systems[seq['coord_system']] = dict() coord_system = coord_systems[seq['coord_system']] coord_system[seq['name']] = int(seq['length']) return coord_systems def get_transcripts_bed(species, refseq, start, length, strand='', params=None): bed = [] param = params if params else '' req_header = {"Content-Type": "text/x-bed"} regions = list(range(start, length, max_region)) if not regions or regions[-1] < length: regions.append(length) for end in regions[1:]: ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\ % (species, refseq, start, end, strand, param) start = end + 1 r = ensembl_rest(ext, req_header) if r.text: bed += r.text.splitlines() return bed def get_seq(id, seqtype, params=None): param = params if params else '' ext = "/sequence/id/%s?type=%s;%s" % (id, seqtype, param) req_header = {"Content-Type": "text/plain"} r = ensembl_rest(ext, req_header) return r.text def get_cdna(id, params=None): return get_seq(id, 'cdna', params=params) def get_cds(id, params=None): return get_seq(id, 'cds', params=params) def get_genomic(id, params=None): return get_seq(id, 'genomic', params=params) def get_transcript_haplotypes(species, transcript): ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1"\ % (species, transcript) req_header = {"Content-Type": "application/json"} r = ensembl_rest(ext, req_header) decoded = r.json() return decoded