Mercurial > repos > saket-choudhary > vep_rest
changeset 1:3645d1bcc7bb draft default tip
Uploaded
author | saket-choudhary |
---|---|
date | Sat, 18 Oct 2014 04:03:13 -0400 |
parents | de145ceb3ac0 |
children | |
files | tool_dependencies.xml vep_rest/test-data/vep_input.vcf vep_rest/test-data/vep_output.txt vep_rest/tool_dependencies.xml vep_rest/vep_rest.py vep_rest/vep_rest.xml |
diffstat | 6 files changed, 227 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/tool_dependencies.xml Sat Oct 18 03:46:53 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -<?xml version='1.0' encoding='utf-8'?> -<tool_dependency> - <package name="pyvcf" version="0.6.7"> - <readme> - This Galaxy Tool shed package installs pyvcf(https://pypi.python.org/pypi/PyVCF) - package. - - The corresponding PYTHONPATH is accessible via PYTHONPATH_PYVCF - - Developmental version is hosted on Github: https://github.com/saketkc/galaxy_tools/packages/package_pyvcf_0_6_7/ - </readme> - <install version="1.0"> - <actions> - <action type="download_by_url">https://pypi.python.org/packages/source/P/PyVCF/PyVCF-0.6.7.tar.gz</action> - <action type="make_directory">$INSTALL_DIR/lib/python</action> - <action type="shell_command"> - export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && - python setup.py install --install-lib $INSTALL_DIR/lib/python - </action> - <action type="set_environment"> - <environment_variable action="append_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable> - <environment_variable action="set_to" name="PYTHONPATH_PYVCF">$INSTALL_DIR/lib/python</environment_variable> - </action> - </actions> - </install> - </package> -</tool_dependency> - - - -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vep_rest/test-data/vep_input.vcf Sat Oct 18 04:03:13 2014 -0400 @@ -0,0 +1,40 @@ +##fileformat=VCFv4.1 +##source=COSMICv70 +##reference=GRCh37 +##fileDate=20140805 +##comment="Missing nucleotide details indicate ambiguity during curation process" +##comment="URL stub for COSM ID field (use numeric portion of ID)='http://cancer.sanger.ac.uk/cosmic/mutation/overview?id='" +##comment="REF and ALT sequences are both forward strand +##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name"> +##INFO=<ID=STRAND,Number=1,Type=String,Description="Gene strand"> +##INFO=<ID=CDS,Number=1,Type=String,Description="CDS annotation"> +##INFO=<ID=AA,Number=1,Type=String,Description="Peptide annotation"> +##INFO=<ID=CNT,Number=1,Type=Integer,Description="How many samples have this mutation"> +#CHROM POS ID REF ALT QUAL FILTER INFO +1 69345 COSM911918 C A . . GENE=OR4F5;STRAND=+;CDS=c.255C>A;AA=p.I85I;CNT=1 +1 69523 COSM426644 G T . . GENE=OR4F5;STRAND=+;CDS=c.433G>T;AA=p.G145C;CNT=1 +1 69538 COSM75742 G A . . GENE=OR4F5;STRAND=+;CDS=c.448G>A;AA=p.V150M;CNT=1 +1 69539 COSM1343690 T C . . GENE=OR4F5;STRAND=+;CDS=c.449T>C;AA=p.V150A;CNT=1 +1 69540 COSM1560546 G T . . GENE=OR4F5;STRAND=+;CDS=c.450G>T;AA=p.V150V;CNT=1 +1 69569 COSM1599955 T C . . GENE=OR4F5;STRAND=+;CDS=c.479T>C;AA=p.L160P;CNT=2 +1 69591 COSM3419425 C T . . GENE=OR4F5;STRAND=+;CDS=c.501C>T;AA=p.V167V;CNT=1 +1 861390 COSM460103 G C . . GENE=SAMD11;STRAND=+;CDS=c.69G>C;AA=p.P23P;CNT=1 +1 865609 COSM336143 C T . . GENE=SAMD11;STRAND=+;CDS=c.147C>T;AA=p.P49P;CNT=1 +1 865617 COSM3790304 C G . . GENE=SAMD11;STRAND=+;CDS=c.155C>G;AA=p.S52C;CNT=1 +1 865624 COSM912740 C T . . GENE=SAMD11;STRAND=+;CDS=c.162C>T;AA=p.S54S;CNT=1 +1 865658 COSM364168 G T . . GENE=SAMD11;STRAND=+;CDS=c.196G>T;AA=p.G66W;CNT=1 +1 865691 COSM1686856 C T . . GENE=SAMD11;STRAND=+;CDS=c.229C>T;AA=p.P77S;CNT=1 +1 865716 COSM1735520 G A . . 
GENE=SAMD11;STRAND=+;CDS=c.254G>A;AA=p.R85K;CNT=1 +1 866438 COSM3386379 G A . . GENE=SAMD11;STRAND=+;CDS=c.274G>A;AA=p.V92M;CNT=1 +1 871165 COSM3711402 C A . . GENE=SAMD11;STRAND=+;CDS=c.319C>A;AA=p.L107I;CNT=1 +1 871217 COSM3667588 A C . . GENE=SAMD11;STRAND=+;CDS=c.371A>C;AA=p.E124A;CNT=1 +1 871255 COSM414754 G A . . GENE=SAMD11;STRAND=+;CDS=c.409G>A;AA=p.E137K;CNT=1 +1 874447 COSM178082 G A . . GENE=SAMD11;STRAND=+;CDS=c.458G>A;AA=p.R153H;CNT=1 +1 874456 COSM178083 G C . . GENE=SAMD11;STRAND=+;CDS=c.467G>C;AA=p.R156P;CNT=1 +1 874465 COSM112049 G GC . . GENE=SAMD11;STRAND=+;CDS=c.476_477insC;AA=p.D160fs*47;CNT=1 +1 874497 COSM912847 G A . . GENE=SAMD11;STRAND=+;CDS=c.508G>A;AA=p.E170K;CNT=1 +1 874501 COSM912848 C T . . GENE=SAMD11;STRAND=+;CDS=c.512C>T;AA=p.S171L;CNT=1 +1 874504 COSM1659453 C G . . GENE=SAMD11;STRAND=+;CDS=c.515C>G;AA=p.P172R;CNT=1 +1 874778 COSM1344642 GCCTCCCCAGCCACGGTGAGGACCCACCCTGGCATGATCCCCCTCATCA G . . GENE=SAMD11;STRAND=+;CDS=c.645_692del48;AA=p.G220_H235del16;CNT=2 +1 874781 COSM1344643 T TC . . GENE=SAMD11;STRAND=+;CDS=c.647_648insC;AA=p.S218fs*4;CNT=1 +1 874816 COSM1344644 C CT . . GENE=SAMD11;STRAND=+;CDS=c.682_683insT;AA=p.P228fs*227;CNT=3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vep_rest/test-data/vep_output.txt Sat Oct 18 04:03:13 2014 -0400 @@ -0,0 +1,5 @@ +ENSP00000393181 S52C,G66W +ENSP00000471152 G45R,R42T,A40T +ENSP00000411579 S52C,G66W +ENSP00000342313 S52C,G66W +ENSP00000334393 G145C,V150M,V150A,L160P
<?xml version="1.0"?>
<!--
  vep_rest/tool_dependencies.xml

  Tool Shed dependency definition for the vep_rest tool. Each third-party
  Python package is declared as its own <package> element whose name/version
  match the <requirement> entries of the tool wrapper, and each one points at
  the Tool Shed repository that installs it.
-->
<tool_dependency>
    <package name="requests" version="2.2.1">
        <repository changeset_revision="04c9eef6c14b" name="package_requests_2_2_1" owner="saket-choudhary" toolshed="http://toolshed.g2.bx.psu.edu" />
    </package>
    <!-- PyVCF is a separate package (0.6.7); it must not be nested under "requests". -->
    <package name="pyvcf" version="0.6.7">
        <repository changeset_revision="c05e29a21f10" name="package_pyvcf_0_6_7" owner="saket-choudhary" toolshed="http://toolshed.g2.bx.psu.edu" />
    </package>
</tool_dependency>
#!/usr/bin/env python
"""
Script to interact with the Ensembl Variant Effect Predictor (VEP)
web service (vep_rest/vep_rest.py).


The MIT License (MIT)

Copyright (c) 2014 Saket Choudhary<saketkc@gmail.com, skchoudh@usc.edu>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

"""
import argparse
import sys
import time

import requests
import vcf

# VEP "region" endpoint of the GRCh37 Ensembl REST server.
# Placeholders: chromosome, start, end, alternate allele.
URL = 'http://grch37.rest.ensembl.org/vep/human/region/{}:{}-{}/{}?content-type=application/json&protein=1'

# Delay (seconds) used when a Retry-After header is present but is not a
# plain number (the header may also carry an HTTP date).
DEFAULT_RETRY_DELAY = 1.0


class VEPRestClient(object):
    """Query VEP for every variant of a VCF file and report protein substitutions.

    Attributes:
        pending_urls: list of ``(key, url)`` tuples, one per alternate allele
            of every VCF record; ``key`` is ``CHROM:POS-POS-ALT``.
        output_file: path of the text file written by :meth:`submit`.
    """

    def __init__(self, input_file, output_file):
        """Read ``input_file`` (VCF) and prepare one VEP request per ALT allele.

        Args:
            input_file: path to the input VCF file.
            output_file: path where :meth:`submit` writes its report.
        """
        self.pending_urls = []
        self.output_file = output_file
        # Context manager so the VCF handle is closed once parsing is done.
        with open(input_file, 'r') as handle:
            for record in vcf.Reader(handle):
                # The endpoint takes a single allele, so a multi-allelic
                # record is queried once per ALT instead of concatenating
                # all alleles into one bogus sequence.
                for allele in record.ALT:
                    if allele is None:
                        # ALT of "." carries no alternate allele to query.
                        continue
                    allele = str(allele)
                    url = URL.format(record.CHROM, record.POS, record.POS, allele)
                    key = "{}:{}-{}-{}".format(record.CHROM, record.POS, record.POS, allele)
                    self.pending_urls.append((key, url))

    def submit(self):
        """Run all pending requests and write the collected substitutions.

        The output file gets one line per protein that has at least one
        substitution: ``<protein id> <comma separated substitutions>``.
        """
        protein_variants = {}
        for _key, url in self.pending_urls:
            response = self._fetch(url)
            if not response:
                continue
            self._collect(response, protein_variants)
        self._write(protein_variants)

    @staticmethod
    def _fetch(url):
        """Return the first element of the JSON reply for ``url``, or None on failure."""
        reply = requests.get(url)
        retry_after = reply.headers.get('Retry-After')
        if retry_after:
            # Header values are strings; time.sleep() needs a number.
            try:
                delay = float(retry_after)
            except ValueError:
                delay = DEFAULT_RETRY_DELAY
            time.sleep(delay)
            reply = requests.get(url)
        try:
            return reply.json()[0]
        except Exception as error:
            # TODO Better error handling
            # Best effort: report and skip this variant. Reported on stdout
            # (as before) so a single bad reply does not fail the whole run.
            print(error)
            return None

    @staticmethod
    def _collect(response, protein_variants):
        """Add the substitutions found in one VEP ``response`` to ``protein_variants``.

        ``protein_variants`` maps an Ensembl protein id (``ENSP...``) to the
        list of substitutions seen so far and is updated in place.
        """
        if not isinstance(response, dict):
            return
        # Replies for variants outside any transcript have no
        # 'transcript_consequences' entry; treat that as "nothing to report".
        for consequence in response.get('transcript_consequences', []):
            protein_id = consequence.get('protein_id')
            if not protein_id or not protein_id.startswith('ENSP'):
                continue
            substitutions = protein_variants.setdefault(protein_id, [])
            protein_start = consequence.get('protein_start')
            amino_acids = consequence.get('amino_acids')
            if not protein_start or not amino_acids:
                continue
            residues = amino_acids.split("/")
            if len(residues) != 2:
                # Only "original/substituted" pairs describe a substitution
                # (a synonymous change lists a single residue).
                continue
            substitution = residues[0] + str(protein_start) + residues[1]
            if "X" not in substitution:
                substitutions.append(substitution)

    def _write(self, protein_variants):
        """Write one ``<protein id> <substitutions>`` line per non-empty entry."""
        lines = [
            "{} {}\n".format(protein_id, ",".join(substitutions))
            for protein_id, substitutions in protein_variants.items()
            if substitutions
        ]
        with open(self.output_file, 'w') as out:
            out.writelines(lines)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True, help="Input file location")
    parser.add_argument("--output_file", type=str, required=True, help="Output file location")
    args = parser.parse_args(sys.argv[1:])
    vep = VEPRestClient(args.input_file, args.output_file)
    vep.submit()
<!--
  vep_rest/vep_rest.xml

  Galaxy tool wrapper for vep_rest.py: takes a VCF dataset, queries the
  Ensembl VEP REST service (GRCh37) and produces a text dataset listing the
  amino-acid substitutions found per protein.
-->
<tool id="vep_rest" name="VEP Rest">
    <description>VEP Web Service</description>
    <requirements>
        <requirement type="package" version="2.2.1">requests</requirement>
        <requirement type="python-module">requests</requirement>
        <!-- PyVCF is packaged as pyvcf 0.6.7 (2.2.1 is the requests version). -->
        <requirement type="package" version="0.6.7">pyvcf</requirement>
        <requirement type="python-module">pyvcf</requirement>
    </requirements>
    <!-- Dataset paths are quoted so that unusual characters cannot split the arguments. -->
    <command interpreter="python">
        vep_rest.py --input_file "$input" --output_file "$output"
    </command>
    <inputs>
        <param name="input" format="vcf" type="data" label="Input variants" />
    </inputs>
    <outputs>
        <data name="output" format="txt"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="vep_input.vcf"/>
            <output name="output" file="vep_output.txt"/>
        </test>
    </tests>
    <help>

**What it does**

This script calls the VEP REST web service for GRCh37 (http://grch37.rest.ensembl.org/) to fetch
consequences of variations in the proteins ONLY. Variations in transcripts are IGNORED.

Input is a VCF file. [http://samtools.github.io/hts-specs/VCFv4.2.pdf]

Output is a text file with each line beginning with the protein identifier, followed by a space
and the comma separated substitutions.
Example::

    ENSP00000393181 S52C,G66W,P77S,R85K,V92M,L107I
    ENSP00000471152 G45R,R42T,A40T,G19E,L11F,T3M
    ENSP00000411579 S52C,G66W,P77S,R85K,V92M,L107I,E124A,E137K,R153H,R156P,E170K,S171L,P172R
    ENSP00000349216 R9K,V16M,L31I,E48A,E61K,R77H,R80P,E94K,S95L,P96R
    ENSP00000342313 S52C,G66W,P77S,R85K,V92M,L107I,E124A,E137K,R153H,R156P,E170K,S171L,P172R


**Citations**

If you use this tool in Galaxy, please cite :
McLaren W, Pritchard B, Rios D, Chen Y, Flicek P, Cunningham F.
Deriving the consequences of genomic variants with the Ensembl API and SNP Effect Predictor.
Bioinformatics 26(16):2069-70(2010)
doi:10.1093/bioinformatics/btq330

    </help>
</tool>