Mercurial > repos > iuc > data_manager_snpsift_dbnsfp
changeset 0:0e9e3bb5776a draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_snpsift_dbnsfp commit 5316af00b4a71a7b526cbc9540d5158749cc38e4
author | iuc |
---|---|
date | Tue, 07 Jun 2016 10:23:16 -0400 |
parents | |
children | d57ebdd39f0f |
files | data_manager/data_manager_snpsift_dbnsfp.py data_manager/data_manager_snpsift_dbnsfp.xml data_manager_conf.xml test-data/test_nsfp.data_manager_json test-data/test_nsfp.tsv tool-data/snpsift_dbnsfps.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 8 files changed, 302 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# File: data_manager/data_manager_snpsift_dbnsfp.py
"""Galaxy data manager that installs a dbNSFP database for SnpSift dbNSFP.

The database is either downloaded from the softgenetics FTP site or taken
from a history dataset, then block-gzipped, tabix-indexed and registered in
the ``snpsift_dbnsfps`` data table.

Manual equivalent of what this script automates::

    # Download dbNSFP database
    $ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
    # Uncompress
    $ unzip dbNSFPv2.4.zip
    # Create a single file version
    $ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
    # Compress using block-gzip algorithm
    $ bgzip dbNSFP2.4.txt
    # Create tabix index
    $ tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz

data_table:

    <table name="snpsift_dbnsfps" comment_char="#">
        <columns>key, build, name, value, annotations</columns>
        <file path="tool-data/snpsift_dbnsfps.loc" />
    </table>

#id build description path annotations
#GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.gz SIFT_pred,Uniprot_acc
#GRCh38_dbNSFP2.7 GRCh38 GRCh38 dbNSFP2.7 /depot/snpeff/dbNSFP2.7.gz SIFT_pred,Uniprot_acc
"""

import gzip
import json
import optparse
import os
import os.path
import re
import shutil
import sys
import zipfile

try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

# Name of the Galaxy data table that receives the new entry.
data_table = 'snpsift_dbnsfps'
softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
# Per-chromosome members of a dbNSFP / dbscSNV zip archive.
dbNSFP_file_pat = r'(dbNSFP(.*)_variant|dbscSNV(.*))\.chr(.*)'
# Splits a string into alternating (digits, non-digits) pairs.
tokenize = re.compile(r'(\d+)|(\D+)').findall
dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?'


def stop_err(msg):
    """Write *msg* to stderr and terminate the process with exit status 1."""
    sys.stderr.write(msg)
    sys.exit(1)


def get_nsfp_genome_version(name):
    """Guess the genome build from a dbNSFP/dbscSNV archive name.

    dbscSNV archives map to hg19. For dbNSFP archives the major version
    decides: 3 and later -> hg38, 2 -> hg19, anything else (including a
    missing version number) -> hg18. Names that match neither family
    return the default, hg19.
    """
    genome_version = 'hg19'
    name_pat = r'(dbscSNV|dbNSFP(v|_light)?)(\d*).*?'
    m = re.match(name_pat, name)
    if m:
        (base, _mid, ver) = m.groups()
        if base == 'dbscSNV':
            genome_version = 'hg19'
        else:
            major = int(ver) if ver else None
            if major is not None and major >= 3:
                genome_version = 'hg38'
            elif major == 2:
                genome_version = 'hg19'
            else:
                genome_version = 'hg18'
    return genome_version


def get_annotations(gzip_path):
    """Return the annotation column names of a (b)gzipped dbNSFP table.

    The names are taken from the first line of the file, skipping the first
    four columns, and are returned as one comma-separated string. Any read
    or decode failure, or an empty file, ends the program via ``stop_err``.
    """
    header = None
    try:
        with gzip.open(gzip_path, 'rb') as fh:
            # Read the whole first line: the header of a recent dbNSFP
            # release can be longer than any fixed-size buffer.
            header = fh.readline()
        if isinstance(header, bytes):
            header = header.decode('utf-8')
    except Exception as e:
        stop_err('Error Reading annotations %s : %s' % (gzip_path, e))
    if not header or not header.strip():
        stop_err('Error Reading annotations %s : empty header line' % gzip_path)
    headers = header.rstrip('\r\n').split('\t')
    return ','.join([x.strip() for x in headers[4:]])


def tabix_file(input_fname, output_fname):
    """Block-gzip *input_fname* into *output_fname* and tabix-index the result."""
    # Imported here so the pure-Python helpers of this module remain
    # importable on systems where pysam is not installed.
    from pysam import ctabix

    sys.stdout.write("tabix_file: %s -> %s\n" % (input_fname, output_fname))
    ctabix.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)


def natural_sortkey(string):
    """Return a sort key that orders embedded numbers numerically (chr2 < chr10)."""
    return tuple(int(num) if num else alpha for num, alpha in tokenize(string))


def download_dbnsfp_database(url, output_file):
    """Download a dbNSFP zip and merge its per-chromosome files into one table.

    The archive is saved as ``downloaded_file`` in the current directory.
    Members matching ``dbNSFP_file_pat`` are concatenated in natural order;
    the header line is kept only from the first member.

    Returns the path of the merged file (*output_file*, or ``dbnsfp_tsv``
    when *output_file* is empty).
    """
    file_path = 'downloaded_file'
    urlretrieve(url, file_path)
    dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
    with zipfile.ZipFile(file_path, 'r') as my_zip:
        members = [info.filename for info in my_zip.infolist()]
        files = sorted((f for f in members if re.match(dbNSFP_file_pat, f)),
                       key=natural_sortkey)
        if not files:
            stop_err('No dbNSFP or dbscSNV chromosome files found in %s' % url)
        with open(dbnsfp_tsv, 'wb') as wtr:
            for j, member in enumerate(files):
                with my_zip.open(member) as fh:
                    for i, line in enumerate(fh):
                        if j > 0 and i == 0:
                            # Header line: only keep the one of the first file.
                            continue
                        # Normalise line endings, and terminate an unterminated
                        # last line so the next file does not get glued to it.
                        wtr.write(line.rstrip(b'\r\n') + b'\n')
    return dbnsfp_tsv


def main():
    """Install one dbNSFP database as described by the command line.

    The single positional argument is the Galaxy data manager JSON file; it
    is read for the target directory and overwritten with the new data
    table entry.
    """
    # Parse Command Line
    parser = optparse.OptionParser(usage='%prog [options] data_manager_json')
    parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version')
    parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset')
    parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file')
    parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
    parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
    (options, args) = parser.parse_args()
    if not args:
        parser.error('Missing the path of the data manager JSON file')

    filename = args[0]
    with open(filename) as fh:
        params = json.load(fh)
    target_directory = params['output_data'][0]['extra_files_path']
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)
    genome_version = options.dbkey if options.dbkey else 'unknown'
    dbnsfp_tsv = None
    db_name = None
    bzip_path = None
    bgzip_name = None
    if options.softgenetics:
        dbnsfp_url = softgenetics_url + options.softgenetics
        db_name = options.db_name if options.db_name else re.sub(r'\.zip$', '', options.softgenetics)
        genome_version = get_nsfp_genome_version(options.softgenetics)
        tsv = db_name + '.tsv'
        dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv)
    elif options.dbnsfp_tabular:
        # Fall back to the dataset file name so the entry is never "None".
        db_name = options.db_name or os.path.splitext(os.path.basename(options.dbnsfp_tabular))[0]
        dbnsfp_tsv = options.dbnsfp_tabular
    elif options.snpsiftdbnsfp:
        # Already bgzipped and indexed: copy the data file and its index.
        (_dirpath, bgzip_name) = os.path.split(options.snpsiftdbnsfp)
        idxpath = options.snpsiftdbnsfp + '.tbi'
        shutil.copy(options.snpsiftdbnsfp, target_directory)
        shutil.copy(idxpath, target_directory)
        bzip_path = os.path.join(target_directory, bgzip_name)
        db_name = re.sub(r'(\.txt)?\.gz$', '', bgzip_name)
    else:
        stop_err('One of --softgenetics, --dbnsfp_tabular or --snpsiftdbnsfp is required')
    if dbnsfp_tsv:
        bgzip_name = '%s.txt.gz' % db_name
        bzip_path = os.path.join(target_directory, bgzip_name)
        tabix_file(dbnsfp_tsv, bzip_path)
    annotations = get_annotations(bzip_path)
    # Create the SnpSift dbNSFP Reference Data
    data_table_entry = dict(key='%s_%s' % (genome_version, db_name),
                            build=genome_version,
                            name='%s %s' % (genome_version, db_name),
                            value=bgzip_name,
                            annotations=annotations)
    data_manager_dict = {'data_tables': {data_table: [data_table_entry]}}

    # save info to json file
    with open(filename, 'w') as fh:
        json.dump(data_manager_dict, fh)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/data_manager_snpsift_dbnsfp.xml Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,89 @@ +<tool id="data_manager_snpsift_dbnsfp" name="SnpSift dbNSFP" version="4.1.0" tool_type="manage_data"> + <description>Install a dbNSFP variant annotation database</description> + <requirements> + <requirement type="package" version="0.7.7">pysam</requirement> + </requirements> + <stdio> + <exit_code range=":-1" level="fatal" description="Error: Cannot open file" /> + <exit_code range="1:" level="fatal" description="Error" /> + </stdio> + <command interpreter="python"> + #import re + data_manager_snpsift_dbnsfp.py + #if $db.src == 'softgenetics': + --softgenetics "$db.softgenetics_name" + #elif $db.src == 'history': + #if $db.snpsiftdbnsfp.ext == 'snpsiftdbnsfp': + #import os.path + --snpsiftdbnsfp "$os.path.join($db.snpsiftdbnsfp.extra_files_path, $db.snpsiftdbnsfp.metadata.bgzip)" + #else + --dbnsfp_tabular "$db.snpsiftdbnsfp" + #end if + --db_name "$db.db_name" + #if str($db.dbkey).strip() != '': + --dbkey "$db.dbkey" + #elif str($db.snpsiftdbnsfp.metadata.dbkey) != '?': + --dbkey "$db.snpsiftdbnsfp.metadata.dbkey" + #end if + #end if + "$out_file" + </command> + <inputs> + <conditional name="db"> + <param name="src" type="select" label="Source for dbNSFP file"> + <option value="softgenetics">Jpopgen dbNSFP from softgenetics</option> + <option value="history">from your history</option> + </param> + <when value="softgenetics"> + <param name="softgenetics_name" type="text" value="" label="dbNSFP file name at softgenetics ftp site"> + <help>Download From: ftp://dbnsfp.softgenetics.com/ + Enter the name of the database, e.g.: dbNSFPv3.0c.zip + </help> + <validator type="regex" message="A dbNSFP or dbscSNV .zip">(dbNSFP|dbscSNV).*[.]zip</validator> + </param> + </when> + <when value="history"> + <param name="snpsiftdbnsfp" type="data" format="snpsiftdbnsfp,dbnsfp.tabular" label="A snpsift dbnsfp from your history" + help="This 
can be generated by converting a tabular file set to type: dbnsfp.tabular"/> + <param name="db_name" type="text" value="" label="The unique name to give this dbnsfp database"> + <validator type="length" min="3" max="20" message="Must have between 3 and 20 characters"/> + <validator type="regex" message="No whitespace allowed">^\S*$</validator> + </param> + <param name="dbkey" type="text" value="hg19" optional="true" label="DBKEY to assign to this dbNSFP database" /> + </when> + </conditional> + </inputs> + + <outputs> + <data name="out_file" format="data_manager_json" label="${tool.name}"/> + </outputs> + <tests> + <test> + <param name="src" value="history"/> + <param name="snpsiftdbnsfp" value="test_nsfp.tsv" ftype="dbnsfp.tabular"/> + <param name="dbkey" value="hg19"/> + <param name="db_name" value="test_nsfp_tsv" /> + <output name="out_file" file="test_nsfp.data_manager_json"/> + </test> + </tests> + <help> + +This tool installs dbNSFP_ databases to annotate VCF files using SnpSift_dbNSFP_ +It populates data table: snpsift_dbnsfps + +.. _dbNSFP: https://sites.google.com/site/jpopgen/dbNSFP +.. _SnpSift_dbNSFP: http://snpeff.sourceforge.net/SnpSift.html#dbNSFP + +Please refer to https://sites.google.com/site/jpopgen/dbNSFP for which citations to use with specific dbNSFP database versions. + + </help> + <citations> + <citation type="doi">10.1002/humu.21517</citation> + <citation type="doi">10.1002/humu.22376</citation> + <citation type="doi">10.1002/humu.22932</citation> + <citation type="doi">10.1093/hmg/ddu733</citation> + <citation type="doi">10.1093/nar/gku1206</citation> + <citation type="doi">10.3389/fgene.2012.00035</citation> + </citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_snpsift_dbnsfp.xml" id="data_manager_snpsift_dbnsfp" > + <data_table name="snpsift_dbnsfps"> <!-- Defines a Data Table to be modified. --> + <output> <!-- Handle the output of the Data Manager Tool --> + <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="build" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="value" output_ref="out_file" > + <move type="directory" relativize_symlinks="True"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">snpSift/v4_1/dbnsfp</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/snpSift/v4_1/dbnsfp/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + <column name="annotations" /> <!-- columns that are going to be specified by the Data Manager Tool --> + </output> + </data_table> + </data_manager> +</data_managers> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_nsfp.data_manager_json Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"snpsift_dbnsfp": [{"value": "test_nsfp_tsv.txt.gz", "name": "? test_nsfp_tsv", "build": "?", "dbkey": "?_test_nsfp_tsv", "annotations": "hg18_pos(1-coor), genename, SIFT_score, SIFT_pred, Polyphen2_HDIV_score"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_nsfp.tsv Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,7 @@ +#chr pos(1-coor) ref alt hg18_pos(1-coor) genename SIFT_score SIFT_pred Polyphen2_HDIV_score +1 69134 A C 58997 OR4F5 0.03 D 0.043 +1 69134 A G 58997 OR4F5 0.09 T 0.0 +1 69134 A T 58997 OR4F5 0.03 D 0.308 +4 100239319 T A 100458342 ADH1B 0 D 0.021 +4 100239319 T C 100458342 ADH1B 0.15 T 0.0 +4 100239319 T G 100458342 ADH1B 0 D 0.0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/snpsift_dbnsfps.loc.sample Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,3 @@ +#key build description path annotations +#GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.txt.gz SIFT_pred,Uniprot_acc +#GRCh38_dbNSFP3.1c GRCh38 GRCh38 dbNSFP3.1c /depot/snpeff/dbNSFP3.1c.txt.gz SIFT_pred,Uniprot_acc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,7 @@ +<tables> + <table name="snpsift_dbnsfps" comment_char="#"> + <columns>key, build, name, value, annotations</columns> + <file path="tool-data/snpsift_dbnsfps.loc" /> + </table> +</tables> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Jun 07 10:23:16 2016 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="pysam" version="0.7.7"> + <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>