# HG changeset patch
# User in_silico
# Date 1528819588 14400
# Node ID 6b607ebcd67463740106f2fc76078486c30bf2e9
# Parent 18982667bd1026cbbd0df5dec7eeb00b4f631535
Uploaded
diff -r 18982667bd10 -r 6b607ebcd674 cravat_convert/base_converter.py
--- a/cravat_convert/base_converter.py Tue Jun 12 12:06:11 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,22 +0,0 @@
-class BaseConverter(object):
- def __init__(self):
- self.format_name = None
- def check_format(self,*args,**kwargs):
- err_msg = 'Converter for %s format has no method check_format' %\
- self.format_name
- raise NotImplementedError(err_msg)
- def setup(self,*args,**kwargs):
- err_msg = 'Converter for %s format has no method setup' %\
- self.format_name
- raise NotImplementedError(err_msg)
- def convert_line(self,*args,**kwargs):
- err_msg = 'Converter for %s format has no method convert_line' %\
- self.format_name
- raise NotImplementedError(err_msg)
-
-
-class BadFormatError(Exception):
- def __init__(self, message, errors=None):
- super(BadFormatError, self).__init__(message)
- # Support for custom error codes, if added later
- self.errors = errors
\ No newline at end of file
diff -r 18982667bd10 -r 6b607ebcd674 cravat_convert/cravat_convert.py
--- a/cravat_convert/cravat_convert.py Tue Jun 12 12:06:11 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-'''
-Convert a VCF format file to Cravat format file
-'''
-
-import os
-import argparse
-from vcf_converter import CravatConverter
-
-# File read/write configuration variables
-vcf_sep = '\t'
-cr_sep = '\t'
-cr_newline = '\n'
-
-# VCF Headers mapped to their index position in a row of VCF values
-vcf_mapping = {
- 'CHROM': 0,
- 'POS': 1,
- 'ID': 2,
- 'REF': 3,
- 'ALT': 4,
- 'QUAL': 5,
- 'FILTER': 6,
- 'INFO': 7,
- 'FORMAT': 8,
- 'NA00001': 9,
- 'NA00002': 10,
- 'NA00003': 11
-}
-
-
-def get_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('--input',
- '-i',
- required = True,
- help='Input path to a VCF file for conversion',)
- parser.add_argument('--output',
- '-o',
- default = os.path.join(os.getcwd(), "cravat_converted.txt"),
- help = 'Output path to write the cravat file to')
- return parser.parse_args()
-
-
-def convert(in_path, out_path=None):
- if not out_path:
- base, _ = os.path.split(in_path)
- out_path = os.path.join(base, "cravat_converted.txt")
-
- with open(in_path, 'r') as in_file, \
- open(out_path, 'w') as out_file:
-
- # cr_count will be used to generate the 'TR' field of the cravat rows (first header)
- cr_count = 0
- # VCF lines are always assumed to be '+' strand, as VCF doesn't specify that attribute
- strand = '+'
- # VCF converter. Adjusts position, reference, and alternate for Cravat formatting.
- converter = CravatConverter()
-
- for line in in_file:
- if line.startswith("#"):
- continue
- line = line.strip().split(vcf_sep)
- # row is dict of VCF headers mapped to corresponding values of this line
- row = { header: line[index] for header, index in vcf_mapping.items() }
- for alt in row["ALT"].split(","):
- new_pos, new_ref, new_alt = converter.extract_vcf_variant(strand, row["POS"], row["REF"], alt)
- new_pos, new_ref, new_alt = str(new_pos), str(new_ref), str(new_alt)
- cr_line = cr_sep.join([
- 'TR' + str(cr_count), row['CHROM'], new_pos, strand, new_ref, new_alt, row['ID']
- ])
- out_file.write(cr_line + cr_newline)
- cr_count += 1
-
-
-if __name__ == "__main__":
- cli_args = get_args()
- convert(cli_args.input, cli_args.output)
diff -r 18982667bd10 -r 6b607ebcd674 cravat_convert/cravat_convert.xml
--- a/cravat_convert/cravat_convert.xml Tue Jun 12 12:06:11 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-
- Converts a VCF format file to a Cravat format file
- cravat_convert.py -i $input -o $output
-
-
-
-
-
-
-
-
-
-
-
-
- Converts a VCF format file to a Cravat format file
-
-
-
-
diff -r 18982667bd10 -r 6b607ebcd674 cravat_convert/vcf_converter.py
--- a/cravat_convert/vcf_converter.py Tue Jun 12 12:06:11 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,243 +0,0 @@
-"""
-A module originally obtained from the cravat package. Modified to use in the vcf
-converter galaxy tool.
-
-
-Register of changes made (Chris Jacoby):
- 1) Changed imports as galaxy tool won't have access to complete cravat python package
- 2) Defined BadFormatError in BaseConverted file, as I didn't have the BadFormatError module
-"""
-
-from base_converter import BaseConverter, BadFormatError
-import re
-
-class CravatConverter(BaseConverter):
-
- def __init__(self):
- self.format_name = 'vcf'
- self.samples = []
- self.var_counter = 0
- self.addl_cols = [{'name':'phred',
- 'title':'Phred',
- 'type':'string'},
- {'name':'filter',
- 'title':'VCF filter',
- 'type':'string'},
- {'name':'zygosity',
- 'title':'Zygosity',
- 'type':'string'},
- {'name':'alt_reads',
- 'title':'Alternate reads',
- 'type':'int'},
- {'name':'tot_reads',
- 'title':'Total reads',
- 'type':'int'},
- {'name':'af',
- 'title':'Variant allele frequency',
- 'type':'float'}]
-
- def check_format(self, f):
- return f.readline().startswith('##fileformat=VCF')
-
- def setup(self, f):
-
- vcf_line_no = 0
- for line in f:
- vcf_line_no += 1
- if len(line) < 6:
- continue
- if line[:6] == '#CHROM':
- toks = re.split('\s+', line.rstrip())
- if len(toks) > 8:
- self.samples = toks[9:]
- break
-
- def convert_line(self, l):
- if l.startswith('#'): return None
- self.var_counter += 1
- toks = l.strip('\r\n').split('\t')
- all_wdicts = []
- if len(toks) < 8:
- raise BadFormatError('Wrong VCF format')
- [chrom, pos, tag, ref, alts, qual, filter, info] = toks[:8]
- if tag == '':
- raise BadFormatError('ID column is blank')
- elif tag == '.':
- tag = 'VAR' + str(self.var_counter)
- if chrom[:3] != 'chr':
- chrom = 'chr' + chrom
- alts = alts.split(',')
- len_alts = len(alts)
- if len(toks) == 8:
- for altno in range(len_alts):
- wdict = None
- alt = alts[altno]
- newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
- wdict = {'tags':tag,
- 'chrom':chrom,
- 'pos':newpos,
- 'ref_base':newref,
- 'alt_base':newalt,
- 'sample_id':'no_sample',
- 'phred': qual,
- 'filter': filter}
- all_wdicts.append(wdict)
- elif len(toks) > 8:
- sample_datas = toks[9:]
- genotype_fields = {}
- genotype_field_no = 0
- for genotype_field in toks[8].split(':'):
- genotype_fields[genotype_field] = genotype_field_no
- genotype_field_no += 1
- if not ('GT' in genotype_fields):
- raise BadFormatError('No GT Field')
- gt_field_no = genotype_fields['GT']
- for sample_no in range(len(sample_datas)):
- sample = self.samples[sample_no]
- sample_data = sample_datas[sample_no].split(':')
- gts = {}
- for gt in sample_data[gt_field_no].replace('/', '|').split('|'):
- if gt == '.':
- continue
- else:
- gts[int(gt)] = True
- for gt in sorted(gts.keys()):
- wdict = None
- if gt == 0:
- continue
- else:
- alt = alts[gt - 1]
- newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
- zyg = self.homo_hetro(sample_data[gt_field_no])
- depth, alt_reads, af = self.extract_read_info(sample_data, gt, gts, genotype_fields)
-
- wdict = {'tags':tag,
- 'chrom':chrom,
- 'pos':newpos,
- 'ref_base':newref,
- 'alt_base':newalt,
- 'sample_id':sample,
- 'phred': qual,
- 'filter': filter,
- 'zygosity': zyg,
- 'tot_reads': depth,
- 'alt_reads': alt_reads,
- 'af': af,
- }
- all_wdicts.append(wdict)
- return all_wdicts
-
- #The vcf genotype string has a call for each allele separated by '\' or '/'
- #If the call is the same for all allels, return 'hom' otherwise 'het'
- def homo_hetro(self, gt_str):
- if '.' in gt_str:
- return '';
-
- gts = gt_str.strip().replace('/', '|').split('|')
- for gt in gts:
- if gt != gts[0]:
- return 'het'
- return 'hom'
-
- #Extract read depth, allele count, and allele frequency from optional VCR information
- def extract_read_info (self, sample_data, gt, gts, genotype_fields):
- depth = ''
- alt_reads = ''
- ref_reads = ''
- af = ''
-
- #AD contains 2 values usually ref count and alt count unless there are
- #multiple alts then it will have alt 1 then alt 2.
- if 'AD' in genotype_fields and genotype_fields['AD'] <= len(sample_data):
- if 0 in gts.keys():
- #if part of the genotype is reference, then AD will have #ref reads, #alt reads
- ref_reads = sample_data[genotype_fields['AD']].split(',')[0]
- alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
- elif gt == max(gts.keys()):
- #if geontype has multiple alt bases, then AD will have #alt1 reads, #alt2 reads
- alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
- else:
- alt_reads = sample_data[genotype_fields['AD']].split(',')[0]
-
- if 'DP' in genotype_fields and genotype_fields['DP'] <= len(sample_data):
- depth = sample_data[genotype_fields['DP']]
- elif alt_reads != '' and ref_reads != '':
- #if DP is not present but we have alt and ref reads count, dp = ref+alt
- depth = int(alt_reads) + int(ref_reads)
-
- if 'AF' in genotype_fields and genotype_fields['AF'] <= len(sample_data):
- af = float(sample_data[genotype_fields['AF']] )
- elif depth != '' and alt_reads != '':
- #if AF not specified, calc it from alt and ref reads
- af = float(alt_reads) / float(depth)
-
- return depth, alt_reads, af
-
- def extract_vcf_variant (self, strand, pos, ref, alt):
-
- reflen = len(ref)
- altlen = len(alt)
-
- # Returns without change if same single nucleotide for ref and alt.
- if reflen == 1 and altlen == 1 and ref == alt:
- return pos, ref, alt
-
- # Trimming from the start and then the end of the sequence
- # where the sequences overlap with the same nucleotides
- new_ref2, new_alt2, new_pos = \
- self.trimming_vcf_input(ref, alt, pos, strand)
-
- if new_ref2 == '':
- new_ref2 = '-'
- if new_alt2 == '':
- new_alt2 = '-'
-
- return new_pos, new_ref2, new_alt2
-
- # This function looks at the ref and alt sequences and removes
- # where the overlapping sequences contain the same nucleotide.
- # This trims from the end first but does not remove the first nucleotide
- # because based on the format of VCF input the
- # first nucleotide of the ref and alt sequence occur
- # at the position specified.
- # End removed first, not the first nucleotide
- # Front removed and position changed
- def trimming_vcf_input(self, ref, alt, pos, strand):
- pos = int(pos)
- reflen = len(ref)
- altlen = len(alt)
- minlen = min(reflen, altlen)
- new_ref = ref
- new_alt = alt
- new_pos = pos
- # Trims from the end. Except don't remove the first nucleotide.
- # 1:6530968 CTCA -> GTCTCA becomes C -> GTC.
- for nt_pos in range(0, minlen - 1):
- if ref[reflen - nt_pos - 1] == alt[altlen - nt_pos - 1]:
- new_ref = ref[:reflen - nt_pos - 1]
- new_alt = alt[:altlen - nt_pos - 1]
- else:
- break
- new_ref_len = len(new_ref)
- new_alt_len = len(new_alt)
- minlen = min(new_ref_len, new_alt_len)
- new_ref2 = new_ref
- new_alt2 = new_alt
- # Trims from the start. 1:6530968 G -> GT becomes 1:6530969 - -> T.
- for nt_pos in range(0, minlen):
- if new_ref[nt_pos] == new_alt[nt_pos]:
- if strand == '+':
- new_pos += 1
- elif strand == '-':
- new_pos -= 1
- new_ref2 = new_ref[nt_pos + 1:]
- new_alt2 = new_alt[nt_pos + 1:]
- else:
- new_ref2 = new_ref[nt_pos:]
- new_alt2 = new_alt[nt_pos:]
- break
- return new_ref2, new_alt2, new_pos
-
-
-if __name__ == "__main__":
- c = CravatConverter()
\ No newline at end of file
diff -r 18982667bd10 -r 6b607ebcd674 cravat_submit/cravat_submit.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat_submit/cravat_submit.py Tue Jun 12 12:06:28 2018 -0400
@@ -0,0 +1,103 @@
+import requests
+import json
+import time
+import urllib
+import sys
+import csv
+
+input_filename = sys.argv[1]
+input_select_bar = sys.argv[2]
+output_filename = sys.argv[3]
+
+# HACK: Input args corrections.
+if input_select_bar == "None":
+ # The server represents an analyses of None as ""; however, submitting a blank string on command line throws off arg position
+ input_select_bar = ""
+ # The server represents the "Vest and Chasm" analyses as "VEST;CHASM; however, galaxy converts the semi-colon to an 'X'. Switch it back.
+elif input_select_bar == "VESTXCHASM":
+ input_select_bar = "VEST;CHASM"
+
+write_header = True
+
+#plugs in params to given URL
+submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar})
+#,'analysis':input_select_bar,'functionalannotation': "on"})
+#Makes the data a json dictionary, takes out only the job ID
+jobid = json.loads(submit.text)['jobid']
+#out_file.write(jobid)
+submitted = json.loads(submit.text)['status']
+#out_file.write('\t' + submitted)
+
+#loops until we find a status equal to Success, then breaks
+while True:
+ check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
+ status = json.loads(check.text)['status']
+ resultfileurl = json.loads(check.text)['resultfileurl']
+ #out_file.write(str(status) + ', ')
+ if status == 'Success':
+ #out_file.write('\t' + resultfileurl)
+ break
+ else:
+ time.sleep(2)
+
+#out_file.write('\n')
+
+#creates three files
+file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv'
+file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv'
+file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv'
+
+
+#Download the two results
+urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1)
+urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2)
+
+headers = []
+duplicates = []
+
+#opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer
+with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
+ tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
+ tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
+ tsvout = csv.writer(tsvout, delimiter='\t')
+
+#loops through each row in the Variant Additional Details file
+ for row in tsvreader_2:
+ #sets row_2 equal to the same row in Variant Result file
+ row_2 = tsvreader_1.next()
+ #checks if row is empty or if the first term contains '#'
+ if row == [] or row[0][0] == '#':
+ continue
+ #checks if the row begins with input line
+ if row[0] == 'Input line':
+ #Goes through each value in the headers list in VAD
+ for value in row:
+ #Adds each value into headers
+ headers.append(value)
+ #Loops through the Keys in VR
+ for value in row_2:
+ #Checks if the value is already in headers
+ if value in headers:
+ continue
+ #else adds the header to headers
+ else:
+ headers.append(value)
+
+ print headers
+ tsvout.writerow(headers)
+
+
+ else:
+
+ cells = []
+ #Goes through each value in the next list
+ for value in row:
+ #adds it to cells
+ cells.append(value)
+ #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file)
+ for value in row_2[11:]:
+ #adds in the rest of the values to cells
+ cells.append(value)
+
+ print cells
+ tsvout.writerow(cells)
\ No newline at end of file
diff -r 18982667bd10 -r 6b607ebcd674 cravat_submit/cravat_submit.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat_submit/cravat_submit.xml Tue Jun 12 12:06:28 2018 -0400
@@ -0,0 +1,34 @@
+
+ Submits, checks for, and retrieves data for cancer annotation
+ cravat_submit.py $input $dropdown $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This tool submits, checks for, and retrieves data for cancer annotation.
+
+
+