# HG changeset patch
# User insilicosolutions
# Date 1431545058 14400
# Node ID 9e29dd2972abda7d76dd04c7250d253e6d4edd7b
Uploaded
diff -r 000000000000 -r 9e29dd2972ab cravat/cravat.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/cravat.py Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,64 @@
+import sys
+import re
+import requests
+
+# One CRAVAT input row: ID, chromosome (chr1-22, chrX, chrY), position, strand, ref, alt.
+chromosome_re = re.compile(r'[0-9a-zA-Z_:]+\s+(chr[1-9]|chr1[0-9]|chr2[0-2]|chr[XY])\s+[0-9]+\s+[+-]\s+([ATGC]+|-)\s+([ATGC]+|-)', re.IGNORECASE)
+
+def is_correct_input_line (line):
+    """Return True when line starts with a well-formed CRAVAT input row."""
+    # match() anchors only at the start, so trailing columns such as the sample are accepted.
+    return chromosome_re.match(line) is not None
+
+def query (line):
+    """Fetch the CRAVAT annotation for one variant and return the decoded JSON."""
+    # The whitespace-separated fields are joined with underscores to form the mutation value.
+    mutation = '_'.join(line.split())
+    return requests.get(query_url + '?mutation=' + mutation).json()
+
+# REST endpoint that query() calls once per variant.
+# NOTE(review): this is the staging host - confirm that is intended.
+query_url = 'http://staging.cravat.us/rest/service/query'
+
+# Leading output columns, written ahead of the annotation columns.
+first_headers = [
+    'ID', 'Chromosome', 'Position', 'Strand',
+    'Reference base(s)', 'Alternate base(s)', 'Sample',
+]
+
+# Command line: cravat.py <input file> <output file>
+input_filename = sys.argv[1]
+output_filename = sys.argv[2]
+
+# Annotation column names; filled in from the first query response.
+headers = []
+header_not_loaded = True
+
+# Annotate every well-formed input row and write one tab-separated row each.
+# The with-statement closes both files even when a query raises.
+with open(input_filename) as f, open(output_filename, 'w') as wf:
+    for line in f:
+        # Strip only the line terminator: line[:-1] also removed the last
+        # character of a final line that has no trailing newline.
+        line = line.rstrip('\r\n')
+        if not is_correct_input_line(line):
+            print('Wrong format line:' + line)
+            continue
+
+        toks = line.split()
+        # The seventh column (sample) is optional.
+        sample_id = toks[6] if len(toks) >= 7 else 'Unknown'
+        annot = query(' '.join(toks[1:]))
+        if header_not_loaded:
+            # Column order is fixed by the first response: its keys, sorted,
+            # minus any that duplicate a leading column.
+            headers = sorted(k for k in annot.keys() if k not in first_headers)
+            wf.write('\t'.join(first_headers + headers) + '\n')
+            header_not_loaded = False
+
+        # One output row: six variant columns, the sample, then the annotations.
+        wf.write('\t'.join(toks[:6]) + '\t' + sample_id)
+        for header in headers:
+            # NOTE(review): assumes every annotation value is a string - confirm.
+            wf.write('\t' + annot[header])
+        wf.write('\n')
diff -r 000000000000 -r 9e29dd2972ab cravat/cravat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/cravat.xml Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,19 @@
+
+ CRAVAT annotation
+ cravat.py $input $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This tool queries CRAVAT for variants
+
diff -r 000000000000 -r 9e29dd2972ab cravat/test-data/test_input.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/test-data/test_input.txt Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,5 @@
+TR1 chr22 30421786 + A T sample_1
+TR2 chr22 40814500 - A G sample_1
+TR3 chr22 25115450 + - AGG sample_2
+TR4 chr22 234234 + AGA - sample_2
+TR5 chr2 219134766 + G - sample_3
diff -r 000000000000 -r 9e29dd2972ab cravat/test-data/test_output.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/test-data/test_output.tsv Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,6 @@
+ID Chromosome Position Strand Reference base(s) Alternate base(s) Sample 1000 Genomes allele frequency Driver Genes ESP6500 allele frequency (African American) ESP6500 allele frequency (European American) ExAC allele frequency (African/African American) ExAC allele frequency (East Asian) ExAC allele frequency (Finnish) ExAC allele frequency (Latino) ExAC allele frequency (Non-Finnish European) ExAC allele frequency (Other) ExAC allele frequency (South Asian) ExAC total allele frequency HUGO symbol Mappability Warning Occurrences in COSMIC [exact nucleotide change] Occurrences in COSMIC by primary sites [exact nucleotide change] Protein sequence change in COSMIC Sequence ontology Sequence ontology all transcripts Sequence ontology protein change Sequence ontology transcript TARGET Transcript in COSMIC
+TR1 chr22 30421786 + A T sample_1 0.0077875400893390178680419921875 0.02269629947841167449951171875 0 0.0238370001316070556640625 0 0 0.001123600057326257228851318359375 0 0.0011037499643862247467041015625 0 0.0021585500799119472503662109375 MTMR3 MS NM_153051.2:N1161I(MS), NM_153050.2:N1170I(MS), ENST00000323630:N1062I(MS), ENST00000351488:N1161I(MS), ENST00000333027:N1170I(MS), ENST00000406629:N1170I(MS), ENST00000401950:N1198I(MS) N1198I NM_021090.3
+TR2 chr22 40814500 - A G sample_1 0.4832270145416259765625 0.860644996166229248046875 0.396977007389068603515625 0.871408998966217041015625 0.110581003129482269287109375 0.330637991428375244140625 0.1900829970836639404296875 0.3895820081233978271484375 0.3936649858951568603515625 0.4774529933929443359375 0.4003469944000244140625 MKL1 1 stomach(1) p.S648G (stomach 1) MS ENST00000396617:S648G(MS), ENST00000402042:S598G(MS), ENST00000407029:S648G(MS), ENST00000355630:S648G(MS) S648G NM_020831.3 ENST00000355630
+TR3 chr22 25115450 + - AGG sample_2 0 0 0 0 0 0 0 0 0 0 0 PIWIL3 II NM_001255975.1:871(II), ENST00000533313:762(II), ENST00000527701:762(II), ENST00000332271:880(II) 880 NM_001008496.3
+TR4 chr22 234234 + AGA - sample_2 0 0 0 0 0 0 0 0 0 0 0 Non-Coding
+TR5 chr2 219134766 + G - sample_3 0 0 0 0.00038654799573123455047607421875 0 0 0 0.000482450588606297969818115234375 0 0 0.0002977323601953685283660888671875 AAMP 2 large_intestine(2) p.P15fs*5 (large_intestine 2) FD ENST00000248450:15(FD), ENST00000444053:15(FD) 15 NM_001087.3 ENST00000248450
diff -r 000000000000 -r 9e29dd2972ab cravat/vcf_to_cravat.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/vcf_to_cravat.py Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,133 @@
+import sys
+import re
+
+def extract_vcf_variant (strand, pos, ref, alt):
+    """Reduce a VCF REF/ALT pair to its minimal differing alleles.
+
+    Bases shared at the end of both alleles are dropped first, then bases
+    shared at the start; each dropped leading base shifts the position by
+    one (up for strand '+', down for strand '-', unchanged otherwise).
+    An allele left empty by the trimming is reported as '-'.
+
+    Args:
+        strand: '+' or '-'; selects the direction of the position shift.
+        pos: position of the variant (int or numeric string).
+        ref: reference allele string.
+        alt: alternate allele string.
+
+    Returns:
+        Tuple (position, reference allele, alternate allele).
+    """
+    pos = int(pos)
+    # Identical single bases are returned untouched instead of as '-' / '-'.
+    if len(ref) == 1 and len(alt) == 1 and ref == alt:
+        return pos, ref, alt
+
+    # Count the bases shared at the right-hand end and cut them off.
+    limit = min(len(ref), len(alt))
+    shared = 0
+    while shared < limit and ref[-1 - shared] == alt[-1 - shared]:
+        shared += 1
+    new_ref = ref[:len(ref) - shared]
+    new_alt = alt[:len(alt) - shared]
+
+    # Count the bases still shared at the left-hand end and cut them off.
+    limit = min(len(new_ref), len(new_alt))
+    shared = 0
+    while shared < limit and new_ref[shared] == new_alt[shared]:
+        shared += 1
+    new_ref, new_alt = new_ref[shared:], new_alt[shared:]
+
+    # Each trimmed leading base moves the position one step along the strand.
+    step = {'+': 1, '-': -1}.get(strand, 0)
+
+    # Test the *trimmed* reference: checking the pre-trim value left an empty
+    # string in the output for pure insertions such as A -> AT.
+    return pos + step * shared, new_ref or '-', new_alt or '-'
+
+# Command line: vcf_to_cravat.py <input VCF> <output file>
+input_filename, output_filename = sys.argv[1], sys.argv[2]
+
+# Both handles stay open for the whole run and are closed at the end of the script.
+f, wf = open(input_filename), open(output_filename, 'w')
+# Number of VCF lines read so far; incremented once per line read.
+vcf_line_no = 0
+
+# Read up to the '#CHROM' header line and collect the sample column names.
+samples = []  # stays empty when the VCF has no header line or no sample columns
+for line in f:
+    vcf_line_no += 1
+    if line[:6] == '#CHROM':
+        # The first nine columns are the fixed fields plus FORMAT; every later
+        # column is one sample name (toks[9] alone kept only the first of them).
+        samples = re.split(r'\s+', line.rstrip())[9:]
+        break
+no_samples = len(samples)
+
+# Convert each VCF record after the header into CRAVAT input rows.
+for line in f:
+    vcf_line_no += 1
+    if line[0] == '#':
+        continue
+
+    toks = re.split(r'\s+', line.rstrip())
+
+    # A record needs the eight fixed VCF columns; shorter lines are skipped.
+    if len(toks) < 8:
+        continue
+
+    chrom, pos, uidbase, ref, alts = toks[:5]
+    if uidbase == '.':
+        # No ID in the VCF: synthesize one from the line number.
+        uidbase = 'VAR' + str(vcf_line_no)
+    # lower must be *called*; comparing the bound method itself to 'chr' is
+    # always unequal, which prefixed 'chr' even onto names that already had it.
+    if chrom[:3].lower() != 'chr':
+        chrom = 'chr' + chrom
+    alts = alts.split(',')
+
+    len_alts = len(alts)
+    if len(toks) == 8:
+        # No genotype columns: emit every ALT allele with sample 'Unknown'.
+        for altno, alt in enumerate(alts):
+            if len_alts == 1:
+                uid = uidbase
+            else:
+                uid = uidbase + '_' + str(altno + 1)
+            newpos, newref, newalt = extract_vcf_variant('+', pos, ref, alt)
+            cravat_line = '\t'.join([uid, chrom, str(newpos), '+', newref, newalt, 'Unknown'])
+            wf.write(cravat_line + '\n')
+    elif len(toks) > 8:
+        sample_datas = toks[9:]
+        # Map each FORMAT key to its index inside the per-sample fields.
+        # enumerate() supplies the index the old counter never advanced.
+        genotype_fields = {}
+        for genotype_field_no, genotype_field in enumerate(toks[8].split(':')):
+            genotype_fields[genotype_field] = genotype_field_no
+
+        if 'GT' not in genotype_fields:
+            print('No GT Field at line ' + str(vcf_line_no) + ' [' + line.strip() + ']')
+            continue
+
+        gt_field_no = genotype_fields['GT']
+
+        for sample_no, sample_field in enumerate(sample_datas):
+            sample = samples[sample_no]
+            sample_data = sample_field.split(':')
+            # Distinct allele indexes called for this sample; '.' = no call.
+            calls = sample_data[gt_field_no].replace('/', '|').split('|')
+            gts = sorted(set(int(gt) for gt in calls if gt != '.'))
+            for gt in gts:
+                if gt == 0:
+                    # Allele 0 is the reference: nothing to annotate.
+                    continue
+                alt = alts[gt - 1]
+                if len_alts == 1:
+                    uid = uidbase
+                else:
+                    uid = uidbase + ':' + str(gt)
+                newpos, newref, newalt = extract_vcf_variant('+', pos, ref, alt)
+                cravat_line = '\t'.join([uid + '_' + sample, chrom, str(newpos), '+', newref, newalt, sample])
+                wf.write(cravat_line + '\n')
+f.close()
+wf.close()
diff -r 000000000000 -r 9e29dd2972ab cravat/vcf_to_cravat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/vcf_to_cravat.xml Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,19 @@
+
+ VCF to CRAVAT input format conversion
+ vcf_to_cravat.py $input $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This tool converts a VCF format file to a CRAVAT input format file.
+