Mercurial > repos > jjohnson > defuse
comparison defuse_results_to_vcf.py @ 11:b22f8634ff84 draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 23b94b5747c6956360cd2eca0a07a669929ea141-dirty
author | jjohnson |
---|---|
date | Sun, 17 Jan 2016 14:11:06 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
10:f65857c1b92e | 11:b22f8634ff84 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2012, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # Jesse Erdmann | |
12 # | |
13 #------------------------------------------------------------------------------ | |
14 """ | |
15 | |
16 | |
17 """ | |
18 This tool takes the defuse results.tsv tab-delimited file as input and creates a Variant Call Format file as output. | |
19 """ | |
20 | |
21 import sys,re,os.path | |
22 import optparse | |
23 from optparse import OptionParser | |
24 | |
25 """ | |
26 http://www.1000genomes.org/wiki/analysis/variant-call-format/vcf-variant-call-format-version-42 | |
27 | |
28 5. INFO keys used for structural variants | |
29 When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). | |
30 The following INFO keys are reserved for encoding structural variants. In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). | |
31 ##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> | |
32 ##INFO=<ID=NOVEL,Number=0,Type=Flag,Description="Indicates a novel structural variation"> | |
33 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> | |
34 For precise variants, END is POS + length of REF allele - 1, and the for imprecise variants the corresponding best estimate. | |
35 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> | |
36 Value should be one of DEL, INS, DUP, INV, CNV, BND. This key can be derived from the REF/ALT fields but is useful for filtering. | |
37 ##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> | |
38 One value for each ALT allele. Longer ALT alleles (e.g. insertions) have positive values, shorter ALT alleles (e.g. deletions) have negative values. | |
39 ##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> | |
40 ##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> | |
41 ##INFO=<ID=HOMLEN,Number=.,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints"> | |
42 ##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints"> | |
43 ##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file"> | |
44 For precise variants, the consensus sequence the alternate allele assembly is derivable from the REF and ALT fields. However, the alternate allele assembly file may contain additional information about the characteristics of the alt allele contigs. | |
45 ##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY"> | |
46 ##INFO=<ID=METRANS,Number=4,Type=String,Description="Mobile element transduction info of the form CHR,START,END,POLARITY"> | |
47 ##INFO=<ID=DGVID,Number=1,Type=String,Description="ID of this element in Database of Genomic Variation"> | |
48 ##INFO=<ID=DBVARID,Number=1,Type=String,Description="ID of this element in DBVAR"> | |
49 ##INFO=<ID=DBRIPID,Number=1,Type=String,Description="ID of this element in DBRIP"> | |
50 ##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends"> | |
51 ##INFO=<ID=PARID,Number=1,Type=String,Description="ID of partner breakend"> | |
52 ##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> | |
53 ##INFO=<ID=CILEN,Number=2,Type=Integer,Description="Confidence interval around the length of the inserted material between breakends"> | |
54 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth of segment containing breakend"> | |
55 ##INFO=<ID=DPADJ,Number=.,Type=Integer,Description="Read Depth of adjacency"> | |
56 ##INFO=<ID=CN,Number=1,Type=Integer,Description="Copy number of segment containing breakend"> | |
57 ##INFO=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency"> | |
58 ##INFO=<ID=CICN,Number=2,Type=Integer,Description="Confidence interval around copy number for the segment"> | |
59 ##INFO=<ID=CICNADJ,Number=.,Type=Integer,Description="Confidence interval around copy number for the adjacency"> | |
60 6. FORMAT keys used for structural variants | |
61 ##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events"> | |
62 ##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events"> | |
63 ##FORMAT=<ID=CNL,Number=.,Type=Float,Description="Copy number genotype likelihood for imprecise events"> | |
64 ##FORMAT=<ID=NQ,Number=1,Type=Integer,Description="Phred style probability score that the variant is novel with respect to the genome's ancestor"> | |
65 ##FORMAT=<ID=HAP,Number=1,Type=Integer,Description="Unique haplotype identifier"> | |
66 ##FORMAT=<ID=AHAP,Number=1,Type=Integer,Description="Unique identifier of ancestral haplotype"> | |
67 These keys are analogous to GT/GQ/GL and are provided for genotyping imprecise events by copy number (either because there is an unknown number of alternate alleles or because the haplotypes cannot be determined). CN specifies the integer copy number of the variant in this sample. CNQ is encoded as a phred quality -10log_10p(copy number genotype call is wrong). CNL specifies a list of log10 likelihoods for each potential copy number, starting from zero. When possible, GT/GQ/GL should be used instead of (or in addition to) these keys. | |
68 | |
69 Specifying Complex Rearrangements with Breakends | |
70 An arbitrary rearrangement event can be summarized as a set of novel adjacencies. | |
71 Each adjacency ties together 2 breakends. The two breakends at either end of a novel adjacency are called mates. | |
72 There is one line of VCF (i.e. one record) for each of the two breakends in a novel adjacency. A breakend record is identified with the tag SYTYPE=BND" in the INFO field. The REF field of a breakend record indicates a base or sequence s of bases beginning at position POS, as in all VCF records. The ALT field of a breakend record indicates a replacement for s. This "breakend replacement" has three parts: | |
73 the string t that replaces places s. The string t may be an extended version of s if some novel bases are inserted during the formation of the novel adjacency. | |
74 The position p of the mate breakend, indicated by a string of the form "chr:pos". This is the location of the first mapped base in the piece being joined at this novel adjacency. | |
75 The direction that the joined sequence continues in, starting from p. This is indicated by the orientation of square brackets surrounding p. | |
76 These 3 elements are combined in 4 possible ways to create the ALT. In each of the 4 cases, the assertion is that s is replaced with t, and then some piece starting at position p is joined to t. The cases are: | |
77 REF ALT Meaning | |
78 s t[p[ piece extending to the right of p is joined after t | |
79 s t]p] reverse comp piece extending left of p is joined after t | |
80 s ]p]t piece extending to the left of p is joined before t | |
81 s [p[t reverse comp piece extending right of p is joined before t | |
82 | |
83 Examples: | |
84 #CHROM POS ID REF ALT QUAL FILT INFO | |
85 2 321681 bnd_W G G]17:198982] 6 PASS SVTYPE=BND;MATEID=bnd_Y | |
86 2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND;MATEID=bnd_U | |
87 13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND;MATEID=bnd_V | |
88 13 123457 bnd_X A [17:198983[A 6 PASS SVTYPE=BND;MATEID=bnd_Z | |
89 17 198982 bnd_Y A A]2:321681] 6 PASS SVTYPE=BND;MATEID=bnd_W | |
90 17 198983 bnd_Z C [13:123457[C 6 PASS SVTYPE=BND;MATEID=bnd_X | |
91 """ | |
92 | |
93 vcf_header = """\ | |
94 ##fileformat=VCFv4.1 | |
95 ##source=defuse | |
96 ##reference=%s | |
97 ##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> | |
98 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> | |
99 ##INFO=<ID=MATEID,Number=1,Type=String,Description="ID of the BND mate"> | |
100 ##INFO=<ID=MATELOC,Number=1,Type=String,Description="The chrom:position of the BND mate"> | |
101 ##INFO=<ID=GENESTRAND,Number=2,Type=String,Description="Strands"> | |
102 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth of segment containing breakend"> | |
103 ##INFO=<ID=SPLITCNT,Number=1,Type=Integer,Description="number of split reads supporting the prediction"> | |
104 ##INFO=<ID=SPANCNT,Number=1,Type=Integer,Description="number of spanning reads supporting the fusion"> | |
105 ##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints"> | |
106 ##INFO=<ID=SPLICESCORE,Number=1,Type=Integer,Description="number of nucleotides similar to GTAG at fusion splice"> | |
107 ##INFO=<ID=GENE,Number=2,Type=String,Description="Gene Names at each breakend"> | |
108 ##INFO=<ID=GENEID,Number=2,Type=String,Description="Gene IDs at each breakend"> | |
109 ##INFO=<ID=GENELOC,Number=2,Type=String,Description="location of breakpoint releative to genes"> | |
110 ##INFO=<ID=EXPR,Number=2,Type=Integer,Description="expression of genes as number of concordant pairs aligned to exons"> | |
111 ##INFO=<ID=ORF,Number=0,Type=Flag,Description="fusion combines genes in a way that preserves a reading frame"> | |
112 ##INFO=<ID=EXONBND,Number=0,Type=Flag,Description="fusion splice at exon boundaries"> | |
113 ##INFO=<ID=INTERCHROM,Number=0,Type=Flag,Description="fusion produced by an interchromosomal translocation"> | |
114 ##INFO=<ID=READTHROUGH,Number=0,Type=Flag,Description="fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement"> | |
115 ##INFO=<ID=ADJACENT,Number=0,Type=Flag,Description="fusion between adjacent genes"> | |
116 ##INFO=<ID=ALTSPLICE,Number=0,Type=Flag,Description="fusion likely the product of alternative splicing between adjacent genes"> | |
117 ##INFO=<ID=DELETION,Number=0,Type=Flag,Description="fusion produced by a genomic deletion"> | |
118 ##INFO=<ID=EVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic eversion"> | |
119 ##INFO=<ID=INVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic inversion"> | |
120 #CHROM POS ID REF ALT QUAL FILTER INFO\ | |
121 """ | |
122 | |
123 def cmp_alphanumeric(s1,s2): | |
124 if s1 == s2: | |
125 return 0 | |
126 a1 = re.findall("\d+|[a-zA-Z]+",s1) | |
127 a2 = re.findall("\d+|[a-zA-Z]+",s2) | |
128 for i in range(min(len(a1),len(a2))): | |
129 if a1[i] == a2[i]: | |
130 continue | |
131 if a1[i].isdigit() and a2[i].isdigit(): | |
132 return int(a1[i]) - int(a2[i]) | |
133 return 1 if a1[i] > a2[i] else -1 | |
134 return len(a1) - len(a2) | |
135 | |
136 def __main__(): | |
137 # VCF functions | |
138 chr_dict = dict() | |
139 def add_vcf_line(chr,pos,id,line): | |
140 if chr not in chr_dict: | |
141 pos_dict = dict() | |
142 chr_dict[chr] = pos_dict | |
143 if pos not in chr_dict[chr]: | |
144 id_dict = dict() | |
145 chr_dict[chr][pos] = id_dict | |
146 chr_dict[chr][pos][id] = line | |
147 | |
148 def write_vcf(): | |
149 print >> outputFile, vcf_header % (refname) | |
150 for chr in sorted(chr_dict.keys(),cmp=cmp_alphanumeric): | |
151 for pos in sorted(chr_dict[chr].keys()): | |
152 for id in chr_dict[chr][pos]: | |
153 print >> outputFile, chr_dict[chr][pos][id] | |
154 #Parse Command Line | |
155 parser = optparse.OptionParser() | |
156 # files | |
157 parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' ) | |
158 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) | |
159 parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference id' ) | |
160 (options, args) = parser.parse_args() | |
161 | |
162 # results.tsv input | |
163 if options.input != None: | |
164 try: | |
165 inputPath = os.path.abspath(options.input) | |
166 inputFile = open(inputPath, 'r') | |
167 except Exception, e: | |
168 print >> sys.stderr, "failed: %s" % e | |
169 exit(2) | |
170 else: | |
171 inputFile = sys.stdin | |
172 # vcf output | |
173 if options.output != None: | |
174 try: | |
175 outputPath = os.path.abspath(options.output) | |
176 outputFile = open(outputPath, 'w') | |
177 except Exception, e: | |
178 print >> sys.stderr, "failed: %s" % e | |
179 exit(3) | |
180 else: | |
181 outputFile = sys.stdout | |
182 | |
183 refname = options.reference if options.reference else 'unknown' | |
184 | |
185 svtype = 'SVTYPE=BND' | |
186 filt = 'PASS' | |
187 columns = [] | |
188 try: | |
189 for linenum,line in enumerate(inputFile): | |
190 ## print >> sys.stderr, "%d: %s\n" % (linenum,line) | |
191 fields = line.strip().split('\t') | |
192 if line.startswith('cluster_id'): | |
193 columns = fields | |
194 ## print >> sys.stderr, "columns: %s\n" % columns | |
195 continue | |
196 cluster_id = fields[columns.index('cluster_id')] | |
197 gene_chromosome1 = fields[columns.index('gene_chromosome1')] | |
198 gene_chromosome2 = fields[columns.index('gene_chromosome2')] | |
199 genomic_strand1 = fields[columns.index('genomic_strand1')] | |
200 genomic_strand2 = fields[columns.index('genomic_strand2')] | |
201 gene1 = fields[columns.index('gene1')] | |
202 gene2 = fields[columns.index('gene2')] | |
203 gene_info = 'GENEID=%s,%s' % (gene1,gene2) | |
204 gene_name1 = fields[columns.index('gene_name1')] | |
205 gene_name2 = fields[columns.index('gene_name2')] | |
206 gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2) | |
207 gene_location1 = fields[columns.index('gene_location1')] | |
208 gene_location2 = fields[columns.index('gene_location2')] | |
209 gene_loc = 'GENELOC=%s,%s' % (gene_location1,gene_location2) | |
210 expression1 = int(fields[columns.index('expression1')]) | |
211 expression2 = int(fields[columns.index('expression2')]) | |
212 expr = 'EXPR=%d,%d' % (expression1,expression2) | |
213 genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')]) | |
214 genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')]) | |
215 breakpoint_homology = int(fields[columns.index('breakpoint_homology')]) | |
216 homlen = 'HOMLEN=%s' % breakpoint_homology | |
217 orf = fields[columns.index('orf')] == 'Y' | |
218 exonboundaries = fields[columns.index('exonboundaries')] == 'Y' | |
219 read_through = fields[columns.index('read_through')] == 'Y' | |
220 interchromosomal = fields[columns.index('interchromosomal')] == 'Y' | |
221 adjacent = fields[columns.index('adjacent')] == 'Y' | |
222 altsplice = fields[columns.index('altsplice')] == 'Y' | |
223 deletion = fields[columns.index('deletion')] == 'Y' | |
224 eversion = fields[columns.index('eversion')] == 'Y' | |
225 inversion = fields[columns.index('inversion')] == 'Y' | |
226 span_count = int(fields[columns.index('span_count')]) | |
227 splitr_count = int(fields[columns.index('splitr_count')]) | |
228 splice_score = int(fields[columns.index('splice_score')]) | |
229 probability = fields[columns.index('probability')] if columns.index('probability') else '.' | |
230 splitr_sequence = fields[columns.index('splitr_sequence')] | |
231 split_seqs = splitr_sequence.split('|') | |
232 mate_id1 = "bnd_%s_1" % cluster_id | |
233 mate_id2 = "bnd_%s_2" % cluster_id | |
234 ref1 = split_seqs[0][-1] | |
235 ref2 = split_seqs[1][0] | |
236 b1 = '[' if genomic_strand1 == '+' else ']' | |
237 b2 = '[' if genomic_strand2 == '+' else ']' | |
238 alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2) | |
239 alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2) | |
240 #TODO evaluate what should be included in the INFO field | |
241 info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,gene_loc,expr,homlen,'SPLICESCORE=%d' % splice_score] | |
242 if orf: | |
243 info.append('ORF') | |
244 if exonboundaries: | |
245 info.append('EXONBND') | |
246 if interchromosomal: | |
247 info.append('INTERCHROM') | |
248 if read_through: | |
249 info.append('READTHROUGH') | |
250 if adjacent: | |
251 info.append('ADJACENT') | |
252 if altsplice: | |
253 info.append('ALTSPLICE') | |
254 if deletion: | |
255 info.append('DELETION') | |
256 if eversion: | |
257 info.append('EVERSION') | |
258 if inversion: | |
259 info.append('INVERSION') | |
260 info1 = [svtype,'MATEID=%s;MATELOC=%s:%d' % (mate_id2,gene_chromosome2,genomic_break_pos2)] + info | |
261 info2 = [svtype,'MATEID=%s;MATELOC=%s:%d' % (mate_id1,gene_chromosome1,genomic_break_pos1)] + info | |
262 qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.' | |
263 vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) ) | |
264 vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) ) | |
265 add_vcf_line(gene_chromosome1,genomic_break_pos1,mate_id1,vcf1) | |
266 add_vcf_line(gene_chromosome2,genomic_break_pos2,mate_id2,vcf2) | |
267 write_vcf() | |
268 except Exception, e: | |
269 print >> sys.stderr, "failed: %s" % e | |
270 sys.exit(1) | |
271 | |
272 if __name__ == "__main__" : __main__() | |
273 |