5
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Usage vcf2gvf.py <vcf_file input> <gff_file output>
|
|
4 """
|
|
5 import sys,re
|
|
6
|
|
7 ##simplify the variant descriptor for GVF
|
|
8 def get_vartype(type_str):
|
|
9 type_str=type_str.upper()
|
|
10 for V in ['COMPLEX', 'MNP', 'DEL', 'INDEL','INS','SNP']:
|
|
11 if V in type_str:
|
|
12 return V
|
|
13 break
|
|
14
|
|
15
|
|
16
|
|
17 in_vcf_file = open(sys.argv[1], 'r')
|
|
18 out_gff_file = open(sys.argv[2], 'w')
|
|
19 out_gff_file.write("#gff-version 3\n") ####gvf-version 1.06
|
|
20
|
|
21 ##only deal with tidy source lines, samtools is special needs
|
|
22 source_pat = re.compile('##source=(\S+)', re.IGNORECASE)
|
|
23
|
|
24
|
|
25 while True:
|
|
26 line=in_vcf_file.readline()
|
|
27 if line[0]!='#':
|
|
28 break
|
|
29 elif line[2:10]=='samtools':
|
|
30 source_type='SAMTOOLS'
|
|
31 break
|
|
32 else:
|
|
33 m=source_pat.match(line)
|
|
34 if m:
|
|
35 source_type=m.group(1).upper()
|
|
36 break
|
|
37
|
|
38
|
|
39 ##now read the data
|
|
40 ##samtools only distinguishes SNPs and indels
|
|
41 ##This is much complicated in highly heterozygous systems
|
|
42 ##see http://www.sequenceontology.org/miso/current_release/term/SO:0001059
|
|
43 ##GVF column 3 requires a type from this ontology
|
|
44 ###vcf type may be snp, mnp, ins, del, or complex.
|
|
45 ##for current purposes map multiple types to most complex state
|
|
46 ##ie SNP,SNP -> SNP; SNP,MNP-> MNP; SNP,complex
|
|
47 try:
|
|
48 for line in in_vcf_file:
|
|
49 if line[0] != '#':
|
|
50 var=line.split() ##see if var[7] has indel at start. If so add TYPE=INDEL for samtools case
|
|
51 info_field=var[7].split(';')
|
|
52 if info_field[0]=='INDEL':
|
|
53 info_field[0]="TYPE=INDEL"
|
|
54 info_dict=dict(X.split('=') for X in info_field)
|
|
55 ##if no TYPE key, then add TYPE=SNP for samtools case
|
|
56 if not info_dict.has_key('TYPE'):
|
|
57 info_dict['TYPE']='SNP';
|
|
58 var_type=get_vartype(info_dict['TYPE'])
|
|
59 ID=":".join([var[0],source_type,var_type,var[1]])
|
|
60 start=int(var[1])
|
|
61 length=len(var[3]) ## using reference length in this case
|
|
62 attributes=";".join(['ID='+ID,'Reference_seq='+var[3],'Variant_seq='+var[4]])
|
|
63 output_line=[var[0], source_type, var_type, str(start), str(start+length-1) ,'.','.','.',attributes,"\n"]
|
|
64 out_gff_file.write("\t".join([str(X) for X in output_line]))
|
|
65 finally:
|
|
66 in_vcf_file.close()
|
|
67 out_gff_file.close()
|
|
68
|