0
|
1 import sys
|
|
2 import re
|
|
3 import requests
|
|
4
|
|
# Matches the start of one input record. Whitespace-separated fields, in order:
#   1. ID         - letters, digits, '_' or ':'
#   2. chromosome - chr1..chr22, chrX or chrY (captured)
#   3. position   - decimal digits
#   4. strand     - '+' or '-'
#   5. reference  - one or more of A/T/G/C, or a single '-' (captured)
#   6. alternate  - one or more of A/T/G/C, or a single '-' (captured)
# Matching is case-insensitive. The pattern has no end anchor, so anything
# following the alternate field (e.g. a sample column) is not validated.
# A raw string is used so that `\s` reaches the regex engine untouched instead
# of being treated as an (invalid) string escape.
chromosome_re = re.compile(
    r'[0-9a-zA-Z_:]+\s+(chr[1-9]|chr1[0-9]|chr2[0-2]|chr[XY])\s+[0-9]+\s+[+-]\s+([ATGC]+|-)\s+([ATGC]+|-)',
    re.IGNORECASE)
|
|
6
|
|
def is_correct_input_line(line):
    """Return True when *line* begins with a well-formed variant record.

    The check is ``chromosome_re.match(line)``, which is anchored at the start
    of the string only; text after the matched fields is not inspected.

    Args:
        line: One line of the input file (a trailing newline is harmless).

    Returns:
        True if the pattern matches at the start of *line*, otherwise False.
    """
    return chromosome_re.match(line) is not None
|
|
12
|
|
def query(line, timeout=60):
    """Request the annotation of one variant from the service at ``query_url``.

    The whitespace-separated fields of *line* are joined with underscores and
    sent as the ``mutation`` query parameter of an HTTP GET request.

    Args:
        line: Whitespace-separated variant fields.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    params = {'mutation': '_'.join(line.split())}
    response = requests.get(query_url, params=params, timeout=timeout)
    # Fail with the HTTP status instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
|
|
19
|
1
|
# Endpoint that query() sends its GET requests to.
query_url = 'http://www.cravat.us/rest/service/query'

# Fixed leading columns of the output table, in output order. Annotation keys
# with any of these names are not repeated among the annotation columns.
first_headers = [
    'ID',
    'Chromosome',
    'Position',
    'Strand',
    'Reference base(s)',
    'Alternate base(s)',
    'Sample',
]
|
|
29
|
|
# Command line: <script> INPUT_FILE OUTPUT_FILE
input_filename = sys.argv[1]
output_filename = sys.argv[2]

# Annotation column names, taken from the first successful query and reused
# for every later row so that all rows share the same columns.
headers = []
header_not_loaded = True

# `with` guarantees both files are closed even if a query or a write fails
# part-way through the input.
with open(input_filename) as f:
    with open(output_filename, 'w') as wf:
        for line in f:
            if not is_correct_input_line(line):
                # Strip only the line terminator; slicing off the last
                # character would eat real data when the final line of the
                # file has no trailing newline.
                print('Wrong format line:' + line.rstrip('\r\n'))
                continue

            # split() already discards the trailing newline, so the line does
            # not need to be truncated first.
            toks = line.split()
            sample_id = toks[6] if len(toks) >= 7 else 'Unknown'

            # Every field after the ID is sent, including the sample column
            # when present.
            # NOTE(review): confirm the service expects the sample here.
            annot = query(' '.join(toks[1:]))

            if header_not_loaded:
                # Sorted annotation keys, minus those that would duplicate a
                # fixed leading column.
                headers = [key for key in sorted(annot)
                           if key not in first_headers]
                wf.write('\t'.join(first_headers))
                for header in headers:
                    wf.write('\t' + header)
                wf.write('\n')
                header_not_loaded = False

            wf.write('\t'.join(toks[:6]) + '\t' + sample_id)
            for header in headers:
                wf.write('\t' + annot[header])
            wf.write('\n')
|