annotate cravat/cravat.py @ 1:c13857bac2c4 draft default tip

Updated for new CRAVAT server.
author insilicosolutions
date Tue, 08 Mar 2016 16:07:46 -0500
parents 9e29dd2972ab
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
1 import sys
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
2 import re
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
3 import requests
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
4
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
# Pattern for one whitespace-separated input line:
#   <id> <chromosome> <position> <strand> <reference> <alternate>
# - id: letters, digits, '_' or ':'
# - chromosome: chr1..chr22, chrX or chrY
# - position: decimal digits
# - strand: '+' or '-'
# - reference / alternate: a run of A/T/G/C, or a single '-'
# Matching is case-insensitive and only anchored at the start of the line,
# so extra trailing columns are allowed.
# A raw string is used so that '\s' reaches the regex engine untouched
# instead of being an invalid string escape sequence.
chromosome_re = re.compile(
    r'[0-9a-zA-Z_:]+\s+(chr[1-9]|chr1[0-9]|chr2[0-2]|chr[XY])\s+[0-9]+\s+[+-]\s+([ATGC]+|-)\s+([ATGC]+|-)',
    re.IGNORECASE)


def is_correct_input_line(line):
    """Return True if *line* starts with a well-formed variant record.

    The line is checked against ``chromosome_re``; see the comment on that
    pattern for the expected column layout.

    :param line: one line of the input file (trailing newline optional).
    :returns: ``True`` when the pattern matches at the start of the line,
        ``False`` otherwise.
    """
    return chromosome_re.match(line) is not None
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
12
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
def query(line, timeout=60):
    """Query the annotation service for one variant and return its JSON.

    The whitespace-separated fields of *line* are joined with underscores
    and sent as the ``mutation`` query parameter of a GET request to
    ``query_url``.

    :param line: whitespace-separated variant fields.
    :param timeout: seconds to wait for the server before giving up.
        Without a timeout ``requests`` can block forever on a stalled
        connection.
    :returns: the decoded JSON body of the response.
    :raises requests.exceptions.HTTPError: if the server answers with a
        4xx/5xx status.
    :raises requests.exceptions.Timeout: if the server does not answer
        within *timeout* seconds.
    """
    params = {'mutation': '_'.join(line.split())}
    r = requests.get(query_url, params=params, timeout=timeout)
    # Fail with a clear HTTP error rather than a confusing JSON decoding
    # error when the server returns an error page.
    r.raise_for_status()
    return r.json()


# Endpoint queried by query(); looked up at call time, so defining it after
# the function is fine.
query_url = 'http://www.cravat.us/rest/service/query'
0
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
21
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
# Leading output columns, in order. Their values are copied from the input
# line itself; any key of the service response that has one of these names
# is not written again as an extra column.
first_headers = ['ID',
                 'Chromosome',
                 'Position',
                 'Strand',
                 'Reference base(s)',
                 'Alternate base(s)',
                 'Sample']


def annotate_file(input_filename, output_filename):
    """Annotate every valid line of *input_filename* into a TSV file.

    Each input line is validated with ``is_correct_input_line``; invalid
    lines are reported on stdout and skipped.  For each valid line the
    fields after the ID are sent to ``query`` and one tab-separated row is
    written: the first six input fields, the sample ID (seventh field, or
    ``'Unknown'`` when absent), then the annotation values.

    The header row is written once, when the first annotation arrives; the
    extra columns are the sorted keys of that first response.

    :param input_filename: path of the file to read.
    :param output_filename: path of the TSV file to (over)write.
    """
    extra_headers = []
    header_not_loaded = True

    # 'with' guarantees both files are closed even if a query fails midway.
    with open(input_filename) as f, open(output_filename, 'w') as wf:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would eat a
            # real character when the last line has no trailing newline.
            line = line.rstrip('\r\n')
            if not is_correct_input_line(line):
                print('Wrong format line:' + line)
                continue

            toks = line.split()
            sample_id = toks[6] if len(toks) >= 7 else 'Unknown'

            # NOTE(review): every field after the ID is sent, including the
            # sample column when present - confirm the service expects that.
            annot = query(' '.join(toks[1:]))

            if header_not_loaded:
                # sorted() works on both Python 2 and 3, unlike
                # keys().sort().
                extra_headers = [header for header in sorted(annot)
                                 if header not in first_headers]
                wf.write('\t'.join(first_headers + extra_headers) + '\n')
                header_not_loaded = False

            # str() tolerates non-string JSON values; .get() keeps columns
            # aligned if a later response lacks one of the first response's
            # keys.
            values = [str(annot.get(header, '')) for header in extra_headers]
            wf.write('\t'.join(toks[:6] + [sample_id] + values) + '\n')


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <input_file> <output_file>' % sys.argv[0])
    annotate_file(sys.argv[1], sys.argv[2])