view cravat/cravat.py @ 0:9e29dd2972ab draft

Uploaded
author insilicosolutions
date Wed, 13 May 2015 15:24:18 -0400
parents
children c13857bac2c4
line wrap: on
line source

import sys
import re
import requests

# Matches the start of a whitespace-separated variant line:
#   ID          letters, digits, '_' or ':'
#   chromosome  chr1-chr22, chrX or chrY
#   position    digits
#   strand      '+' or '-'
#   reference   a run of A/T/G/C, or '-'
#   alternate   a run of A/T/G/C, or '-'
# Matching is case-insensitive. Further columns after the alternate base
# (e.g. a sample id) are not inspected, but the alternate base itself must be
# followed by whitespace or end of line so that a token such as 'TZ' is not
# accepted on the strength of its leading 'T'.
chromosome_re = re.compile(
    r'[0-9a-zA-Z_:]+\s+'
    r'(chr[1-9]|chr1[0-9]|chr2[0-2]|chr[XY])\s+'
    r'[0-9]+\s+'
    r'[+-]\s+'
    r'([ATGC]+|-)\s+'
    r'([ATGC]+|-)(?=\s|$)',
    re.IGNORECASE)

def is_correct_input_line(line):
    """Return True if `line` begins with a well-formed variant record.

    The check is anchored at the start of the line only (``re.match``), so
    leading whitespace makes a line invalid while extra trailing columns are
    allowed.
    """
    return chromosome_re.match(line) is not None

def query(line, timeout=60):
    """Fetch the annotation for one variant from the service at `query_url`.

    line    -- whitespace-separated variant fields; they are joined with '_'
               and sent as the `mutation` query parameter.
    timeout -- seconds to wait for the server before giving up. Without a
               timeout, requests waits indefinitely on a stalled connection.

    Returns the decoded JSON body of the response.

    Raises requests.exceptions.RequestException on connection failure,
    timeout or a 4xx/5xx status, and ValueError if the body is not JSON.
    """
    # The URL is assembled by hand, exactly as before: passing the value via
    # `params=` would percent-encode the '+' strand character and so change
    # what is sent on the wire.
    url = query_url + '?mutation=' + '_'.join(line.split())
    r = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON decode failure
    # when the server answers with an error page.
    r.raise_for_status()
    return r.json()

# Endpoint that query() sends each variant to.
query_url = 'http://staging.cravat.us/rest/service/query'

# Leading output columns, in the order they are written for every row.
# Annotation keys that share a name with one of these are left out of the
# trailing, annotation-derived columns.
first_headers = [
    'ID',
    'Chromosome',
    'Position',
    'Strand',
    'Reference base(s)',
    'Alternate base(s)',
    'Sample',
]

# Script body: read variant lines from the input file, query the annotation
# service for each well-formed line, and write one tab-separated row per
# variant to the output file.
#
# Command line: <input_file> <output_file>
if len(sys.argv) < 3:
    sys.exit('Usage: %s <input_file> <output_file>' % sys.argv[0])
input_filename = sys.argv[1]
output_filename = sys.argv[2]

headers = []            # sorted keys of the first annotation received
extra_headers = []      # those keys minus the fixed leading columns
header_not_loaded = True

# Nested `with` blocks guarantee both files are closed even if a query fails
# part-way through the input.
with open(input_filename) as f:
    with open(output_filename, 'w') as wf:
        for line in f:
            if not is_correct_input_line(line):
                # Strip only the newline: slicing off the last character
                # would eat real data on a final line that lacks one.
                print('Wrong format line:' + line.rstrip('\n'))
                continue

            # split() already discards the trailing newline, so the line is
            # tokenised as-is; the format check above guarantees at least
            # six tokens.
            toks = line.split()
            sample_id = toks[6] if len(toks) >= 7 else 'Unknown'

            # Everything after the ID column is handed to the service.
            annot = query(' '.join(toks[1:]))

            if header_not_loaded:
                # The output columns are fixed by the first annotation:
                # its keys, sorted, excluding names already covered by the
                # leading columns.
                headers = sorted(annot.keys())
                extra_headers = [h for h in headers if h not in first_headers]
                wf.write('\t'.join(first_headers + extra_headers) + '\n')
                header_not_loaded = False

            row = toks[:6] + [sample_id]
            # '%s' formatting lets non-string JSON values (numbers, null)
            # be written instead of raising TypeError on concatenation.
            row.extend('%s' % annot[h] for h in extra_headers)
            wf.write('\t'.join(row) + '\n')