Mercurial > repos > davidvanzessen > shm_csr
comparison baseline/script_xlsx.py @ 0:c33d93683a09 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 13 Oct 2016 10:52:24 -0400 |
parents | |
children | ba33b94637ca 729738462297 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c33d93683a09 |
---|---|
1 import xlrd | |
2 import argparse | |
3 | |
# Convert an Excel workbook of annotated sequences into a flat text file in
# which every sheet (">>>name"), every reference gene (">>gene" followed by the
# reference sequence) and every sequence (">id" followed by the sequence) is
# written in turn.

parser = argparse.ArgumentParser()
parser.add_argument("--input", required=True, help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
parser.add_argument("--ref", required=True, help="Reference file")
parser.add_argument("--output", required=True, help="Output file")

# Zero-based spreadsheet column indexes (0 is column A).
gene_column = 6
id_column = 7
# NOTE(review): index 8 is column I, while the --input help text says the
# sequence lives in column J (index 9) -- confirm which one is intended.
seq_column = 8
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


def _store_record(refdic, header, chunks):
    """Add one parsed FASTA record to *refdic* unless its id or sequence is empty."""
    sequence = "".join(chunks)
    if header and sequence:
        # The key is the header line without its leading ">".
        refdic[header[1:]] = sequence


def read_reference(path):
    """Parse the FASTA-style reference file at *path*.

    Lines starting with ">" open a new record; all following lines up to the
    next header are concatenated (trailing whitespace stripped) into that
    record's sequence.  Records without a sequence, and sequence lines that
    precede the first header, are ignored.

    Returns a dict mapping the header text (minus the leading ">") to the
    sequence.  An empty file yields an empty dict.
    """
    refdic = dict()
    header = ""
    chunks = []
    with open(path, 'r') as handle:
        for line in handle:
            # Compare by value: "is" tests identity and only appeared to work
            # for short strings because of interpreter interning.
            if line.startswith(">"):
                _store_record(refdic, header, chunks)
                header = line.rstrip()
                chunks = []
            else:
                chunks.append(line.rstrip())
    # Flush the last record with the same rule as the others.
    _store_record(refdic, header, chunks)
    return refdic


def sheet_has_genes(sheet):
    """Return True when the first data row of *sheet* holds an IGHV gene name.

    Sheets too small to have a cell at row index 1 / column ``gene_column``,
    or whose cell there is not text, count as having no genes instead of
    raising.
    """
    if sheet.nrows < 2 or sheet.ncols <= gene_column:
        return False
    value = sheet.cell(1, gene_column).value
    try:
        return "IGHV" in value
    except TypeError:
        # Numeric/boolean cells do not support substring tests.
        return False


def group_sheet_rows(sheet):
    """Group the data rows of *sheet* (row index 1 onwards) by gene.

    Returns a dict mapping the gene cell text (with ">" removed) to a list of
    ``(sequence id, sequence)`` tuples in row order; ">" is also removed from
    the sequence id.
    """
    grouped = dict()
    for rowindex in range(1, sheet.nrows):
        gene = sheet.cell(rowindex, gene_column).value.replace(">", "")
        seq_id = sheet.cell(rowindex, id_column).value.replace(">", "")
        sequence = sheet.cell(rowindex, seq_column).value
        grouped.setdefault(gene, []).append((seq_id, sequence))
    return grouped


def write_sheet(sheet, refdic, out):
    """Write one sheet to the open file object *out*.

    Emits the ">>>sheet name" line, then for every gene of the sheet that is
    present in *refdic* the ">>gene" line, its reference sequence and all
    ">id"/sequence pairs.  Genes missing from *refdic* are reported on stdout
    and skipped.
    """
    out.write(">>>" + sheet.name + "\n")
    grouped = group_sheet_rows(sheet)
    for gene in grouped:
        if gene not in refdic:
            print(gene + " not in reference, skipping " + gene)
            continue
        out.write(">>" + gene + "\n")
        out.write(refdic[gene] + "\n")
        for seq_id, sequence in grouped[gene]:
            out.write(">" + seq_id + "\n")
            out.write(sequence + "\n")


def main():
    """Parse the command line and convert the workbook to the output file."""
    args = parser.parse_args()
    refdic = read_reference(args.ref)
    # open_workbook's second positional parameter is the log file, not a file
    # mode, so only the filename is passed.
    with xlrd.open_workbook(args.input) as wb:
        # Append mode is kept from the original script: existing output
        # content is preserved and new records are added after it.
        with open(args.output, 'a') as o:
            for sheet in wb.sheets():
                if not sheet_has_genes(sheet):
                    print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
                    continue
                write_sheet(sheet, refdic, o)


if __name__ == "__main__":
    main()