0
|
1 import xlrd
|
|
2 import argparse
|
|
3
|
|
# Command-line interface. All three paths are handed directly to
# open()/xlrd.open_workbook() later in the script, so they are marked as
# required: a missing option now yields a usage error instead of a crash
# on a None path.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input",
    required=True,
    help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence"
)
parser.add_argument("--ref", required=True, help="Reference file")
parser.add_argument("--output", required=True, help="Output file")

args = parser.parse_args()

# Zero-based column indices used when reading cells from each sheet.
# NOTE(review): index 8 is spreadsheet column I, while the --input help text
# says the sequence is in column J (index 9) -- confirm which one is intended.
gene_column = 6
id_column = 7
seq_column = 8

# Spreadsheet column letters, indexed by zero-based column number.
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
15
|
|
16
|
|
def _read_fasta(handle):
    """Collect FASTA-style records from an iterable of text lines.

    A line starting with ">" opens a new record; every other line is
    stripped of trailing whitespace and appended to the current record's
    sequence.  The dictionary key is the stripped header line without its
    leading ">".

    A record is stored only when it has both a header and a non-empty
    sequence.  This rule is applied to the last record as well, so an empty
    or header-less input yields an empty dict rather than a "" key.

    :param handle: iterable yielding lines (e.g. an open text file)
    :return: dict mapping header text to the concatenated sequence
    """
    records = dict()
    header = ""
    chunks = []
    for line in handle:
        # Compare by value: identity checks against string literals only
        # work by accident of interning.
        if line.startswith(">"):
            sequence = "".join(chunks)
            if header and sequence:
                records[header[1:]] = sequence
            header = line.rstrip()
            chunks = []
        else:
            chunks.append(line.rstrip())
    # Flush the record that was still open when the input ended.
    sequence = "".join(chunks)
    if header and sequence:
        records[header[1:]] = sequence
    return records


# Reference sequences keyed by their header text (without the leading ">").
with open(args.ref, 'r') as ref:
    refdic = _read_fasta(ref)
|
|
30
|
|
# Highest column index read from a sheet; narrower sheets cannot be processed.
last_column = max(gene_column, id_column, seq_column)

# For every usable sheet, group its rows by gene and write them to the output
# file together with the matching reference sequence:
#   >>>sheet name
#   >>gene            (only for genes present in refdic)
#   reference sequence
#   >sequence id
#   sequence
#
# open_workbook's second positional parameter is the log file, not a file
# mode, so only the path is passed.
with xlrd.open_workbook(args.input) as wb:
    # The output file is opened in append mode: existing content is kept.
    with open(args.output, 'a') as o:
        for sheet in wb.sheets():
            # Row 1 is probed below and rows 1.. are read, so the sheet needs
            # at least two rows and every column that is accessed.
            if sheet.nrows < 2 or sheet.ncols <= last_column:
                print("Too few rows or columns, skipping sheet " + sheet.name)
                continue
            # Probe the first data row (row 0 is presumably a header --
            # TODO confirm) to check the gene column holds IGHV annotations.
            if sheet.cell(1, gene_column).value.find("IGHV") < 0:
                print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
                continue
            o.write(">>>" + sheet.name + "\n")

            # gene name -> list of (sequence id, sequence) tuples
            outputdic = dict()
            for rowindex in range(1, sheet.nrows):
                gene = sheet.cell(rowindex, gene_column).value.replace(">", "")
                seq_id = sheet.cell(rowindex, id_column).value.replace(">", "")
                sequence = sheet.cell(rowindex, seq_column).value
                outputdic.setdefault(gene, []).append((seq_id, sequence))

            for gene in outputdic:
                if gene not in refdic:
                    print(gene + " not in reference, skipping " + gene)
                    continue
                o.write(">>" + gene + "\n")
                o.write(refdic[gene] + "\n")
                for seq_id, sequence in outputdic[gene]:
                    o.write(">" + seq_id + "\n")
                    o.write(sequence + "\n")
|