annotate baseline/script_xlsx.py @ 88:a84ac3ee87e1 draft

"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
author rhpvorderman
date Wed, 27 Oct 2021 12:32:21 +0000
parents 729738462297
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
83
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
1 import xlrd
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
2 import argparse
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
3
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
# Command-line interface: an Excel workbook and a reference file go in, one
# text file comes out.
# NOTE(review): the --input help text names column J for the sequence, but
# the script reads seq_column = 8, which is column I when counting from 0.
# Confirm which of the two is intended.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input",
    required=True,
    help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence",
)
parser.add_argument("--ref", required=True, help="Reference file")
parser.add_argument("--output", required=True, help="Output file")

# All three paths are opened unconditionally further down, so a missing
# option is reported here as a usage error instead of surfacing later as a
# failure on a None path.
args = parser.parse_args()
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
10
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
# 0-based column indexes into each worksheet (0 = A, so 6 = G, 7 = H, 8 = I).
gene_column = 6  # gene annotation, looked up in the reference
id_column = 7  # sequence id
# NOTE(review): index 8 is column I, while the --input help text names
# column J for the sequence. Confirm which one is intended.
seq_column = 8
# Column letters by 0-based index, used to name a column in messages.
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
15
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
16
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
# Load the reference file (FASTA-style: ">" header lines followed by
# sequence lines) into refdic. Keys are the header line without the leading
# ">" and without trailing whitespace; values are the record's sequence
# lines joined together.
refdic = dict()
with open(args.ref, 'r') as ref:
    currentSeq = ""
    currentId = ""
    # Iterate the handle directly so the file is streamed line by line.
    for line in ref:
        # Strings must be compared by value; identity ("is") only matches
        # when the interpreter happens to reuse the same string object.
        if line.startswith(">"):
            # A new header closes the previous record. A record with no
            # sequence lines before the next header is not stored.
            if currentSeq != "" and currentId != "":
                refdic[currentId[1:]] = currentSeq
            currentId = line.rstrip()
            currentSeq = ""
        else:
            currentSeq += line.rstrip()
    # Store the last record, unless no header was ever seen: without an id
    # this would create an entry under the empty key "".
    if currentId != "":
        refdic[currentId[1:]] = currentSeq
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
30
729738462297 "planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents: 0
diff changeset
# Convert every worksheet to text. Each accepted sheet gets a ">>>sheet"
# line; then, for each gene that is present in refdic, a ">>gene" line, the
# reference sequence, and a ">id" line plus a sequence line per matching row.
# Genes that are not in refdic are reported on stdout and left out.
#
# xlrd's second positional parameter is the log file, not an open mode, so
# only the path is passed.
# The output file is opened in append mode, so existing content is kept.
with xlrd.open_workbook(args.input) as wb, open(args.output, 'a') as o:
    for sheet in wb.sheets():
        # The header check below reads row 1, and every data row reads the
        # gene, id and sequence columns; skip sheets too small for that.
        if sheet.nrows < 2 or sheet.ncols <= max(gene_column, id_column, seq_column):
            print("Sheet " + sheet.name + " has too few rows or columns, skipping sheet " + sheet.name)
            continue
        # str() keeps the check from failing on a non-text cell.
        if "IGHV" not in str(sheet.cell(1, gene_column).value):
            print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
            continue
        o.write(">>>" + sheet.name + "\n")

        # Group the rows (row 0 is skipped) by gene name with ">" removed.
        outputdic = dict()
        for rowindex in range(1, sheet.nrows):
            gene = sheet.cell(rowindex, gene_column).value.replace(">", "")
            entry = (
                sheet.cell(rowindex, id_column).value.replace(">", ""),
                sheet.cell(rowindex, seq_column).value,
            )
            outputdic.setdefault(gene, []).append(entry)

        for k, entries in outputdic.items():
            if k not in refdic:
                print(k + " not in reference, skipping " + k)
                continue
            o.write(">>" + k + "\n")
            o.write(refdic[k] + "\n")
            for seq_id, seq in entries:
                o.write(">" + seq_id + "\n")
                o.write(seq + "\n")