Mercurial > repos > davidvanzessen > shm_csr
annotate baseline/script_xlsx.py @ 91:f387cc1580c6 draft
"planemo upload commit 6f5bdb4189fcc9028c90365d8edf8d1d7c1cf690"
author | rhpvorderman |
---|---|
date | Wed, 02 Feb 2022 10:57:36 +0000 |
parents | 729738462297 |
children |
rev | line source |
---|---|
83
729738462297
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents:
0
diff
changeset
|
1 import xlrd |
729738462297
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents:
0
diff
changeset
|
2 import argparse |
729738462297
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents:
0
diff
changeset
|
3 |
729738462297
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
rhpvorderman
parents:
0
diff
changeset
|
# Zero-based spreadsheet column indexes read from every sheet.
# NOTE(review): the --input help text says column J holds the sequence, but
# index 8 is column I -- confirm which one is intended before changing either.
gene_column = 6
id_column = 7
seq_column = 8
# Column letters, used only to name a column in the "skipping sheet" message.
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


def build_parser():
    """Return the command-line parser (--input, --ref and --output)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
    parser.add_argument("--ref", help="Reference file")
    parser.add_argument("--output", help="Output file")
    return parser


def parse_reference(lines):
    """Parse ">"-headed reference records into a dict of id -> sequence.

    lines: any iterable of text lines, e.g. an open file.

    A line starting with ">" opens a new record; its id is the rest of that
    line with trailing whitespace stripped. The following lines, right-stripped
    and concatenated, form the sequence. Sequence lines that appear before the
    first header are ignored, and a record with an empty sequence that is
    followed by another header is dropped.
    """
    refdic = {}
    header = ""
    chunks = []
    for line in lines:
        if line.startswith(">"):
            sequence = "".join(chunks)
            if header and sequence:
                refdic[header[1:]] = sequence
            header = line.rstrip()
            chunks = []
        else:
            chunks.append(line.rstrip())
    # Store the last record only if a header was seen, so an empty or
    # header-less file does not produce a bogus "" key.
    if header:
        refdic[header[1:]] = "".join(chunks)
    return refdic


def group_by_gene(rows):
    """Group (gene, sequence id, sequence) rows by gene name.

    Every ">" is removed from the gene and the sequence id. Returns a dict that
    maps each gene to its list of (sequence id, sequence) tuples, with genes
    and members in first-seen order.
    """
    grouped = {}
    for gene, seq_id, sequence in rows:
        members = grouped.setdefault(gene.replace(">", ""), [])
        members.append((seq_id.replace(">", ""), sequence))
    return grouped


def write_groups(out, grouped, refdic):
    """Write each gene group to out, preceded by its reference sequence.

    A gene that has no entry in refdic is reported on stdout and not written.
    """
    for gene, members in grouped.items():
        if gene not in refdic:
            print(gene + " not in reference, skipping " + gene)
            continue
        out.write(">>" + gene + "\n")
        out.write(refdic[gene] + "\n")
        for seq_id, sequence in members:
            out.write(">" + seq_id + "\n")
            out.write(sequence + "\n")


def sheet_has_genes(sheet):
    """Return True when the cell in row index 1 of the gene column contains "IGHV".

    Sheets too small to have that cell count as not having genes, instead of
    raising IndexError.
    """
    if sheet.nrows < 2 or sheet.ncols <= gene_column:
        return False
    return sheet.cell(1, gene_column).value.find("IGHV") >= 0


def main(argv=None):
    """Read the reference and the workbook, then append the grouped output.

    argv: argument list to parse; None means the process command line.
    """
    args = build_parser().parse_args(argv)

    with open(args.ref, 'r') as ref:
        refdic = parse_reference(ref)

    # open_workbook's second positional parameter is the log file, not a file
    # mode, so only the path is passed. The output is opened in append mode, so
    # an existing output file is extended rather than overwritten.
    with xlrd.open_workbook(args.input) as wb, open(args.output, 'a') as o:
        for sheet in wb.sheets():
            if not sheet_has_genes(sheet):
                print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
                continue
            o.write(">>>" + sheet.name + "\n")
            # Rows are read from index 1 onwards; row index 0 is never read.
            rows = (
                (
                    sheet.cell(rowindex, gene_column).value,
                    sheet.cell(rowindex, id_column).value,
                    sheet.cell(rowindex, seq_column).value,
                )
                for rowindex in range(1, sheet.nrows)
            )
            write_groups(o, group_by_gene(rows), refdic)


if __name__ == "__main__":
    main()