Mercurial > repos > davidvanzessen > shm_csr
view baseline/script_xlsx.py @ 87:64bbc6734ec7 draft
"planemo upload commit 0dcc3e4e47a066373ff0bd56f16536298f8ac2a0"
author | rhpvorderman |
---|---|
date | Wed, 27 Oct 2021 11:30:25 +0000 |
parents | 729738462297 |
children |
line wrap: on
line source
import xlrd import argparse parser = argparse.ArgumentParser() parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") parser.add_argument("--ref", help="Reference file") parser.add_argument("--output", help="Output file") args = parser.parse_args() gene_column = 6 id_column = 7 seq_column = 8 LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] refdic = dict() with open(args.ref, 'r') as ref: currentSeq = "" currentId = "" for line in ref.readlines(): if line[0] is ">": if currentSeq is not "" and currentId is not "": refdic[currentId[1:]] = currentSeq currentId = line.rstrip() currentSeq = "" else: currentSeq += line.rstrip() refdic[currentId[1:]] = currentSeq currentSeq = "" currentId = "" with xlrd.open_workbook(args.input, 'r') as wb: with open(args.output, 'a') as o: for sheet in wb.sheets(): if sheet.cell(1,gene_column).value.find("IGHV") < 0: print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name) continue o.write(">>>" + sheet.name + "\n") outputdic = dict() for rowindex in range(1, sheet.nrows): ref = sheet.cell(rowindex, gene_column).value.replace(">", "") if ref in outputdic: outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] else: outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] #print outputdic for k in list(outputdic.keys()): if k in refdic: o.write(">>" + k + "\n") o.write(refdic[k] + "\n") for seq in outputdic[k]: #print seq o.write(">" + seq[0] + "\n") o.write(seq[1] + "\n") else: print(k + " not in reference, skipping " + k)