view baseline/script_xlsx.py @ 83:729738462297 draft

"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
author rhpvorderman
date Wed, 15 Sep 2021 12:24:06 +0000
parents c33d93683a09
children
line wrap: on
line source

import xlrd
import argparse

# Command-line interface: the Excel workbook to read, the reference file used
# to look genes up, and the path the grouped sequences are written to.
# NOTE(review): the --input help text names column J for the sequence, while
# the script reads column index 8 (column I) -- confirm which one is intended.
parser = argparse.ArgumentParser()
parser.add_argument(
	"--input",
	help=(
		"Excel input file containing one or more sheets where column G has "
		"the gene annotation, H has the sequence id and J has the sequence"
	),
)
parser.add_argument("--ref", help="Reference file")
parser.add_argument("--output", help="Output file")

# Parsed once at import time; the rest of the script reads args.input,
# args.ref and args.output.
args = parser.parse_args()

# Zero-based column indices of the fields read from every sheet.
gene_column = 6  # column G: gene annotation
id_column = 7  # column H: sequence id
seq_column = 8  # column I: sequence
# Column letters indexed by zero-based column number (used in messages).
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


# Load the reference file into refdic: each header line (a line starting with
# ">") becomes a key with the ">" removed, mapped to the concatenation of the
# right-stripped lines that follow it up to the next header.
refdic = dict()
with open(args.ref, 'r') as ref:
	currentSeq = ""
	currentId = ""
	# Iterate the file object directly rather than loading every line with
	# readlines() first.
	for line in ref:
		# Strings are compared by value here: "is" tests object identity and
		# only appeared to work because CPython caches short strings.
		if line.startswith(">"):
			# A new header closes the previous record; a record that lacks
			# either a header or any sequence text is not stored.
			if currentSeq != "" and currentId != "":
				refdic[currentId[1:]] = currentSeq
			currentId = line.rstrip()
			currentSeq = ""
		else:
			currentSeq += line.rstrip()
	# Store the last record. Skipped when no header was ever seen, so an
	# empty or header-less file no longer adds a bogus "" key.
	if currentId != "":
		refdic[currentId[1:]] = currentSeq
	
# Group the rows of every sheet by gene and write them, together with the
# matching reference sequence from refdic, to the output file as:
#   >>>sheet name
#   >>gene          followed by its reference sequence
#   >sequence id    followed by the sequence
# Only the filename is passed to open_workbook: its second positional
# parameter is the log file object, not an open mode, so the former 'r'
# argument was being installed as the log destination.
# NOTE(review): xlrd 2.0 and later only read .xls files; confirm the pinned
# xlrd version if .xlsx input is expected.
with xlrd.open_workbook(args.input) as wb:
	# The output is opened in append mode, so existing content is kept.
	with open(args.output, 'a') as o:
		for sheet in wb.sheets():
			# The second row (index 1) decides whether the sheet holds genes.
			# Sheets too small to have that cell are skipped instead of
			# raising IndexError; str() keeps non-text cells from raising.
			has_genes = (
				sheet.nrows > 1
				and sheet.ncols > gene_column
				and "IGHV" in str(sheet.cell(1, gene_column).value)
			)
			if not has_genes:
				print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
				continue
			o.write(">>>" + sheet.name + "\n")

			# gene -> list of (sequence id, sequence), in row order. Row 0 is
			# not read; any ">" is removed from gene names and sequence ids.
			outputdic = dict()
			for rowindex in range(1, sheet.nrows):
				gene = sheet.cell(rowindex, gene_column).value.replace(">", "")
				seq_id = sheet.cell(rowindex, id_column).value.replace(">", "")
				sequence = sheet.cell(rowindex, seq_column).value
				outputdic.setdefault(gene, []).append((seq_id, sequence))

			# Genes are written in the order they first appear in the sheet.
			for gene, entries in outputdic.items():
				if gene not in refdic:
					print(gene + " not in reference, skipping " + gene)
					continue
				o.write(">>" + gene + "\n")
				o.write(refdic[gene] + "\n")
				for seq_id, sequence in entries:
					o.write(">" + seq_id + "\n")
					o.write(sequence + "\n")