view baseline/script_imgt.py @ 90:6809c63d9161 draft

"planemo upload commit fd64827ff6e63df008f6f50ddb8576ad2b1dbb26"
author rhpvorderman
date Tue, 25 Jan 2022 11:28:29 +0000
parents 729738462297
children
line wrap: on
line source

#import xlrd #avoid dep
import argparse
import re

parser = argparse.ArgumentParser()
parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
parser.add_argument("--ref", help="Reference file")
parser.add_argument("--output", help="Output file")
parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")

args = parser.parse_args()

print("script_imgt.py")
print("input:", args.input)
print("ref:", args.ref)
print("output:", args.output)
print("id:", args.id)

refdic = dict()
with open(args.ref, 'rU') as ref:
	currentSeq = ""
	currentId = ""
	for line in ref:
		if line.startswith(">"):
			if currentSeq is not "" and currentId is not "":
				refdic[currentId[1:]] = currentSeq
			currentId = line.rstrip()
			currentSeq = ""
		else:
			currentSeq += line.rstrip()
	refdic[currentId[1:]] = currentSeq

print("Have", str(len(refdic)), "reference sequences")

vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
#						r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
#						r"(IGKV[0-3]D?-[0-9]{1,2})",
#						r"(IGLV[0-9]-[0-9]{1,2})",
#						r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
#						r"(TRGV[234589])",
#						r"(TRDV[1-3])"]

#vPattern = re.compile(r"|".join(vPattern))
vPattern = re.compile("|".join(vPattern))

def filterGene(s, pattern):
    if type(s) is not str:
        return None
    res = pattern.search(s)
    if res:
        return res.group(0)
    return None



currentSeq = ""
currentId = ""
first=True
with open(args.input, 'r') as i:
	with open(args.output, 'a') as o:
		o.write(">>>" + args.id + "\n")
		outputdic = dict()
		for line in i:
			if first:
				first = False
				continue
			linesplt = line.split("\t")
			ref = filterGene(linesplt[1], vPattern)
			if not ref or not linesplt[2].rstrip():
				continue
			if ref in outputdic:
				outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
			else:
				outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
		#print outputdic
		
		for k in list(outputdic.keys()):
			if k in refdic:
				o.write(">>" + k + "\n")
				o.write(refdic[k] + "\n")
				for seq in outputdic[k]:
					#print seq
					o.write(">" + seq[0] + "\n")
					o.write(seq[1] + "\n")
			else:
				print(k + " not in reference, skipping " + k)