| 
0
 | 
     1 #import xlrd #avoid dep
 | 
| 
 | 
     2 import argparse
 | 
| 
 | 
     3 import re
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 parser = argparse.ArgumentParser()
 | 
| 
 | 
     6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
 | 
| 
 | 
     7 parser.add_argument("--ref", help="Reference file")
 | 
| 
 | 
     8 parser.add_argument("--output", help="Output file")
 | 
| 
 | 
     9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 args = parser.parse_args()
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 refdic = dict()
 | 
| 
 | 
    14 with open(args.ref, 'r') as ref:
 | 
| 
 | 
    15 	currentSeq = ""
 | 
| 
 | 
    16 	currentId = ""
 | 
| 
 | 
    17 	for line in ref:
 | 
| 
 | 
    18 		if line[0] is ">":
 | 
| 
 | 
    19 			if currentSeq is not "" and currentId is not "":
 | 
| 
 | 
    20 				refdic[currentId[1:]] = currentSeq
 | 
| 
 | 
    21 			currentId = line.rstrip()
 | 
| 
 | 
    22 			currentSeq = ""
 | 
| 
 | 
    23 		else:
 | 
| 
 | 
    24 			currentSeq += line.rstrip()
 | 
| 
 | 
    25 	refdic[currentId[1:]] = currentSeq
 | 
| 
 | 
    26 	
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
 | 
| 
 | 
    29 #						r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
 | 
| 
 | 
    30 #						r"(IGKV[0-3]D?-[0-9]{1,2})",
 | 
| 
 | 
    31 #						r"(IGLV[0-9]-[0-9]{1,2})",
 | 
| 
 | 
    32 #						r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
 | 
| 
 | 
    33 #						r"(TRGV[234589])",
 | 
| 
 | 
    34 #						r"(TRDV[1-3])"]
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 #vPattern = re.compile(r"|".join(vPattern))
 | 
| 
 | 
    37 vPattern = re.compile("|".join(vPattern))
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 def filterGene(s, pattern):
 | 
| 
 | 
    40     if type(s) is not str:
 | 
| 
 | 
    41         return None
 | 
| 
 | 
    42     res = pattern.search(s)
 | 
| 
 | 
    43     if res:
 | 
| 
 | 
    44         return res.group(0)
 | 
| 
 | 
    45     return None
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 
 | 
| 
 | 
    48 
 | 
| 
 | 
    49 currentSeq = ""
 | 
| 
 | 
    50 currentId = ""
 | 
| 
 | 
    51 first=True
 | 
| 
 | 
    52 with open(args.input, 'r') as i:
 | 
| 
 | 
    53 	with open(args.output, 'a') as o:
 | 
| 
 | 
    54 		o.write(">>>" + args.id + "\n")
 | 
| 
 | 
    55 		outputdic = dict()
 | 
| 
 | 
    56 		for line in i:
 | 
| 
 | 
    57 			if first:
 | 
| 
 | 
    58 				first = False
 | 
| 
 | 
    59 				continue
 | 
| 
 | 
    60 			linesplt = line.split("\t")
 | 
| 
 | 
    61 			ref = filterGene(linesplt[1], vPattern)
 | 
| 
 | 
    62 			if not ref or not linesplt[2].rstrip():
 | 
| 
 | 
    63 				continue
 | 
| 
 | 
    64 			if ref in outputdic:
 | 
| 
 | 
    65 				outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
 | 
| 
 | 
    66 			else:
 | 
| 
 | 
    67 				outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
 | 
| 
 | 
    68 		#print outputdic
 | 
| 
 | 
    69 		
 | 
| 
 | 
    70 		for k in outputdic.keys():
 | 
| 
 | 
    71 			if k in refdic:
 | 
| 
 | 
    72 				o.write(">>" + k + "\n")
 | 
| 
 | 
    73 				o.write(refdic[k] + "\n")
 | 
| 
 | 
    74 				for seq in outputdic[k]:
 | 
| 
 | 
    75 					#print seq
 | 
| 
 | 
    76 					o.write(">" + seq[0] + "\n")
 | 
| 
 | 
    77 					o.write(seq[1] + "\n")
 | 
| 
 | 
    78 			else:
 | 
| 
 | 
    79 				print k + " not in reference, skipping " + k
 |