67
|
1 #import xlrd #avoid dep
|
|
2 import argparse
|
|
3 import re
|
|
4
|
|
5 parser = argparse.ArgumentParser()
|
|
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
|
|
7 parser.add_argument("--ref", help="Reference file")
|
|
8 parser.add_argument("--output", help="Output file")
|
|
9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
|
|
10
|
|
11 args = parser.parse_args()
|
|
12
|
|
13 print "script_imgt.py"
|
|
14 print "input:", args.input
|
|
15 print "ref:", args.ref
|
|
16 print "output:", args.output
|
|
17 print "id:", args.id
|
|
18
|
|
19 refdic = dict()
|
|
20 with open(args.ref, 'rU') as ref:
|
|
21 currentSeq = ""
|
|
22 currentId = ""
|
|
23 for line in ref:
|
|
24 if line.startswith(">"):
|
|
25 if currentSeq is not "" and currentId is not "":
|
|
26 refdic[currentId[1:]] = currentSeq
|
|
27 currentId = line.rstrip()
|
|
28 currentSeq = ""
|
|
29 else:
|
|
30 currentSeq += line.rstrip()
|
|
31 refdic[currentId[1:]] = currentSeq
|
|
32
|
|
33 print "Have", str(len(refdic)), "reference sequences"
|
|
34
|
|
35 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
|
|
36 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
|
|
37 # r"(IGKV[0-3]D?-[0-9]{1,2})",
|
|
38 # r"(IGLV[0-9]-[0-9]{1,2})",
|
|
39 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
|
|
40 # r"(TRGV[234589])",
|
|
41 # r"(TRDV[1-3])"]
|
|
42
|
|
43 #vPattern = re.compile(r"|".join(vPattern))
|
|
44 vPattern = re.compile("|".join(vPattern))
|
|
45
|
|
46 def filterGene(s, pattern):
|
|
47 if type(s) is not str:
|
|
48 return None
|
|
49 res = pattern.search(s)
|
|
50 if res:
|
|
51 return res.group(0)
|
|
52 return None
|
|
53
|
|
54
|
|
55
|
|
56 currentSeq = ""
|
|
57 currentId = ""
|
|
58 first=True
|
|
59 with open(args.input, 'r') as i:
|
|
60 with open(args.output, 'a') as o:
|
|
61 o.write(">>>" + args.id + "\n")
|
|
62 outputdic = dict()
|
|
63 for line in i:
|
|
64 if first:
|
|
65 first = False
|
|
66 continue
|
|
67 linesplt = line.split("\t")
|
|
68 ref = filterGene(linesplt[1], vPattern)
|
|
69 if not ref or not linesplt[2].rstrip():
|
|
70 continue
|
|
71 if ref in outputdic:
|
|
72 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
|
|
73 else:
|
|
74 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
|
|
75 #print outputdic
|
|
76
|
|
77 for k in outputdic.keys():
|
|
78 if k in refdic:
|
|
79 o.write(">>" + k + "\n")
|
|
80 o.write(refdic[k] + "\n")
|
|
81 for seq in outputdic[k]:
|
|
82 #print seq
|
|
83 o.write(">" + seq[0] + "\n")
|
|
84 o.write(seq[1] + "\n")
|
|
85 else:
|
|
86 print k + " not in reference, skipping " + k
|