comparison baseline/script_imgt.py @ 81:b6f9a640e098 draft

Uploaded
author davidvanzessen
date Fri, 19 Feb 2021 15:10:54 +0000
parents
children 729738462297
comparison
equal deleted inserted replaced
80:a4617f1d1d89 81:b6f9a640e098
1 #import xlrd #avoid dep
2 import argparse
3 import re
4
5 parser = argparse.ArgumentParser()
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
7 parser.add_argument("--ref", help="Reference file")
8 parser.add_argument("--output", help="Output file")
9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
10
11 args = parser.parse_args()
12
13 print "script_imgt.py"
14 print "input:", args.input
15 print "ref:", args.ref
16 print "output:", args.output
17 print "id:", args.id
18
19 refdic = dict()
20 with open(args.ref, 'rU') as ref:
21 currentSeq = ""
22 currentId = ""
23 for line in ref:
24 if line.startswith(">"):
25 if currentSeq is not "" and currentId is not "":
26 refdic[currentId[1:]] = currentSeq
27 currentId = line.rstrip()
28 currentSeq = ""
29 else:
30 currentSeq += line.rstrip()
31 refdic[currentId[1:]] = currentSeq
32
33 print "Have", str(len(refdic)), "reference sequences"
34
35 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
36 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
37 # r"(IGKV[0-3]D?-[0-9]{1,2})",
38 # r"(IGLV[0-9]-[0-9]{1,2})",
39 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
40 # r"(TRGV[234589])",
41 # r"(TRDV[1-3])"]
42
43 #vPattern = re.compile(r"|".join(vPattern))
44 vPattern = re.compile("|".join(vPattern))
45
46 def filterGene(s, pattern):
47 if type(s) is not str:
48 return None
49 res = pattern.search(s)
50 if res:
51 return res.group(0)
52 return None
53
54
55
56 currentSeq = ""
57 currentId = ""
58 first=True
59 with open(args.input, 'r') as i:
60 with open(args.output, 'a') as o:
61 o.write(">>>" + args.id + "\n")
62 outputdic = dict()
63 for line in i:
64 if first:
65 first = False
66 continue
67 linesplt = line.split("\t")
68 ref = filterGene(linesplt[1], vPattern)
69 if not ref or not linesplt[2].rstrip():
70 continue
71 if ref in outputdic:
72 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
73 else:
74 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
75 #print outputdic
76
77 for k in outputdic.keys():
78 if k in refdic:
79 o.write(">>" + k + "\n")
80 o.write(refdic[k] + "\n")
81 for seq in outputdic[k]:
82 #print seq
83 o.write(">" + seq[0] + "\n")
84 o.write(seq[1] + "\n")
85 else:
86 print k + " not in reference, skipping " + k