annotate baseline/script_imgt.py @ 59:1ad34508be98 draft

Uploaded
author davidvanzessen
date Tue, 18 Jul 2017 05:03:55 -0400
parents 4c5ba6b5d10d
children 8728284105ee
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
1 #import xlrd #avoid dep
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
2 import argparse
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
3 import re
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
4
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
# Command-line interface.
# Every option is marked required because the script dereferences all four
# unconditionally (open(args.ref), open(args.input), open(args.output) and
# ">>>" + args.id); a missing option previously surfaced as a TypeError
# instead of a usage message.
parser = argparse.ArgumentParser()
# The input is read as plain text and split on tabs; the first line is skipped
# as a header and the first three columns are used.
parser.add_argument("--input", required=True, help="Tab-separated input file with a header line, where column 1 has the sequence id, column 2 has the gene annotation and column 3 has the sequence")
parser.add_argument("--ref", required=True, help="Reference file")
parser.add_argument("--output", required=True, help="Output file")
parser.add_argument("--id", required=True, help="ID to be used at the '>>>' line in the output")

args = parser.parse_args()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
12
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
def _read_fasta(handle):
    """Parse FASTA-style lines into a dict mapping header -> sequence.

    Args:
        handle: iterable of text lines (for example an open file).

    Returns:
        dict whose keys are the header lines without the leading ">" and
        without trailing whitespace, and whose values are the record's
        sequence lines concatenated (each stripped of trailing whitespace).
        Records with an empty sequence are not stored, and sequence lines
        that appear before the first header are ignored. If a header occurs
        more than once, the last non-empty record wins.
    """
    records = dict()
    header = None
    sequence = ""
    for line in handle:
        line = line.rstrip()
        # Compare by value; the identity test (`is ">"`) only worked because
        # CPython happens to intern one-character strings.
        if line.startswith(">"):
            if header is not None and sequence:
                records[header] = sequence
            header = line[1:]
            sequence = ""
        elif header is not None:
            sequence += line
    # Flush the final record under the same rule as every other record, so an
    # empty file or an empty trailing record does not create a bogus entry.
    if header is not None and sequence:
        records[header] = sequence
    return records


# Plain 'r' instead of 'rU': the "U" flag was removed in Python 3.11, and
# Python 3 text mode already translates newlines. rstrip() in the parser also
# removes a trailing "\r" from CRLF files.
# NOTE(review): under Python 2, files using bare "\r" line endings are no
# longer split into lines by this mode - confirm none are in use.
with open(args.ref, 'r') as ref:
    refdic = _read_fasta(ref)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
26
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
27
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
# Compiled alternation of the V-gene name patterns.
# Only the IGHV pattern is active. The patterns below are kept for reference
# and are disabled:
#   r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)"
#   r"(IGKV[0-3]D?-[0-9]{1,2})"
#   r"(IGLV[0-9]-[0-9]{1,2})"
#   r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)"
#   r"(TRGV[234589])"
#   r"(TRDV[1-3])"
vPattern = re.compile(
    "|".join(
        (
            r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})",
        )
    )
)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
38
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
def filterGene(s, pattern):
    """Return the second space-separated field of an annotation string.

    Presumably *s* is an IMGT gene annotation such as
    "Homsap IGHV3-23*01 F", from which the gene/allele name is wanted -
    confirm against the actual input files.

    Args:
        s: annotation string.
        pattern: unused; kept so existing callers do not need to change.

    Returns:
        The text between the first and second space of *s*. If there is no
        second space, everything after the first space is returned; if there
        is no space at all, *s* is returned unchanged. An empty *s* yields "".
    """
    # str.find returns -1 when there is no space, so "+ 1" makes the slice
    # start at 0 and the whole string is kept.
    rest = s[s.find(" ") + 1:]
    # partition() avoids the old `rest[:rest.find(" ")]` pitfall: with no
    # further space, find() returned -1 and the slice dropped the last
    # character of the gene name.
    return rest.partition(" ")[0]
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
50
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
51
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
# Group the input sequences by gene and append them to the output file, each
# group preceded by its reference sequence.
#
# Output layout:
#   >>>ID
#   >>gene
#   reference sequence
#   >sequence id
#   sequence
#   ...
with open(args.input, 'r') as i:
    # Append mode: existing content of the output file is kept.
    with open(args.output, 'a') as o:
        o.write(">>>" + args.id + "\n")
        outputdic = dict()
        # The first line is a header; skip it (no-op on an empty file).
        next(i, None)
        for line in i:
            linesplt = line.split("\t")
            # Rows with fewer than three columns (e.g. a blank trailing line)
            # cannot be used and previously raised IndexError.
            if len(linesplt) < 3:
                continue
            ref = filterGene(linesplt[1], vPattern)
            # Skip rows without a gene or without a sequence.
            if not ref or not linesplt[2].rstrip():
                continue
            entry = (linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())
            outputdic.setdefault(ref, []).append(entry)

        for k in outputdic:
            if k in refdic:
                o.write(">>" + k + "\n")
                o.write(refdic[k] + "\n")
                for seq in outputdic[k]:
                    o.write(">" + seq[0] + "\n")
                    o.write(seq[1] + "\n")
            else:
                # Single-argument call form works on both Python 2 and 3.
                print(k + " not in reference, skipping " + k)