annotate imgt_locus_split.py @ 1:418b7dbc8947 draft

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 08:54:02 -0400
parents b00c257f0a67
children 4bb8f6523130
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
1 import argparse
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
2 import logging
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
3 import os
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
4 import re
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
5 import shutil
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
6 import sys
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
7 import tarfile
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
8 import tempfile
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
9 import zipfile
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
10
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
11 import magic
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
12
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
# Selects the files inside an IMGT archive that get filtered: names that start
# with one or more digits, an underscore, and then any character other than "P".
# NOTE(review): presumably the "[^P]" is there to skip the numbered parameters
# file, which has no per-sequence rows -- confirm against a real IMGT archive.
# Raw string so that "\d" reaches the regex engine instead of being treated as
# an (invalid) Python string escape.
imgt_file_regex = re.compile(r"^\d+_[^P]")
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
14
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
15
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
def sniff_imgt_type(input_file):
    """Return the first word of the libmagic description of ``input_file``.

    The full description is written to the debug log; only the text before
    the first space is returned, which is what the caller compares against
    (for example "Zip" or "XZ").
    """
    detector = magic.Magic()
    description = detector.from_file(input_file)
    logging.debug("File type of {0} is {1}".format(input_file, description))
    leading_word = description.split(" ", 1)[0]
    return leading_word
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
21
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
22
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
def unpack_imgt_zip(input_file, output_dir):
    """Extract an IMGT archive into ``output_dir`` and flatten a wrapper directory.

    The archive type is detected with ``sniff_imgt_type``: "Zip" archives are
    opened with ``zipfile`` and "XZ" archives with ``tarfile``. If, after
    extraction, ``output_dir`` contains exactly one entry and that entry is a
    directory, its contents are moved up into ``output_dir`` and the now
    empty directory is removed.

    Args:
        input_file: Path of the archive to extract.
        output_dir: Existing directory that receives the extracted files.

    Raises:
        IOError: If the detected type is neither "Zip" nor "XZ".
    """
    imgt_type = sniff_imgt_type(input_file)
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # a crafted archive could write outside output_dir. Consider validating
    # member names if the input can come from an untrusted source.
    if imgt_type == "Zip":
        with zipfile.ZipFile(input_file) as inf:
            inf.extractall(output_dir)
    elif imgt_type == "XZ":
        with tarfile.open(input_file) as inf:
            inf.extractall(output_dir)
    else:
        raise IOError("Unsupported file type: {0}".format(imgt_type))
    logging.debug("Extracted {0} to {1}".format(input_file, output_dir))

    entries = os.listdir(output_dir)
    if len(entries) != 1:
        return
    wrapper_dir = os.path.join(output_dir, entries[0])
    if not os.path.isdir(wrapper_dir):
        return

    # Everything was packed inside a single top-level directory: lift the
    # contents one level up so the files sit directly in output_dir.
    logging.info("{0} is an older IMGT zip, removing extra dir".format(input_file))
    for name in os.listdir(wrapper_dir):
        shutil.move(os.path.join(wrapper_dir, name), os.path.join(output_dir, name))
    shutil.rmtree(wrapper_dir)
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
45
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
46
1
418b7dbc8947 Uploaded
davidvanzessen
parents: 0
diff changeset
def filter_imgt_file(old_file, new_file, column, fltr):
    """Copy the rows of a tab-separated file whose ``column`` contains ``fltr``.

    The first line of ``old_file`` is treated as the header: it is always
    written to ``new_file`` and is used to locate ``column``. Every following
    line is counted, and is written to ``new_file`` only when it has a value
    in that column and the value contains ``fltr`` as a substring.

    Args:
        old_file: Path of the tab-separated input file.
        new_file: Path of the output file (overwritten).
        column: Exact header name of the column to test.
        fltr: Substring that must occur in the column value.

    Returns:
        A ``(total, remain)`` tuple: the number of data rows read and the
        number of data rows written. An empty input file yields ``(0, 0)``.

    Raises:
        ValueError: If ``column`` is not present in the header.
    """
    logging.debug("Filtering {0} with {1}".format(old_file, fltr))
    total = 0
    remain = 0
    with open(old_file, 'r') as of, open(new_file, 'w') as nf:
        header = next(of, None)
        if header is None:
            return total, remain
        # Strip the line terminator before splitting; otherwise the last
        # header cell carries a trailing newline and can never equal `column`.
        column_index = header.rstrip("\r\n").split("\t").index(column)
        nf.write(header)
        for line in of:
            total += 1
            fields = line.rstrip("\r\n").split("\t")
            # Short rows (fewer cells than the header) simply do not match.
            if len(fields) > column_index and fltr in fields[column_index]:
                remain += 1
                nf.write(line)
    return total, remain
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
66
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
67
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
def all_same_in_list(l):
    """Return True when every element of ``l`` equals its first element.

    Empty and single-element sequences are considered uniform.
    """
    for candidate in l[1:]:
        if not (l[0] == candidate):
            return False
    return True
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
70
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
71
1
418b7dbc8947 Uploaded
davidvanzessen
parents: 0
diff changeset
def filter_imgt_dir(imgt_dir, locus):
    """Filter every IMGT result file in ``imgt_dir`` down to one locus, in place.

    Each file whose name matches ``imgt_file_regex`` is filtered with
    ``filter_imgt_file`` on the "V-GENE and allele" column, keeping rows whose
    value contains ``locus``. The filtered output is written to a temporary
    ``tmp.txt`` inside ``imgt_dir`` and then moved over the original file.

    Args:
        imgt_dir: Directory holding the extracted IMGT files.
        locus: Substring to look for in the "V-GENE and allele" column.

    Returns:
        A ``(total, remain)`` tuple taken from the first file processed, or
        ``(0, 0)`` when the directory contains no matching files. A warning is
        logged when the per-file counts are not all identical.
    """
    logging.info("Working on {0}".format(locus))
    imgt_files = [f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f)]
    if not imgt_files:
        # Nothing to filter; avoid indexing into empty result lists below.
        logging.warning("No IMGT files found to filter in {0}".format(imgt_dir))
        return 0, 0

    tmp_file = os.path.join(imgt_dir, "tmp.txt")
    totals = []
    remains = []
    for imgt_file in imgt_files:
        imgt_file = os.path.join(imgt_dir, imgt_file)
        total, remain = filter_imgt_file(imgt_file, tmp_file, "V-GENE and allele", locus)
        totals.append(total)
        remains.append(remain)
        logging.debug("{0} rows, {1} after filtering".format(total, remain))
        # Replace the original with its filtered version.
        shutil.move(tmp_file, imgt_file)
    if not (all_same_in_list(totals) and all_same_in_list(remains)):
        logging.warning("Not all files had the same number of sequences remaining for {0}".format(imgt_dir))
    return totals[0], remains[0]
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
88
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
89
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
def make_new_xz_file(input_dir, output_file):
    """Pack every entry of ``input_dir`` into an xz-compressed tar at ``output_file``.

    Entries are stored under their bare names, without the ``input_dir``
    prefix. The directory is listed before the archive is opened.
    """
    logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
    entry_names = list(os.listdir(input_dir))
    with tarfile.open(output_file, 'w:xz') as archive:
        for entry_name in entry_names:
            logging.debug("Writing {0} to new IMGT zip".format(entry_name))
            entry_path = os.path.join(input_dir, entry_name)
            archive.add(entry_path, arcname=os.path.basename(entry_path))
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
98
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
99
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
def main():
    """Command-line entry point: split one IMGT archive into per-locus archives.

    Parses ``--input`` plus one optional ``--output-<locus>`` argument for each
    of IG, IGH, IGK, IGL, TR, TRA, TRB, TRD and TRG. For every locus whose
    output path was given and whose target directory exists, the input archive
    is unpacked, its files are filtered to that locus, and a new xz-compressed
    tar is written to the requested path.

    Logging goes to ``./log.html`` and to stdout.

    Raises:
        Exception: If no usable output path was supplied for any locus.
    """
    locus_names = ("IG", "IGH", "IGK", "IGL", "TR", "TRA", "TRB", "TRD", "TRG")

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="The input IMGT file", required=True)
    for name in locus_names:
        # The literal string "None" is the "not requested" marker.
        parser.add_argument(
            "--output-{0}".format(name.lower()),
            help="The output file for new IMGT ZIP with just {0} sequences".format(name),
            default="None"
        )

    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s:&emsp;%(message)s <br />",
                        datefmt='%Y/%m/%d %H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Started IMGT locus split")

    args = parser.parse_args()
    input_file = args.input

    loci = {}
    for name in locus_names:
        loci[name] = getattr(args, "output_{0}".format(name.lower()))

    loci_to_filter = {}

    logging.debug("All Parameters:")
    logging.debug("Input: {0}".format(input_file))
    for locus, path in loci.items():
        logging.debug("{0}: {1}".format(locus, path))
        if path == "None":
            continue
        # A bare filename has an empty directory part; treat it as the
        # current directory instead of silently dropping the locus.
        target_dir = os.path.dirname(path) or os.curdir
        if os.path.isdir(target_dir):
            loci_to_filter[locus] = path
        else:
            logging.warning("Output directory for {0} does not exist, skipping: {1}".format(locus, path))

    if not loci_to_filter:
        raise Exception("No locus selected, nothing to do")

    logging.info("Parameters:")
    for locus, path in loci_to_filter.items():
        logging.info("{0}: {1}".format(locus, path))

    work_dir = tempfile.mkdtemp()
    try:
        original_files_dir = os.path.join(work_dir, "original")
        os.mkdir(original_files_dir)

        unpack_imgt_zip(input_file, original_files_dir)

        for locus, path in loci_to_filter.items():
            # Each locus is filtered on its own private copy of the originals.
            locus_dir = os.path.join(work_dir, locus)
            shutil.copytree(original_files_dir, locus_dir)
            total, remain = filter_imgt_dir(locus_dir, locus)
            logging.info("{0}\t{1}\t{2}\t{3}".format(locus, path, total, remain))

            make_new_xz_file(locus_dir, path)
    finally:
        # The scratch directory holds a full copy of the archive per locus;
        # remove it whether or not processing succeeded.
        shutil.rmtree(work_dir, ignore_errors=True)
0
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
171
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
172
b00c257f0a67 Uploaded
davidvanzessen
parents:
diff changeset
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()