comparison imgt_locus_split.py @ 0:b00c257f0a67 draft

Uploaded
author davidvanzessen
date Thu, 13 Jul 2017 10:24:39 -0400
parents
children 418b7dbc8947
comparison
equal deleted inserted replaced
-1:000000000000 0:b00c257f0a67
1 import argparse
2 import logging
3 import os
4 import re
5 import shutil
6 import sys
7 import tarfile
8 import tempfile
9 import zipfile
10
11 import magic
12
# Selects the files to filter inside an unpacked IMGT archive: the name must
# start with "<digits>_" and the next character must not be "P" (so a name such
# as "11_Parameters.txt" would be skipped). A raw string is used so "\d" is not
# treated as an (invalid) string escape.
imgt_file_regex = re.compile(r"^\d+_[^P]")
14
15
def sniff_imgt_type(input_file):
    """Return the first word of the `magic` description of *input_file*.

    The full description reported by ``magic.Magic().from_file`` is logged at
    debug level; only the text before the first space is returned (callers in
    this file compare it against ``"Zip"`` and ``"XZ"``).
    """
    detector = magic.Magic()
    description = detector.from_file(input_file)
    logging.debug("File type of {0} is {1}".format(input_file, description))
    first_word, _, _ = description.partition(" ")
    return first_word
21
22
def _extract_tar_safely(archive, output_dir):
    """Extract the open tar *archive* into *output_dir* without path escapes."""
    if hasattr(tarfile, "data_filter"):
        # Interpreter supports extraction filters: let tarfile reject
        # absolute paths, "..", and links pointing outside the destination.
        archive.extractall(output_dir, filter="data")
        return
    # Older interpreters: verify every member path stays below output_dir.
    root = os.path.realpath(output_dir)
    for member in archive.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if target != root and not target.startswith(root + os.sep):
            raise IOError("Refusing to extract {0} outside of {1}".format(member.name, output_dir))
    archive.extractall(output_dir)


def unpack_imgt_zip(input_file, output_dir):
    """Unpack an IMGT archive into *output_dir*.

    The archive type is taken from ``sniff_imgt_type``: ``"Zip"`` archives are
    extracted with ``zipfile``, ``"XZ"`` archives with ``tarfile``. If the
    extraction leaves exactly one entry in *output_dir* and that entry is a
    directory, its contents are moved up into *output_dir* and the now-empty
    directory is removed.

    Args:
        input_file: path of the archive to unpack.
        output_dir: existing directory that receives the extracted files.

    Raises:
        IOError: if the sniffed type is neither ``"Zip"`` nor ``"XZ"``, or if
            a tar member would be written outside *output_dir* (on
            interpreters with extraction filters, ``tarfile`` raises its own
            filter error instead).
    """
    imgt_type = sniff_imgt_type(input_file)
    if imgt_type == "Zip":
        with zipfile.ZipFile(input_file) as inf:
            inf.extractall(output_dir)
    elif imgt_type == "XZ":
        # tarfile.open() auto-detects the compression of the tarball.
        with tarfile.open(input_file) as inf:
            _extract_tar_safely(inf, output_dir)
    else:
        raise IOError("Unsupported file type: {0}".format(imgt_type))
    logging.debug("Extracted {0} to {1}".format(input_file, output_dir))

    entries = os.listdir(output_dir)
    if len(entries) != 1:
        return
    wrapper_dir = os.path.join(output_dir, entries[0])
    if not os.path.isdir(wrapper_dir):
        return

    # A single top-level directory wraps the real files: hoist them one level.
    logging.info("{0} is an older IMGT zip, removing extra dir".format(input_file))
    for name in os.listdir(wrapper_dir):
        shutil.move(os.path.join(wrapper_dir, name), os.path.join(output_dir, name))
    shutil.rmtree(wrapper_dir)
45
46
def filter_tabular_file(old_file, new_file, column, regex):
    """Copy the rows of a tab-separated file whose *column* matches *regex*.

    The first line of *old_file* is treated as the header; it is always copied
    to *new_file* and is used to locate *column*. Every following line is
    counted, and is copied only if it has a value in that column for which
    ``regex.search`` succeeds. Rows too short to contain the column are
    counted but dropped.

    Args:
        old_file: path of the tab-separated input file.
        new_file: path of the file to write (overwritten).
        column: exact header name of the column to test.
        regex: compiled regular expression applied with ``search``.

    Returns:
        A ``(total, remain)`` tuple: number of data rows read and number of
        data rows written. An empty input file yields ``(0, 0)``.

    Raises:
        ValueError: if *column* is not present in the header line.
    """
    logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern))
    total = 0
    remain = 0
    with open(old_file, 'r') as of, open(new_file, 'w') as nf:
        header = next(of, None)
        if header is None:
            return total, remain
        # Strip the line ending before splitting so the last header name is
        # not "name\n" and can be found when it is the requested column.
        column_index = header.rstrip("\r\n").split("\t").index(column)
        nf.write(header)
        for line in of:
            total += 1
            fields = line.rstrip("\r\n").split("\t")
            # Strictly greater: a row with exactly column_index fields has no
            # value at that index.
            if len(fields) > column_index and regex.search(fields[column_index]):
                remain += 1
                nf.write(line)
    return total, remain
66
67
def all_same_in_list(l):
    """Return True when every element of *l* equals its first element.

    Sequences with zero or one element are considered uniform.
    """
    for candidate in l[1:]:
        if not (l[0] == candidate):
            return False
    return True
70
71
def filter_imgt_dir(imgt_dir, loci):
    """Filter every IMGT table in *imgt_dir* in place, keeping rows for *loci*.

    Each file whose name matches ``imgt_file_regex`` is passed through
    ``filter_tabular_file`` on the "V-GENE and allele" column and then
    replaced by the filtered result. The entries of *loci* are joined with
    ``|`` into one regular expression without escaping.

    Args:
        imgt_dir: directory holding the unpacked IMGT files.
        loci: iterable of strings used as regex alternatives.

    Returns:
        ``(total, remain)`` row counts of the first file processed, or
        ``(0, 0)`` when no file in *imgt_dir* matches ``imgt_file_regex``.
    """
    logging.info("Filtering {0} with {1}".format(imgt_dir, loci))
    loci_regex = re.compile("|".join(loci))
    tmp_file = os.path.join(imgt_dir, "tmp.txt")
    totals = []
    remains = []
    for name in os.listdir(imgt_dir):
        if not imgt_file_regex.match(name):
            continue
        imgt_file = os.path.join(imgt_dir, name)
        total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex)
        totals.append(total)
        remains.append(remain)
        logging.debug("{0} rows, {1} after filtering".format(total, remain))
        # Replace the original table with its filtered copy.
        shutil.move(tmp_file, imgt_file)

    if not totals:
        # Nothing matched imgt_file_regex; avoid an IndexError on totals[0].
        logging.warning("No IMGT files found in {0}, nothing to filter".format(imgt_dir))
        return 0, 0

    if not (all_same_in_list(totals) and all_same_in_list(remains)):
        logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains))
    return totals[0], remains[0]
89
90
def make_new_xz_file(input_dir, output_file):
    """Pack every entry of *input_dir* into an xz-compressed tar archive.

    Entries are stored under their bare names (no directory prefix) and are
    added in sorted order so the archive layout does not depend on the
    arbitrary order returned by ``os.listdir``.

    Args:
        input_dir: directory whose entries are archived.
        output_file: path of the ``.tar.xz`` archive to create (overwritten).
    """
    logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
    with tarfile.open(output_file, 'w:xz') as out:
        for name in sorted(os.listdir(input_dir)):
            logging.debug("Writing {0} to new IMGT zip".format(name))
            out.add(os.path.join(input_dir, name), arcname=name)
99
100
def main():
    """Command-line entry point: filter an IMGT archive down to selected loci.

    Parses ``--input``, ``--loci`` (comma separated) and ``--output``, unpacks
    the input archive into a temporary work directory, filters the IMGT
    tables in place, writes the result as a new xz tarball to ``--output`` and
    finally removes the work directory.

    Raises:
        Exception: if ``--loci`` contains no non-blank entry.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="The input IMGT file", required=True)
    parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True)
    parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True)

    # The log file is written as HTML fragments; messages are also echoed to stdout.
    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s:&emsp;%(message)s <br />",
                        datefmt='%Y/%m/%d %H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Started IMGT locus split")

    args = parser.parse_args()
    input_file = args.input
    # str.split(",") never returns an empty list, and a blank entry would turn
    # into an empty regex alternative that matches every row, so drop blanks.
    loci = [locus.strip() for locus in args.loci.split(",") if locus.strip()]
    output_file = args.output

    logging.debug("All Parameters:")
    logging.debug("Input: {0}".format(input_file))
    logging.debug("Loci: {0}".format(loci))
    logging.debug("Output: {0}".format(output_file))

    if not loci:
        raise Exception("No locus selected, nothing to do")

    work_dir = tempfile.mkdtemp()
    try:
        original_files_dir = os.path.join(work_dir, "original")
        os.mkdir(original_files_dir)

        unpack_imgt_zip(input_file, original_files_dir)

        total, remain = filter_imgt_dir(original_files_dir, loci)
        logging.info("{0}\t{1}".format(total, remain))

        make_new_xz_file(original_files_dir, output_file)
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the scratch space
        # whether or not processing succeeded.
        shutil.rmtree(work_dir, ignore_errors=True)


if __name__ == "__main__":
    main()