Mercurial > repos > davidvanzessen > imgt_locus_split
comparison imgt_locus_split.py @ 0:b00c257f0a67 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Thu, 13 Jul 2017 10:24:39 -0400 |
| parents | |
| children | 418b7dbc8947 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b00c257f0a67 |
|---|---|
| 1 import argparse | |
| 2 import logging | |
| 3 import os | |
| 4 import re | |
| 5 import shutil | |
| 6 import sys | |
| 7 import tarfile | |
| 8 import tempfile | |
| 9 import zipfile | |
| 10 | |
| 11 import magic | |
| 12 | |
# Selects the files to filter by name: one or more leading digits, an
# underscore, then any character other than "P".
# NOTE(review): the "[^P]" presumably skips the IMGT "Parameters" file, which
# has no per-sequence rows -- confirm against a real IMGT archive.
# Raw string so that "\d" is a regex class rather than an invalid string escape.
imgt_file_regex = re.compile(r"^\d+_[^P]")
| 14 | |
| 15 | |
def sniff_imgt_type(input_file):
    """Return the first word of the file-type description for ``input_file``.

    The description comes from ``magic.Magic().from_file``; everything after
    the first space is discarded, so callers compare against short tags such
    as "Zip" or "XZ".

    Args:
        input_file: Path of the file to inspect.

    Returns:
        The text of the description up to (not including) the first space.
    """
    description = magic.Magic().from_file(input_file)
    logging.debug("File type of {0} is {1}".format(input_file, description))
    leading_word, _, _ = description.partition(" ")
    return leading_word
| 21 | |
| 22 | |
def unpack_imgt_zip(input_file, output_dir):
    """Extract an IMGT archive into ``output_dir`` and flatten a lone wrapper dir.

    The archive type is taken from ``sniff_imgt_type``: "Zip" is opened with
    ``zipfile`` and "XZ" with ``tarfile``. If, after extraction, ``output_dir``
    holds exactly one entry and that entry is a directory, its contents are
    moved up into ``output_dir`` and the directory is removed.

    Args:
        input_file: Path of the archive to extract.
        output_dir: Existing directory that receives the extracted files.

    Raises:
        IOError: If the sniffed type is neither "Zip" nor "XZ".
    """
    imgt_type = sniff_imgt_type(input_file)
    if imgt_type == "Zip":
        with zipfile.ZipFile(input_file) as archive:
            archive.extractall(output_dir)
    elif imgt_type == "XZ":
        with tarfile.open(input_file) as archive:
            # The archive comes from outside this script. Where the running
            # interpreter supports extraction filters, use the "data" filter
            # so members cannot be written outside output_dir.
            if hasattr(tarfile, "data_filter"):
                archive.extractall(output_dir, filter="data")
            else:
                archive.extractall(output_dir)
    else:
        raise IOError("Unsupported file type: {0}".format(imgt_type))
    logging.debug("Extracted {0} to {1}".format(input_file, output_dir))

    entries = os.listdir(output_dir)
    if len(entries) != 1:
        return
    wrapper_dir = os.path.join(output_dir, entries[0])
    if not os.path.isdir(wrapper_dir):
        return

    # A single top-level directory: lift its contents into output_dir so the
    # files sit where the rest of the script looks for them.
    logging.info("{0} is an older IMGT zip, removing extra dir".format(input_file))
    for name in os.listdir(wrapper_dir):
        shutil.move(os.path.join(wrapper_dir, name), os.path.join(output_dir, name))
    shutil.rmtree(wrapper_dir)
| 45 | |
| 46 | |
def filter_tabular_file(old_file, new_file, column, regex):
    """Copy the header and the matching rows of a tab-separated file.

    The first line of ``old_file`` is treated as the header and is always
    written to ``new_file``. Every later line is written only if its value in
    ``column`` matches ``regex`` (``regex.search``). Rows too short to contain
    that column are counted but dropped.

    Args:
        old_file: Path of the tab-separated input file.
        new_file: Path of the file to write (overwritten).
        column: Header name of the column to test.
        regex: Compiled regular expression applied to the column value.

    Returns:
        A ``(total, remain)`` tuple: the number of data rows read and the
        number of data rows written.

    Raises:
        ValueError: If ``column`` is not present in the header line.
    """
    logging.debug("Filtering {0} with {1}".format(old_file, regex.pattern))
    total = 0
    remain = 0
    with open(old_file, 'r') as source, open(new_file, 'w') as target:
        column_index = None  # set once the header line has been read
        for line in source:
            # Drop the line terminator before splitting so the last field
            # (and a header whose last column is the wanted one) compares
            # cleanly; the untouched line is what gets written out.
            fields = line.rstrip("\n").split("\t")
            if column_index is None:
                try:
                    column_index = fields.index(column)
                except ValueError:
                    raise ValueError(
                        "Column {0!r} not found in header of {1}".format(column, old_file)
                    )
                target.write(line)
                continue
            total += 1
            # Strictly greater: a row with exactly column_index fields has no
            # element at that index.
            if len(fields) > column_index and regex.search(fields[column_index]):
                remain += 1
                target.write(line)
    return total, remain
| 66 | |
| 67 | |
def all_same_in_list(l):
    """Return True when every element after the first equals the first.

    Sequences with zero or one element yield True, because there is nothing
    to compare against the first element.
    """
    for candidate in l[1:]:
        if not (l[0] == candidate):
            return False
    return True
| 70 | |
| 71 | |
def filter_imgt_dir(imgt_dir, loci):
    """Filter every IMGT table in ``imgt_dir`` in place, keeping the given loci.

    Files whose names match ``imgt_file_regex`` are filtered one by one with
    ``filter_tabular_file`` on the "V-GENE and allele" column; each filtered
    copy is written to ``tmp.txt`` inside ``imgt_dir`` and then moved over the
    original file.

    Args:
        imgt_dir: Directory holding the extracted IMGT files.
        loci: Strings that are joined with "|" into one regular expression;
            a row is kept when that expression matches its column value.

    Returns:
        A ``(total, remain)`` tuple with the row counts of the first file
        processed (in sorted name order), or ``(0, 0)`` when the directory
        holds no matching file.
    """
    logging.info("Filtering {0} with {1}".format(imgt_dir, loci))
    # Sorted so that "the first file" below does not depend on listdir order.
    imgt_files = sorted(f for f in os.listdir(imgt_dir) if imgt_file_regex.match(f))
    if not imgt_files:
        # Nothing to filter; report zero rows rather than failing on totals[0].
        logging.warning("No IMGT files found in {0}, nothing to filter".format(imgt_dir))
        return 0, 0

    tmp_file = os.path.join(imgt_dir, "tmp.txt")
    loci_regex = re.compile("|".join(loci))
    totals = []
    remains = []
    for name in imgt_files:
        imgt_file = os.path.join(imgt_dir, name)
        total, remain = filter_tabular_file(imgt_file, tmp_file, "V-GENE and allele", loci_regex)
        totals.append(total)
        remains.append(remain)
        logging.debug("{0} rows, {1} after filtering".format(total, remain))
        shutil.move(tmp_file, imgt_file)
    if not (all_same_in_list(totals) and all_same_in_list(remains)):
        logging.warning("Not all files had the same number of sequences remaining for {0}: {1}".format(imgt_dir, remains))
    return totals[0], remains[0]
| 89 | |
| 90 | |
def make_new_xz_file(input_dir, output_file):
    """Pack every entry of ``input_dir`` into a new xz-compressed tar archive.

    Entries are stored under their bare names, so the archive has no leading
    directory component.

    Args:
        input_dir: Directory whose entries are added to the archive.
        output_file: Path of the archive to create.
    """
    logging.info("Creating new IMGT zip for {0} at {1}".format(input_dir, output_file))
    # Take the listing before the archive is opened for writing.
    entry_names = os.listdir(input_dir)
    with tarfile.open(output_file, 'w:xz') as archive:
        for entry_name in entry_names:
            logging.debug("Writing {0} to new IMGT zip".format(entry_name))
            entry_path = os.path.join(input_dir, entry_name)
            archive.add(entry_path, arcname=os.path.basename(entry_path))
| 99 | |
| 100 | |
def main():
    """Command-line entry point: unpack, filter by locus, and repack an IMGT archive.

    Reads ``--input``, ``--loci`` (comma-separated) and ``--output`` from the
    command line, logs to ``./log.html`` and stdout, extracts the input into a
    temporary directory, filters it with ``filter_imgt_dir`` and writes the
    result to the output path with ``make_new_xz_file``. The temporary
    directory is removed before returning.

    Raises:
        Exception: If ``--loci`` contains no non-blank entry.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="The input IMGT file", required=True)
    parser.add_argument("-l", "--loci", help="The Loci to filter on", required=True)
    parser.add_argument("-o", "--output", help="The output file for the new IMGT zip with just the filtered sequences", required=True)

    logging.basicConfig(filename="./log.html", level=logging.DEBUG, format="%(asctime)s: %(message)s <br />",
                        datefmt='%Y/%m/%d %H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Started IMGT locus split")

    args = parser.parse_args()
    input_file = args.input
    # str.split always returns at least one element (possibly ""), and an
    # empty alternative in the loci regex would match every row, so blank
    # entries are dropped here; this also makes the emptiness check reachable.
    loci = [locus.strip() for locus in args.loci.split(",") if locus.strip()]
    output_file = args.output

    logging.debug("All Parameters:")
    logging.debug("Input: {0}".format(input_file))
    logging.debug("Loci: {0}".format(loci))
    logging.debug("Output: {0}".format(output_file))

    if not loci:
        raise Exception("No locus selected, nothing to do")

    work_dir = tempfile.mkdtemp()
    try:
        original_files_dir = os.path.join(work_dir, "original")
        os.mkdir(original_files_dir)

        unpack_imgt_zip(input_file, original_files_dir)

        total, remain = filter_imgt_dir(original_files_dir, loci)
        logging.info("{0}\t{1}".format(total, remain))

        make_new_xz_file(original_files_dir, output_file)
    finally:
        # The extracted copy is only scratch space; do not leave it behind,
        # whether or not the steps above succeeded.
        shutil.rmtree(work_dir, ignore_errors=True)
| 136 | |
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
