Mercurial > repos > greg > ensure_synced
comparison ensure_synced.py @ 0:9180906544b6 draft
Uploaded
| author | greg |
|---|---|
| date | Thu, 15 Aug 2019 10:37:49 -0400 |
| parents | |
| children | 3dc919d53939 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9180906544b6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import print_function | |
| 3 | |
| 4 import argparse | |
| 5 import psycopg2 | |
| 6 import sys | |
| 7 | |
| 8 from sqlalchemy import create_engine | |
| 9 from sqlalchemy import MetaData | |
| 10 from sqlalchemy.engine.url import make_url | |
| 11 | |
# Module-level SQLAlchemy metadata container.
metadata = MetaData()

# Header labels that are ignored when collecting Affymetrix ids from the
# input file (see EnsureSynced.get_affy_ids_from_file).
SKIP_VALS = [
    '#CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
    'FORMAT',
]
| 15 | |
| 16 | |
class EnsureSynced(object):
    """Check that a file of Affymetrix ids matches the ids stored in the database.

    The ids selected from the ``sample`` table are compared with the ids read
    from the file given on the command line.  A report is written to the
    output file and the process exits with status 1 when the two differ.
    """

    def __init__(self):
        """Parse the command line, open the output file and connect to the database."""
        self.args = None
        self.conn = None
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        try:
            self.connect_db()
        except Exception:
            # Do not leak the output handle when the connection cannot be made.
            self.outfh.close()
            raise
        # The engine and bound metadata are not used by any method of this
        # class; they are kept so the instance attributes remain available.
        self.engine = create_engine(self.args.database_connection_string)
        self.metadata = MetaData(self.engine)
        self.affy_ids_from_db = []
        self.affy_ids_from_file = []

    @staticmethod
    def _sort_key(affy_id):
        """Return a sort key that orders ids consistently and tolerates None."""
        # The leading flag keeps None from ever being compared with a string.
        return (affy_id is None, affy_id)

    def connect_db(self):
        """Open a psycopg2 connection described by the connection string.

        Raises:
            ValueError: if the connection string is not for PostgreSQL.
        """
        url = make_url(self.args.database_connection_string)
        # An explicit raise is used because assert statements are removed
        # when Python runs with -O.
        if url.get_dialect().name != 'postgresql':
            raise ValueError('This script can only be used with PostgreSQL.')
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        self.conn = psycopg2.connect(**args)

    def get_affy_ids_from_db(self):
        """Append the affy_id of every sample whose genotype is not 'failed'."""
        cmd = "SELECT affy_id FROM sample WHERE genotype_id NOT IN (SELECT id FROM genotype WHERE coral_mlg_clonal_id = 'failed') ORDER BY affy_id;"
        cur = self.conn.cursor()
        try:
            cur.execute(cmd)
            self.affy_ids_from_db.extend(row[0] for row in cur.fetchall())
        finally:
            cur.close()

    def get_affy_ids_from_file(self, f):
        """Append the ids found in file ``f`` (one per line) and sort the result.

        Lines whose stripped text is one of SKIP_VALS are ignored, as are
        blank lines, which would otherwise be recorded as an empty id.
        """
        with open(f) as fh:
            for line in fh:
                line = line.strip()
                if not line or line in SKIP_VALS:
                    continue
                self.affy_ids_from_file.append(line)
        self.affy_ids_from_file.sort()

    def get_difference(self, list1, list2):
        """Return the items of the longer list that are absent from the other.

        When ``list1`` has more items the result is ``list1 - list2``,
        otherwise it is ``list2 - list1``.  The result is sorted so that the
        output is deterministic.
        """
        set1 = set(list1)
        set2 = set(list2)
        if len(list1) > len(list2):
            diff = set1 - set2
        else:
            diff = set2 - set1
        return sorted(diff, key=self._sort_key)

    def log(self, msg):
        """Write ``msg`` followed by a newline to the output file."""
        self.outfh.write("%s\n" % msg)

    def parse_args(self):
        """Parse the command line options into ``self.args``."""
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def _is_in_sync(self):
        """Return True when both id lists hold the same ids the same number of times.

        Both lists are sorted here with the same key, because the order
        produced by the database's ORDER BY need not match the order produced
        by Python's list sort.
        """
        db_ids = sorted(self.affy_ids_from_db, key=self._sort_key)
        file_ids = sorted(self.affy_ids_from_file, key=self._sort_key)
        return db_ids == file_ids

    def _log_differences(self):
        """Log the ids missing from the file and the ids missing from the database."""
        db_ids = set(self.affy_ids_from_db)
        file_ids = set(self.affy_ids_from_file)
        only_in_db = sorted(db_ids - file_ids, key=self._sort_key)
        only_in_file = sorted(file_ids - db_ids, key=self._sort_key)
        if only_in_db:
            self.log("The database contains the following Affymetrix ids that are not in the file.\n")
            for affy_id in only_in_db:
                self.log("%s\n" % affy_id)
        if only_in_file:
            self.log("The file contains the following Affymetrix ids that are not in the database.\n")
            for affy_id in only_in_file:
                self.log("%s\n" % affy_id)
        if not only_in_db and not only_in_file:
            # Same distinct ids on both sides, so the mismatch is in how
            # often some of them occur.
            self.log("The database and the file contain the same Affymetrix ids, but some ids occur a different number of times.\n")

    def run(self):
        """Compare the two id sources, write the report and exit 1 on mismatch."""
        self.get_affy_ids_from_db()
        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
        in_sync = self._is_in_sync()
        if in_sync:
            self.log("The selected file is in sync with the database.\n\n")
        else:
            self.log("The selected file is not in sync with the database.\n\n")
        self.log("Number of Affymetrix ids in the database: %d\n" % len(self.affy_ids_from_db))
        self.log("Number of Affymetrix ids in the file: %d\n" % len(self.affy_ids_from_file))
        if in_sync:
            return
        self._log_differences()
        # Release the output file and the database connection before exiting.
        self.shutdown()
        sys.exit(1)

    def shutdown(self):
        """Flush and close the output file and close the database connection.

        Safe to call more than once.
        """
        if not self.outfh.closed:
            self.outfh.flush()
            self.outfh.close()
        if self.conn is not None:
            self.conn.close()
| 98 | |
| 99 | |
if __name__ == '__main__':
    # Script entry point: build the checker (which parses the command line
    # and connects to the database), run the comparison, then release the
    # output file and the connection.
    checker = EnsureSynced()
    checker.run()
    checker.shutdown()
