| 0 | 1 #!/usr/bin/env python | 
| 2 | 2 | 
| 0 | 3 import argparse | 
|  | 4 import sys | 
|  | 5 | 
| 3 | 6 import psycopg2 | 
| 7 | 7 from sqlalchemy import create_engine | 
| 3 | 8 from sqlalchemy import MetaData | 
| 0 | 9 from sqlalchemy.engine.url import make_url | 
|  | 10 | 
|  | 11 metadata = MetaData() | 
|  | 12 | 
|  | 13 SKIP_VALS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] | 
|  | 14 | 
|  | 15 | 
|  | 16 class EnsureSynced(object): | 
|  | 17     def __init__(self): | 
|  | 18         self.args = None | 
|  | 19         self.conn = None | 
|  | 20         self.parse_args() | 
|  | 21         self.outfh = open(self.args.output, "w") | 
|  | 22         self.connect_db() | 
|  | 23         self.engine = create_engine(self.args.database_connection_string) | 
|  | 24         self.metadata = MetaData(self.engine) | 
| 8 | 25         self.coral_mlg_rep_sample_ids_from_db = [] | 
| 0 | 26         self.affy_ids_from_file = [] | 
|  | 27 | 
|  | 28     def connect_db(self): | 
|  | 29         url = make_url(self.args.database_connection_string) | 
|  | 30         args = url.translate_connect_args(username='user') | 
|  | 31         args.update(url.query) | 
|  | 32         assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | 
|  | 33         self.conn = psycopg2.connect(**args) | 
|  | 34 | 
| 8 | 35     def get_coral_mlg_rep_sample_ids_from_db(self): | 
| 7 | 36         cmd = "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype WHERE coral_mlg_rep_sample_id IS NOT NULL AND coral_mlg_rep_sample_id != '' AND coral_mlg_clonal_id != 'failed' ORDER BY coral_mlg_rep_sample_id;" | 
| 0 | 37         cur = self.conn.cursor() | 
|  | 38         cur.execute(cmd) | 
|  | 39         rows = cur.fetchall() | 
|  | 40         for row in rows: | 
| 8 | 41             self.coral_mlg_rep_sample_ids_from_db.append(row[0]) | 
|  | 42         self.coral_mlg_rep_sample_ids_from_db.sort() | 
| 0 | 43 | 
|  | 44     def get_affy_ids_from_file(self, f): | 
|  | 45         with open(f) as fh: | 
|  | 46             for line in fh: | 
|  | 47                 line = line.strip() | 
|  | 48                 if line in SKIP_VALS: | 
|  | 49                     # Skip the first 9 lines in the file. | 
|  | 50                     continue | 
|  | 51                 self.affy_ids_from_file.append(line) | 
|  | 52         self.affy_ids_from_file.sort() | 
|  | 53 | 
|  | 54     def get_difference(self, list1, list2): | 
|  | 55         if len(list1) > len(list2): | 
|  | 56             return list(set(list1) - set(list2)) | 
|  | 57         return list(set(list2) - set(list1)) | 
|  | 58 | 
|  | 59     def log(self, msg): | 
|  | 60         self.outfh.write("%s\n" % msg) | 
|  | 61 | 
|  | 62     def parse_args(self): | 
|  | 63         parser = argparse.ArgumentParser() | 
|  | 64         parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | 
|  | 65         parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', help='Affy ids taken from all previously genotyped samples vcf file') | 
|  | 66         parser.add_argument('--output', dest='output', help='Output dataset'), | 
|  | 67         self.args = parser.parse_args() | 
|  | 68 | 
|  | 69     def run(self): | 
| 8 | 70         self.get_coral_mlg_rep_sample_ids_from_db() | 
| 0 | 71         self.get_affy_ids_from_file(self.args.affy_ids_from_file) | 
| 8 | 72         if self.coral_mlg_rep_sample_ids_from_db == self.affy_ids_from_file: | 
| 0 | 73             in_sync = True | 
|  | 74             self.log("The selected file is in sync with the database.\n\n") | 
|  | 75         else: | 
|  | 76             in_sync = False | 
|  | 77             self.log("The selected file is not in sync with the database.\n\n") | 
| 8 | 78         num_coral_mlg_rep_sample_ids_from_db = len(self.coral_mlg_rep_sample_ids_from_db) | 
|  | 79         self.log("Number of coral mlg rep sample ids in the database: %d\n" % num_coral_mlg_rep_sample_ids_from_db) | 
| 0 | 80         num_affy_ids_from_file = len(self.affy_ids_from_file) | 
|  | 81         self.log("Number of Affymetrix ids in the file: %d\n" % num_affy_ids_from_file) | 
|  | 82         if not in_sync: | 
| 8 | 83             if num_coral_mlg_rep_sample_ids_from_db > num_affy_ids_from_file: | 
| 0 | 84                 self.log("The database contains the following Affymetrix ids that are not in the file.\n") | 
|  | 85             else: | 
|  | 86                 self.log("The file contains the following Affymetrix ids that are not in the database.\n") | 
| 8 | 87             diff_list = self.get_difference(self.coral_mlg_rep_sample_ids_from_db, self.affy_ids_from_file) | 
| 0 | 88             for affy_id in diff_list: | 
|  | 89                 self.log("%s\n" % affy_id) | 
|  | 90             self.outfh.flush() | 
|  | 91             self.outfh.close() | 
|  | 92             sys.exit(1) | 
|  | 93 | 
|  | 94     def shutdown(self): | 
|  | 95         self.outfh.flush() | 
|  | 96         self.outfh.close() | 
|  | 97         self.conn.close() | 
|  | 98 | 
|  | 99 | 
|  | 100 if __name__ == '__main__': | 
|  | 101     es = EnsureSynced() | 
|  | 102     es.run() | 
|  | 103     es.shutdown() |