| 0 | 1 #!/usr/bin/env python | 
| 2 | 2 | 
| 0 | 3 import argparse | 
|  | 4 import sys | 
|  | 5 | 
| 3 | 6 import psycopg2 | 
| 7 | 7 from sqlalchemy import create_engine | 
| 0 | 8 from sqlalchemy.engine.url import make_url | 
|  | 9 | 
# Fixed header values that may appear in the ids file alongside the sample
# ids.  A line whose stripped text equals one of these entries is not an id
# and is ignored by EnsureSynced.get_affy_ids_from_file.
SKIP_VALS = [
    '#CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
    'FORMAT',
]
|  | 11 | 
|  | 12 | 
class EnsureSynced(object):
    """Check that a file of Affymetrix ids matches the ids in the database.

    The ids read from the ``genotype`` table and the ids read from the
    file given on the command line are both sorted and compared.  A report
    is written to the ``--output`` file; when the two sources differ the
    process exits with status 1.
    """

    def __init__(self):
        """Parse arguments, open the output file and connect to the database.

        Raises:
            ValueError: if the connection string is not for PostgreSQL.
        """
        self.args = None
        self.conn = None
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        try:
            self.connect_db()
            self.engine = create_engine(self.args.database_connection_string)
        except Exception:
            # Do not leak the output handle (or a half-open connection)
            # when the database cannot be reached.
            self.shutdown()
            raise
        self.coral_mlg_rep_sample_ids_from_db = []
        self.affy_ids_from_file = []

    def connect_db(self):
        """Open a psycopg2 connection from the SQLAlchemy-style URL.

        Raises:
            ValueError: if the URL's dialect is not ``postgresql``.
        """
        url = make_url(self.args.database_connection_string)
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if url.get_dialect().name != 'postgresql':
            raise ValueError('This script can only be used with PostgreSQL.')
        # psycopg2 expects ``user`` where SQLAlchemy uses ``username``.
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        self.conn = psycopg2.connect(**args)

    def get_coral_mlg_rep_sample_ids_from_db(self):
        """Append the representative sample ids from ``genotype`` and sort them."""
        cmd = (
            "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype "
            "WHERE coral_mlg_rep_sample_id IS NOT NULL "
            "AND coral_mlg_rep_sample_id != '' "
            "AND coral_mlg_clonal_id != 'failed' "
            "ORDER BY coral_mlg_rep_sample_id;"
        )
        # The context manager closes the cursor even if the query fails.
        with self.conn.cursor() as cur:
            cur.execute(cmd)
            rows = cur.fetchall()
        self.coral_mlg_rep_sample_ids_from_db.extend(row[0] for row in rows)
        # Re-sort in Python so both lists use the same ordering rules
        # before they are compared in run().
        self.coral_mlg_rep_sample_ids_from_db.sort()

    def get_affy_ids_from_file(self, f):
        """Append the ids found in file ``f`` (one per line) and sort them.

        Lines are stripped of surrounding whitespace.  Blank lines and
        lines equal to an entry of ``SKIP_VALS`` are ignored.
        """
        with open(f) as fh:
            for line in fh:
                line = line.strip()
                # A blank line is not an id; recording '' would make the
                # file look out of sync with the database.
                if not line or line in SKIP_VALS:
                    continue
                self.affy_ids_from_file.append(line)
        self.affy_ids_from_file.sort()

    def get_difference(self, list1, list2):
        """Return the items of the longer list that are missing from the other.

        When ``list1`` is strictly longer the result is ``list1 - list2``,
        otherwise it is ``list2 - list1`` (as sets, in arbitrary order).
        Only one direction is returned, so items unique to the shorter
        list are not reported; run() therefore does not rely on this
        method.  It is kept for backward compatibility.
        """
        if len(list1) > len(list2):
            return list(set(list1) - set(list2))
        return list(set(list2) - set(list1))

    def log(self, msg):
        """Write ``msg`` followed by a newline to the output file."""
        self.outfh.write("%s\n" % msg)

    def parse_args(self):
        """Parse the command line into ``self.args``.

        All three options are required: the script cannot open its output
        file, connect, or read ids without them.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def _log_differences(self):
        """Log the ids unique to the database and the ids unique to the file.

        Each direction is reported under its own header, in sorted order,
        and only when it is non-empty.
        """
        db_ids = set(self.coral_mlg_rep_sample_ids_from_db)
        file_ids = set(self.affy_ids_from_file)
        only_in_db = sorted(db_ids - file_ids)
        only_in_file = sorted(file_ids - db_ids)
        if only_in_db:
            self.log("The database contains the following Affymetrix ids that are not in the file.\n")
            for affy_id in only_in_db:
                self.log("%s\n" % affy_id)
        if only_in_file:
            self.log("The file contains the following Affymetrix ids that are not in the database.\n")
            for affy_id in only_in_file:
                self.log("%s\n" % affy_id)

    def run(self):
        """Load both id lists, write the report and exit 1 when they differ.

        When the lists are equal the method returns normally and the
        caller is expected to call shutdown().  When they differ, the
        resources are released here before ``sys.exit(1)``.
        """
        self.get_coral_mlg_rep_sample_ids_from_db()
        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
        in_sync = self.coral_mlg_rep_sample_ids_from_db == self.affy_ids_from_file
        if in_sync:
            self.log("The selected file is in sync with the database.\n\n")
        else:
            self.log("The selected file is not in sync with the database.\n\n")
        self.log("Number of coral mlg rep sample ids in the database: %d\n" % len(self.coral_mlg_rep_sample_ids_from_db))
        self.log("Number of Affymetrix ids in the file: %d\n" % len(self.affy_ids_from_file))
        if not in_sync:
            self._log_differences()
            # Close the output file AND the database connection before
            # exiting; sys.exit() prevents the caller from doing it.
            self.shutdown()
            sys.exit(1)

    def shutdown(self):
        """Close the output file and the database connection.

        Safe to call more than once and before a connection exists.
        """
        # close() flushes pending data and is a no-op on a closed file.
        self.outfh.close()
        if self.conn is not None:
            self.conn.close()
            self.conn = None
|  | 94 | 
|  | 95 | 
if __name__ == '__main__':
    # Constructing the checker parses the command line, opens the output
    # file and connects to the database; run() performs the comparison and
    # shutdown() releases the output file and the connection afterwards.
    sync_checker = EnsureSynced()
    sync_checker.run()
    sync_checker.shutdown()