Mercurial > repos > greg > ensure_synced
comparison ensure_synced.py @ 0:9180906544b6 draft
Uploaded
author | greg |
---|---|
date | Thu, 15 Aug 2019 10:37:49 -0400 |
parents | |
children | 3dc919d53939 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9180906544b6 |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import print_function | |
3 | |
4 import argparse | |
5 import psycopg2 | |
6 import sys | |
7 | |
8 from sqlalchemy import create_engine | |
9 from sqlalchemy import MetaData | |
10 from sqlalchemy.engine.url import make_url | |
11 | |
# Module-level SQLAlchemy MetaData instance.  Nothing else in this file refers
# to this name; EnsureSynced.__init__ creates its own MetaData object.
metadata = MetaData()
13 | |
# Values that are not Affymetrix ids.  EnsureSynced.get_affy_ids_from_file()
# ignores every input line whose stripped text equals one of these entries.
SKIP_VALS = [
    '#CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
    'FORMAT',
]
15 | |
16 | |
class EnsureSynced(object):
    """Check that a file of Affymetrix ids matches the ids in the database.

    The ids selected from the ``sample`` table of a PostgreSQL database are
    compared with the ids listed (one per line) in a file, and a plain-text
    report is written to the output file.  Command-line arguments supply the
    database connection string, the id file and the output path.
    """

    def __init__(self):
        """Parse the arguments, open the output file and connect to the database.

        If anything fails after the output file has been opened, the
        resources acquired so far are released before the error propagates.
        """
        self.args = None
        self.conn = None
        self.outfh = None
        self.affy_ids_from_db = []
        self.affy_ids_from_file = []
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        try:
            self.connect_db()
            # NOTE(review): no method of this class uses engine or metadata.
            # Passing an engine positionally to MetaData relies on the
            # "bound metadata" API, which newer SQLAlchemy releases appear to
            # have removed -- confirm before upgrading SQLAlchemy.
            self.engine = create_engine(self.args.database_connection_string)
            self.metadata = MetaData(self.engine)
        except Exception:
            # Do not leak the output file (or connection) on a failed start.
            self.shutdown()
            raise

    def connect_db(self):
        """Open a psycopg2 connection described by the connection string.

        Raises:
            ValueError: if the connection string is not for PostgreSQL.
        """
        url = make_url(self.args.database_connection_string)
        # Explicit raise instead of assert: asserts are stripped under -O.
        if url.get_dialect().name != 'postgresql':
            raise ValueError('This script can only be used with PostgreSQL.')
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        self.conn = psycopg2.connect(**args)

    def get_affy_ids_from_db(self):
        """Append to ``self.affy_ids_from_db`` the affy_id of each selected sample.

        Samples whose genotype_id belongs to a genotype with
        coral_mlg_clonal_id = 'failed' are excluded by the query.
        """
        cmd = "SELECT affy_id FROM sample WHERE genotype_id NOT IN (SELECT id FROM genotype WHERE coral_mlg_clonal_id = 'failed') ORDER BY affy_id;"
        cur = self.conn.cursor()
        try:
            cur.execute(cmd)
            self.affy_ids_from_db.extend(row[0] for row in cur.fetchall())
        finally:
            cur.close()

    def get_affy_ids_from_file(self, f):
        """Append the ids listed in file ``f`` to ``self.affy_ids_from_file``.

        Each line is stripped; blank lines and lines equal to an entry of
        SKIP_VALS are ignored.  The accumulated list is sorted afterwards.
        """
        with open(f) as fh:
            for line in fh:
                line = line.strip()
                # A blank line is not an id; recording it as '' would make
                # an otherwise matching file look out of sync.
                if not line or line in SKIP_VALS:
                    continue
                self.affy_ids_from_file.append(line)
        self.affy_ids_from_file.sort()

    def _sorted_ids(self, ids):
        """Return ``ids`` as a sorted list, placing any None value last."""
        return sorted(ids, key=lambda value: (value is None, '' if value is None else value))

    def _surplus(self, list1, list2):
        """Return, sorted, each id that occurs more often in list1 than in list2."""
        from collections import Counter
        # Counter subtraction keeps only the positive counts.
        return self._sorted_ids(Counter(list1) - Counter(list2))

    def get_difference(self, list1, list2):
        """Return the sorted items of the longer list that are not in the shorter one.

        Only one direction is examined (if the lengths are equal, items of
        list2 missing from list1 are returned).  run() does not rely on this
        method; it is kept for existing callers.
        """
        if len(list1) > len(list2):
            longer, shorter = list1, list2
        else:
            longer, shorter = list2, list1
        return self._sorted_ids(set(longer) - set(shorter))

    def log(self, msg):
        """Write ``msg`` followed by a newline to the output file."""
        self.outfh.write("%s\n" % msg)

    def parse_args(self):
        """Parse the command line into ``self.args``.

        All three options are required; argparse reports a usage error and
        exits when one is missing.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def _write_report(self):
        """Write the comparison report and return True when the ids are in sync.

        The two id lists are compared as multisets, so the result does not
        depend on how the database and Python happen to order the ids.  Ids
        that are surplus on either side are listed under their own heading.
        """
        only_in_db = self._surplus(self.affy_ids_from_db, self.affy_ids_from_file)
        only_in_file = self._surplus(self.affy_ids_from_file, self.affy_ids_from_db)
        in_sync = not only_in_db and not only_in_file
        if in_sync:
            self.log("The selected file is in sync with the database.\n\n")
        else:
            self.log("The selected file is not in sync with the database.\n\n")
        self.log("Number of Affymetrix ids in the database: %d\n" % len(self.affy_ids_from_db))
        self.log("Number of Affymetrix ids in the file: %d\n" % len(self.affy_ids_from_file))
        if only_in_db:
            self.log("The database contains the following Affymetrix ids that are not in the file.\n")
            for affy_id in only_in_db:
                self.log("%s\n" % affy_id)
        if only_in_file:
            self.log("The file contains the following Affymetrix ids that are not in the database.\n")
            for affy_id in only_in_file:
                self.log("%s\n" % affy_id)
        return in_sync

    def run(self):
        """Load both id lists, write the report, and exit with status 1 on mismatch."""
        self.get_affy_ids_from_db()
        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
        if not self._write_report():
            # sys.exit() prevents the caller from reaching shutdown(), so
            # release the output file and the connection here.
            self.shutdown()
            sys.exit(1)

    def shutdown(self):
        """Flush and close the output file and close the database connection.

        Safe to call more than once and before either resource exists.
        """
        if self.outfh is not None and not self.outfh.closed:
            self.outfh.flush()
            self.outfh.close()
        if self.conn is not None:
            self.conn.close()
            self.conn = None
98 | |
99 | |
if __name__ == '__main__':
    # Constructing the checker parses the command line, opens the output file
    # and connects to the database.  run() exits the process with status 1
    # when the file and database differ, so shutdown() below is only reached
    # when they are in sync.
    sync_checker = EnsureSynced()
    sync_checker.run()
    sync_checker.shutdown()
103 es.shutdown() |