0
|
1 #!/usr/bin/env python
|
2
|
2
|
0
|
3 import argparse
|
|
4 import sys
|
|
5
|
3
|
6 import psycopg2
|
7
|
7 from sqlalchemy import create_engine
|
3
|
8 from sqlalchemy import MetaData
|
0
|
9 from sqlalchemy.engine.url import make_url
|
|
10
|
|
11 metadata = MetaData()
|
|
12
|
|
13 SKIP_VALS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
|
|
14
|
|
15
|
|
16 class EnsureSynced(object):
|
|
17 def __init__(self):
|
|
18 self.args = None
|
|
19 self.conn = None
|
|
20 self.parse_args()
|
|
21 self.outfh = open(self.args.output, "w")
|
|
22 self.connect_db()
|
|
23 self.engine = create_engine(self.args.database_connection_string)
|
|
24 self.metadata = MetaData(self.engine)
|
8
|
25 self.coral_mlg_rep_sample_ids_from_db = []
|
0
|
26 self.affy_ids_from_file = []
|
|
27
|
|
28 def connect_db(self):
|
|
29 url = make_url(self.args.database_connection_string)
|
|
30 args = url.translate_connect_args(username='user')
|
|
31 args.update(url.query)
|
|
32 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
|
|
33 self.conn = psycopg2.connect(**args)
|
|
34
|
8
|
35 def get_coral_mlg_rep_sample_ids_from_db(self):
|
7
|
36 cmd = "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype WHERE coral_mlg_rep_sample_id IS NOT NULL AND coral_mlg_rep_sample_id != '' AND coral_mlg_clonal_id != 'failed' ORDER BY coral_mlg_rep_sample_id;"
|
0
|
37 cur = self.conn.cursor()
|
|
38 cur.execute(cmd)
|
|
39 rows = cur.fetchall()
|
|
40 for row in rows:
|
8
|
41 self.coral_mlg_rep_sample_ids_from_db.append(row[0])
|
|
42 self.coral_mlg_rep_sample_ids_from_db.sort()
|
0
|
43
|
|
44 def get_affy_ids_from_file(self, f):
|
|
45 with open(f) as fh:
|
|
46 for line in fh:
|
|
47 line = line.strip()
|
|
48 if line in SKIP_VALS:
|
|
49 # Skip the first 9 lines in the file.
|
|
50 continue
|
|
51 self.affy_ids_from_file.append(line)
|
|
52 self.affy_ids_from_file.sort()
|
|
53
|
|
54 def get_difference(self, list1, list2):
|
|
55 if len(list1) > len(list2):
|
|
56 return list(set(list1) - set(list2))
|
|
57 return list(set(list2) - set(list1))
|
|
58
|
|
59 def log(self, msg):
|
|
60 self.outfh.write("%s\n" % msg)
|
|
61
|
|
62 def parse_args(self):
|
|
63 parser = argparse.ArgumentParser()
|
|
64 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
|
|
65 parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', help='Affy ids taken from all previously genotyped samples vcf file')
|
|
66 parser.add_argument('--output', dest='output', help='Output dataset'),
|
|
67 self.args = parser.parse_args()
|
|
68
|
|
69 def run(self):
|
8
|
70 self.get_coral_mlg_rep_sample_ids_from_db()
|
0
|
71 self.get_affy_ids_from_file(self.args.affy_ids_from_file)
|
8
|
72 if self.coral_mlg_rep_sample_ids_from_db == self.affy_ids_from_file:
|
0
|
73 in_sync = True
|
|
74 self.log("The selected file is in sync with the database.\n\n")
|
|
75 else:
|
|
76 in_sync = False
|
|
77 self.log("The selected file is not in sync with the database.\n\n")
|
8
|
78 num_coral_mlg_rep_sample_ids_from_db = len(self.coral_mlg_rep_sample_ids_from_db)
|
|
79 self.log("Number of coral mlg rep sample ids in the database: %d\n" % num_coral_mlg_rep_sample_ids_from_db)
|
0
|
80 num_affy_ids_from_file = len(self.affy_ids_from_file)
|
|
81 self.log("Number of Affymetrix ids in the file: %d\n" % num_affy_ids_from_file)
|
|
82 if not in_sync:
|
8
|
83 if num_coral_mlg_rep_sample_ids_from_db > num_affy_ids_from_file:
|
0
|
84 self.log("The database contains the following Affymetrix ids that are not in the file.\n")
|
|
85 else:
|
|
86 self.log("The file contains the following Affymetrix ids that are not in the database.\n")
|
8
|
87 diff_list = self.get_difference(self.coral_mlg_rep_sample_ids_from_db, self.affy_ids_from_file)
|
0
|
88 for affy_id in diff_list:
|
|
89 self.log("%s\n" % affy_id)
|
|
90 self.outfh.flush()
|
|
91 self.outfh.close()
|
|
92 sys.exit(1)
|
|
93
|
|
94 def shutdown(self):
|
|
95 self.outfh.flush()
|
|
96 self.outfh.close()
|
|
97 self.conn.close()
|
|
98
|
|
99
|
|
100 if __name__ == '__main__':
|
|
101 es = EnsureSynced()
|
|
102 es.run()
|
|
103 es.shutdown()
|