annotate ensure_synced.py @ 9:05920a4bd5b6 draft default tip

Uploaded
author greg
date Sat, 16 Nov 2024 18:34:02 +0000
parents aaa6ae7c64de
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9180906544b6 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
2
3dc919d53939 Uploaded
greg
parents: 0
diff changeset
2
0
9180906544b6 Uploaded
greg
parents:
diff changeset
3 import argparse
9180906544b6 Uploaded
greg
parents:
diff changeset
4 import sys
9180906544b6 Uploaded
greg
parents:
diff changeset
5
3
2656d87abfee Uploaded
greg
parents: 2
diff changeset
6 import psycopg2
7
59edc91d0bea Uploaded
greg
parents: 6
diff changeset
7 from sqlalchemy import create_engine
0
9180906544b6 Uploaded
greg
parents:
diff changeset
8 from sqlalchemy.engine.url import make_url
9180906544b6 Uploaded
greg
parents:
diff changeset
9
9180906544b6 Uploaded
greg
parents:
diff changeset
# Fixed column names of a VCF header row.  get_affy_ids_from_file() skips any
# input line that is exactly equal to one of these values, so they are never
# treated as Affymetrix ids.
SKIP_VALS = [
    '#CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
    'FORMAT',
]
9180906544b6 Uploaded
greg
parents:
diff changeset
11
9180906544b6 Uploaded
greg
parents:
diff changeset
12
9180906544b6 Uploaded
greg
parents:
diff changeset
class EnsureSynced(object):
    """Check that a file of Affymetrix ids matches the ids stored in the database.

    The ids in the database are the non-empty ``coral_mlg_rep_sample_id``
    values of the ``genotype`` table (rows whose ``coral_mlg_clonal_id`` is
    ``'failed'`` are excluded by the query).  The ids in the file are its
    lines, one id per line, minus the values listed in ``SKIP_VALS``.

    A plain-text report is written to the ``--output`` file.

    Attributes:
        args: Parsed command line arguments (see ``parse_args``).
        conn: psycopg2 connection, or None when not connected.
        outfh: Open handle of the report file.
        engine: SQLAlchemy engine built from the same connection string; it is
            not used by any method of this class.
        coral_mlg_rep_sample_ids_from_db: Sorted ids loaded from the database.
        affy_ids_from_file: Sorted ids loaded from the file.
    """

    def __init__(self):
        """Parse the command line, open the report file and connect to the database."""
        self.args = None
        self.conn = None
        self.coral_mlg_rep_sample_ids_from_db = []
        self.affy_ids_from_file = []
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        try:
            self.connect_db()
            self.engine = create_engine(self.args.database_connection_string)
        except Exception:
            # Do not leak the report handle (or a half-built connection) when
            # the database setup fails.
            self.shutdown()
            raise

    def connect_db(self):
        """Open a psycopg2 connection described by the connection string.

        The SQLAlchemy URL is translated into psycopg2 keyword arguments and
        any query-string options of the URL are passed through as well.

        Raises:
            AssertionError: If the URL does not use the PostgreSQL dialect.
        """
        url = make_url(self.args.database_connection_string)
        # psycopg2 expects the keyword 'user' where SQLAlchemy uses 'username'.
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
        self.conn = psycopg2.connect(**args)

    def get_coral_mlg_rep_sample_ids_from_db(self):
        """Load the representative sample ids from the genotype table.

        The ids are appended to ``self.coral_mlg_rep_sample_ids_from_db``,
        which is then sorted in place.
        """
        cmd = (
            "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype "
            "WHERE coral_mlg_rep_sample_id IS NOT NULL "
            "AND coral_mlg_rep_sample_id != '' "
            "AND coral_mlg_clonal_id != 'failed' "
            "ORDER BY coral_mlg_rep_sample_id;"
        )
        cur = self.conn.cursor()
        try:
            cur.execute(cmd)
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cur.close()
        self.coral_mlg_rep_sample_ids_from_db.extend(row[0] for row in rows)
        # Sort with Python's ordering as well: the file ids are sorted the same
        # way and the two lists are compared element by element.
        self.coral_mlg_rep_sample_ids_from_db.sort()

    def get_affy_ids_from_file(self, f):
        """Load the Affymetrix ids from the file at path ``f``.

        Each line is stripped of surrounding whitespace.  Blank lines and lines
        equal to one of ``SKIP_VALS`` are ignored; every other line is appended
        to ``self.affy_ids_from_file``, which is then sorted in place.

        Args:
            f: Path of the text file holding one id per line.
        """
        with open(f) as fh:
            for line in fh:
                line = line.strip()
                # A blank line is not an id: the database query never returns
                # empty ids, so keeping it would always report a mismatch.
                if not line or line in SKIP_VALS:
                    continue
                self.affy_ids_from_file.append(line)
        self.affy_ids_from_file.sort()

    def get_difference(self, list1, list2):
        """Return the items of the longer list that are missing from the shorter one.

        When ``list1`` is longer the result is ``list1 - list2``; otherwise
        (including equal lengths) it is ``list2 - list1``.  The difference is
        one-sided, duplicates are collapsed and the order of the returned list
        is not defined.

        Args:
            list1: First list of hashable items.
            list2: Second list of hashable items.

        Returns:
            A list holding the one-sided set difference.
        """
        if len(list1) > len(list2):
            return list(set(list1) - set(list2))
        return list(set(list2) - set(list1))

    def log(self, msg):
        """Write ``msg`` followed by a newline to the report file."""
        self.outfh.write("%s\n" % msg)

    def parse_args(self):
        """Parse the command line into ``self.args``.

        All three options are required; argparse reports a usage error when
        one of them is missing.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def _log_ids(self, header, ids):
        """Write ``header`` and then one line per id to the report."""
        self.log(header)
        for affy_id in ids:
            self.log("%s\n" % affy_id)

    def _write_report(self):
        """Write the comparison of the two loaded id lists to the report.

        Both lists are expected to be sorted already (the loaders sort them).

        Returns:
            True when the lists are identical, False otherwise.
        """
        db_ids = self.coral_mlg_rep_sample_ids_from_db
        file_ids = self.affy_ids_from_file
        in_sync = db_ids == file_ids
        if in_sync:
            self.log("The selected file is in sync with the database.\n\n")
        else:
            self.log("The selected file is not in sync with the database.\n\n")
        self.log("Number of coral mlg rep sample ids in the database: %d\n" % len(db_ids))
        self.log("Number of Affymetrix ids in the file: %d\n" % len(file_ids))
        if in_sync:
            return True
        # Report both directions: either side may hold ids the other lacks,
        # regardless of which list is longer.  Sorting keeps the report stable.
        db_only = sorted(set(db_ids) - set(file_ids))
        file_only = sorted(set(file_ids) - set(db_ids))
        if db_only:
            self._log_ids("The database contains the following Affymetrix ids that are not in the file.\n", db_only)
        if file_only:
            self._log_ids("The file contains the following Affymetrix ids that are not in the database.\n", file_only)
        if not db_only and not file_only:
            # Same distinct ids on both sides, so the sorted lists can only
            # differ in how often an id is repeated.
            self.log("The database and the file contain the same ids, but at least one id occurs a different number of times.\n")
        return False

    def run(self):
        """Load both id lists, write the report and exit with 1 on a mismatch.

        When the lists are in sync the method returns normally and leaves the
        report file and the connection open for ``shutdown``.

        Raises:
            SystemExit: With status 1 when the file and the database differ;
                all resources are released first.
        """
        self.get_coral_mlg_rep_sample_ids_from_db()
        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
        if not self._write_report():
            self.shutdown()
            sys.exit(1)

    def shutdown(self):
        """Flush and close the report file and close the database connection.

        Safe to call more than once and before a connection was established.
        """
        if not self.outfh.closed:
            self.outfh.flush()
            self.outfh.close()
        if self.conn is not None:
            self.conn.close()
            self.conn = None
9180906544b6 Uploaded
greg
parents:
diff changeset
94
9180906544b6 Uploaded
greg
parents:
diff changeset
95
9180906544b6 Uploaded
greg
parents:
diff changeset
if __name__ == '__main__':
    # Constructing the checker parses the command line, opens the output file
    # and connects to the database.
    checker = EnsureSynced()
    # run() calls sys.exit(1) when the file and the database differ, so
    # shutdown() below is only reached when they are in sync.
    checker.run()
    checker.shutdown()
9180906544b6 Uploaded
greg
parents:
diff changeset
99 es.shutdown()