comparison ensure_synced.py @ 8:aaa6ae7c64de draft

Uploaded
author greg
date Tue, 25 Jan 2022 14:24:54 +0000
parents 59edc91d0bea
children 05920a4bd5b6
comparison
equal deleted inserted replaced
7:59edc91d0bea 8:aaa6ae7c64de
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2
3 2
4 import argparse 3 import argparse
5 import sys 4 import sys
6 5
7 import psycopg2 6 import psycopg2
21 self.parse_args() 20 self.parse_args()
22 self.outfh = open(self.args.output, "w") 21 self.outfh = open(self.args.output, "w")
23 self.connect_db() 22 self.connect_db()
24 self.engine = create_engine(self.args.database_connection_string) 23 self.engine = create_engine(self.args.database_connection_string)
25 self.metadata = MetaData(self.engine) 24 self.metadata = MetaData(self.engine)
26 self.affy_ids_from_db = [] 25 self.coral_mlg_rep_sample_ids_from_db = []
27 self.affy_ids_from_file = [] 26 self.affy_ids_from_file = []
28 27
29 def connect_db(self): 28 def connect_db(self):
30 url = make_url(self.args.database_connection_string) 29 url = make_url(self.args.database_connection_string)
31 args = url.translate_connect_args(username='user') 30 args = url.translate_connect_args(username='user')
32 args.update(url.query) 31 args.update(url.query)
33 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' 32 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
34 self.conn = psycopg2.connect(**args) 33 self.conn = psycopg2.connect(**args)
35 34
36 def get_affy_ids_from_db(self): 35 def get_coral_mlg_rep_sample_ids_from_db(self):
37 cmd = "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype WHERE coral_mlg_rep_sample_id IS NOT NULL AND coral_mlg_rep_sample_id != '' AND coral_mlg_clonal_id != 'failed' ORDER BY coral_mlg_rep_sample_id;" 36 cmd = "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype WHERE coral_mlg_rep_sample_id IS NOT NULL AND coral_mlg_rep_sample_id != '' AND coral_mlg_clonal_id != 'failed' ORDER BY coral_mlg_rep_sample_id;"
38 cur = self.conn.cursor() 37 cur = self.conn.cursor()
39 cur.execute(cmd) 38 cur.execute(cmd)
40 rows = cur.fetchall() 39 rows = cur.fetchall()
41 for row in rows: 40 for row in rows:
42 self.affy_ids_from_db.append(row[0]) 41 self.coral_mlg_rep_sample_ids_from_db.append(row[0])
43 self.affy_ids_from_db.sort() 42 self.coral_mlg_rep_sample_ids_from_db.sort()
44 43
45 def get_affy_ids_from_file(self, f): 44 def get_affy_ids_from_file(self, f):
46 with open(f) as fh: 45 with open(f) as fh:
47 for line in fh: 46 for line in fh:
48 line = line.strip() 47 line = line.strip()
66 parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', help='Affy ids taken from all previously genotyped samples vcf file') 65 parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', help='Affy ids taken from all previously genotyped samples vcf file')
67 parser.add_argument('--output', dest='output', help='Output dataset'), 66 parser.add_argument('--output', dest='output', help='Output dataset'),
68 self.args = parser.parse_args() 67 self.args = parser.parse_args()
69 68
70 def run(self): 69 def run(self):
71 self.get_affy_ids_from_db() 70 self.get_coral_mlg_rep_sample_ids_from_db()
72 self.get_affy_ids_from_file(self.args.affy_ids_from_file) 71 self.get_affy_ids_from_file(self.args.affy_ids_from_file)
73 if self.affy_ids_from_db == self.affy_ids_from_file: 72 if self.coral_mlg_rep_sample_ids_from_db == self.affy_ids_from_file:
74 in_sync = True 73 in_sync = True
75 self.log("The selected file is in sync with the database.\n\n") 74 self.log("The selected file is in sync with the database.\n\n")
76 else: 75 else:
77 in_sync = False 76 in_sync = False
78 self.log("The selected file is not in sync with the database.\n\n") 77 self.log("The selected file is not in sync with the database.\n\n")
79 num_affy_ids_from_db = len(self.affy_ids_from_db) 78 num_coral_mlg_rep_sample_ids_from_db = len(self.coral_mlg_rep_sample_ids_from_db)
80 self.log("Number of Affymetrix ids in the database: %d\n" % num_affy_ids_from_db) 79 self.log("Number of coral mlg rep sample ids in the database: %d\n" % num_coral_mlg_rep_sample_ids_from_db)
81 num_affy_ids_from_file = len(self.affy_ids_from_file) 80 num_affy_ids_from_file = len(self.affy_ids_from_file)
82 self.log("Number of Affymetrix ids in the file: %d\n" % num_affy_ids_from_file) 81 self.log("Number of Affymetrix ids in the file: %d\n" % num_affy_ids_from_file)
83 if not in_sync: 82 if not in_sync:
84 if num_affy_ids_from_db > num_affy_ids_from_file: 83 if num_coral_mlg_rep_sample_ids_from_db > num_affy_ids_from_file:
85 self.log("The database contains the following Affymetrix ids that are not in the file.\n") 84 self.log("The database contains the following Affymetrix ids that are not in the file.\n")
86 else: 85 else:
87 self.log("The file contains the following Affymetrix ids that are not in the database.\n") 86 self.log("The file contains the following Affymetrix ids that are not in the database.\n")
88 diff_list = self.get_difference(self.affy_ids_from_db, self.affy_ids_from_file) 87 diff_list = self.get_difference(self.coral_mlg_rep_sample_ids_from_db, self.affy_ids_from_file)
89 for affy_id in diff_list: 88 for affy_id in diff_list:
90 self.log("%s\n" % affy_id) 89 self.log("%s\n" % affy_id)
91 self.outfh.flush() 90 self.outfh.flush()
92 self.outfh.close() 91 self.outfh.close()
93 sys.exit(1) 92 sys.exit(1)