Mercurial > repos > greg > genotype_population_info
view genotype_population_info.py @ 1:4943d9b12222 draft
Uploaded
author | greg |
---|---|
date | Wed, 26 Aug 2020 13:48:06 -0400 |
parents | 0b7555cd19f3 |
children |
line wrap: on
line source
#!/usr/bin/env python import argparse import sys import psycopg2 from sqlalchemy import MetaData, create_engine from sqlalchemy.engine.url import make_url class GenotypeInfoGenerator(object): def __init__(self): self.args = None self.conn = None self.parse_args() self.outfh = open(self.args.output, "w") self.connect_db() self.engine = create_engine(self.args.database_connection_string) self.metadata = MetaData(self.engine) def parse_args(self): parser = argparse.ArgumentParser() parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), parser.add_argument('--input_partial_info', dest='input_partial_info', help='Tabular file containing part of the genotype info') parser.add_argument('--output', dest='output', help='Output dataset'), self.args = parser.parse_args() def connect_db(self): url = make_url(self.args.database_connection_string) args = url.translate_connect_args(username='user') args.update(url.query) assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' self.conn = psycopg2.connect(**args) def run(self): sql = """ SELECT sample.user_specimen_id, reef.region FROM sample LEFT OUTER JOIN colony ON sample.colony_id = colony.id LEFT OUTER JOIN reef ON reef.id = colony.reef_id WHERE sample.affy_id = '%s'; """ with open(self.args.input_partial_info, "r") as fh: for line in fh: line = line.strip() out_items = [] items = line.split('\t') # Item number. out_items.append(items[0]) affy_id = items[1] out_items.append(affy_id) if len(items) == 2: # Example line: # 1 a100000-4368120-060520-256_I07.CEL # The line is missing the user_specimen_id and # region, so retrieve it from the database. query = sql % affy_id cur = self.conn.cursor() cur.execute(query) try: missing_items = cur.fetchone() # user_specimen_id out_items.append(missing_items[0]) # region out_items.append(missing_items[1]) except Exception as e: msg = "Error retrieving user_specimen_id and region from the database for affy_id %s: %s" % (affy_id, e) self.stop_err(msg) else: # The line contains all of the information we need. # user_specimen_id out_items.append(items[3]) # region out_items.append(items[9]) self.outfh.write("%s\n" % "\t".join(out_items)) self.outfh.close() def shutdown(self): self.conn.close() def stop_err(self, msg): sys.stderr.write(msg) self.outfh.flush() self.outfh.close() sys.exit(1) if __name__ == '__main__': gig = GenotypeInfoGenerator() gig.run() gig.shutdown()