changeset 0:f15e63d204b1 draft

Uploaded
author greg
date Mon, 23 Sep 2019 14:12:31 -0400
parents
children 633cb7831526
files .shed.yml export_all_sample_data.py export_all_sample_data.xml
diffstat 3 files changed, 286 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Mon Sep 23 14:12:31 2019 -0400
@@ -0,0 +1,11 @@
+name: export_all_sample_data
+owner: greg
+description: |
+  Generates a tabular dataset of all samples and associated metadata in the stag database.
+homepage_url: http://baumslab.org
+long_description: |
+  Generates a tabular dataset of all samples and associated metadata in the stag database.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/export_all_sample_data
+type: unrestricted
+categories:
+  - Micro-array Analysis
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_all_sample_data.py	Mon Sep 23 14:12:31 2019 -0400
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import argparse
+import datetime
+import psycopg2
+
+from sqlalchemy import create_engine
+from sqlalchemy import MetaData
+from sqlalchemy.engine.url import make_url
+
+metadata = MetaData()
+
+COLUMNS = ["Affymetrix ID", "Sample ID", "User Specimen ID", "Field Call", "Sample Depth",
+           "Percent Missing Data Coral", "Percent Heterozygous Coral", "Percent Acerv Coral",
+           "Percent Apalm Coral", "Bcoral Genet Id", "Registry ID", "DNA Extraction Method",
+           "DNA Concentration", "Colony Location", "Colony Latitude", "Colony Longitude",
+           "Colony Depth", "Reef Name", "Region", "Reef Latitude", "Reef Longitude",
+           "GPS Coordinates Associated With", "Coral Mlg Clonal ID", "Coral Mlg Rep Sample ID",
+           "Genetic Coral Species Call", "Spawning", "Sperm Motility", "TLE", "Disease Resist",
+           "Bleach Resist", "Mortality", "Healing Time", "Sequencing Facility", "Array Version",
+           "Plate Barcode", "Collector Last Name", "First Name", "Organization", "Email",
+           "Collection Date"]
+
+
+class ExportAllSampleData(object):
+    def __init__(self):
+        self.args = None
+        self.conn = None
+        self.parse_args()
+        self.outfh = open(self.args.output, "w")
+        self.outfh.write("%s\n" % "\t".join(COLUMNS))
+        self.connect_db()
+        self.engine = create_engine(self.args.database_connection_string)
+
+    def connect_db(self):
+        url = make_url(self.args.database_connection_string)
+        args = url.translate_connect_args(username='user')
+        args.update(url.query)
+        assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
+        self.conn = psycopg2.connect(**args)
+
+    def export_from_db(self):
+        today = datetime.date.today()
+        cmd = """
+           SELECT sample.affy_id, sample.sample_id, sample.genotype_id,
+           sample.phenotype_id, sample.experiment_id, sample.colony_id,
+           sample.colony_location, sample.collector_id, sample.collection_date,
+           sample.user_specimen_id, sample.registry_id, sample.depth AS sample_depth,
+           sample.dna_extraction_method, sample.dna_concentration,
+           sample.percent_missing_data_coral, sample.percent_acerv_coral,
+           sample.percent_apalm_coral, sample.percent_heterozygous_coral,
+           sample.field_call, sample.bcoral_genet_id, genotype.coral_mlg_clonal_id,
+           genotype.coral_mlg_rep_sample_id, genotype.genetic_coral_species_call,
+           phenotype.spawning, phenotype.sperm_motility, phenotype.tle,
+           phenotype.disease_resist, phenotype.bleach_resist, phenotype.mortality,
+           phenotype.healing_time, experiment.seq_facility, experiment.array_version,
+           experiment.plate_barcode, colony.latitude AS colony_latitude,
+           colony.longitude AS colony_longitude, colony.depth AS colony_depth,
+           reef.name, reef.region, reef.latitude AS reef_latitude, reef.longitude AS reef_longitude,
+           reef.geographic_origin, person.last_name, person.first_name,
+           person.organization, person.email
+           FROM sample
+           LEFT OUTER JOIN genotype
+                           ON sample.genotype_id = genotype.id
+           LEFT OUTER JOIN phenotype
+                           ON sample.phenotype_id = phenotype.id
+           LEFT OUTER JOIN experiment
+                           ON sample.experiment_id = experiment.id
+           LEFT OUTER JOIN colony
+                           ON sample.colony_id = colony.id
+           LEFT OUTER JOIN reef
+                           ON reef.id = colony.reef_id
+           LEFT OUTER JOIN person
+                           ON sample.collector_id = person.id
+           WHERE sample.public OR sample.public_after_date < date'%s'
+           ORDER BY affy_id;""" % today
+        # Instantiate the cursor.
+        cur = self.conn.cursor()
+        # Execute the query.
+        cur.execute(cmd)
+        rows = cur.fetchall()
+        for tup in rows:
+            values = self.extract_values(tup)
+            # Output the row.
+            self.outfh.write("%s\n" % "\t".join(values))
+
+    def extract_values(self, tup):
+        values = []
+        # Extract the items from the tuple.
+        affy_id = self.get_value(tup[0])
+        sample_id = self.get_value(tup[1])
+        colony_location = self.get_value(tup[6])
+        collection_date = self.get_value(tup[8])
+        if len(collection_date) > 0:
+            collection_date = collection_date[:10]
+        user_specimen_id = self.get_value(tup[9])
+        registry_id = self.get_value(tup[10])
+        sample_depth = self.get_value(tup[11])
+        dna_extraction_method = self.get_value(tup[12])
+        dna_concentration = self.get_value(tup[13])
+        percent_missing_data_coral = self.get_value(tup[14])
+        percent_acerv_coral = self.get_value(tup[15])
+        percent_apalm_coral = self.get_value(tup[16])
+        percent_heterozygous_coral = self.get_value(tup[17])
+        field_call = self.get_value(tup[18])
+        bcoral_genet_id = self.get_value(tup[19])
+        coral_mlg_clonal_id = self.get_value(tup[20])
+        coral_mlg_rep_sample_id = self.get_value(tup[21])
+        genetic_coral_species_call = self.get_value(tup[22])
+        spawning = self.get_value(tup[23])
+        sperm_motility = self.get_value(tup[24])
+        tle = self.get_value(tup[25])
+        disease_resist = self.get_value(tup[26])
+        bleach_resist = self.get_value(tup[27])
+        mortality = self.get_value(tup[28])
+        healing_time = self.get_value(tup[29])
+        seq_facility = self.get_value(tup[30])
+        array_version = self.get_value(tup[31])
+        plate_barcode = self.get_value(tup[32])
+        colony_latitude = self.get_value(tup[33])
+        colony_longitude = self.get_value(tup[34])
+        colony_depth = self.get_value(tup[35])
+        reef_name = self.get_value(tup[36])
+        region = self.get_value(tup[37])
+        reef_latitude = self.get_value(tup[38])
+        reef_longitude = self.get_value(tup[39])
+        geographic_origin = self.get_value(tup[40])
+        last_name = self.get_value(tup[41])
+        first_name = self.get_value(tup[42])
+        organization = self.get_value(tup[43])
+        email = self.get_value(tup[44])
+        # Append the columns in the specified order.
+        values.append(affy_id)
+        values.append(sample_id)
+        values.append(user_specimen_id)
+        values.append(field_call)
+        values.append(sample_depth)
+        values.append(percent_missing_data_coral)
+        values.append(percent_heterozygous_coral)
+        values.append(percent_acerv_coral)
+        values.append(percent_apalm_coral)
+        values.append(bcoral_genet_id)
+        values.append(registry_id)
+        values.append(dna_extraction_method)
+        values.append(dna_concentration)
+        values.append(colony_location)
+        values.append(colony_latitude)
+        values.append(colony_longitude)
+        values.append(colony_depth)
+        values.append(reef_name)
+        values.append(region)
+        values.append(reef_latitude)
+        values.append(reef_longitude)
+        values.append(geographic_origin)
+        values.append(coral_mlg_clonal_id)
+        values.append(coral_mlg_rep_sample_id)
+        values.append(genetic_coral_species_call)
+        values.append(spawning)
+        values.append(sperm_motility)
+        values.append(tle)
+        values.append(disease_resist)
+        values.append(bleach_resist)
+        values.append(mortality)
+        values.append(healing_time)
+        values.append(seq_facility)
+        values.append(array_version)
+        values.append(plate_barcode)
+        values.append(last_name)
+        values.append(first_name)
+        values.append(organization)
+        values.append(email)
+        values.append(collection_date)
+        return values
+
+    def get_value(self, loc):
+        return str(loc) or ""
+
+    def parse_args(self):
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
+        parser.add_argument('--output', dest='output', help='Output dataset'),
+        self.args = parser.parse_args()
+
+    def run(self):
+        self.export_from_db()
+
+    def shutdown(self):
+        self.outfh.flush()
+        self.outfh.close()
+        self.conn.close()
+
+
+if __name__ == '__main__':
+    easd = ExportAllSampleData()
+    easd.run()
+    easd.shutdown()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_all_sample_data.xml	Mon Sep 23 14:12:31 2019 -0400
@@ -0,0 +1,78 @@
+<tool id="export_all_sample_data" name="Export all sample data" version="1.0.0">
+    <description>from the stag database</description>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/export_all_sample_data.py'
+--database_connection_string '$__app__.config.corals_database_connection'
+--output '$output']]></command>
+    <inputs>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name}"/>
+    </outputs>
+    <tests>
+        <test>
+            <!--Testing this tool is a bit difficult at the current time.-->
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Generates a tabular dataset of all samples and associated metadata in the stag database.
+
+-----
+
+**Output Columns**
+
+ * Affymetrix ID
+ * Sample ID
+ * User Specimen ID
+ * Field Call
+ * Sample Depth
+ * Percent Missing Data Coral
+ * Percent Heterozygous Coral
+ * Percent Acerv Coral
+ * Percent Apalm Coral
+ * Bcoral Genet ID
+ * Registry ID
+ * DNA Extraction Method
+ * DNA Concentration
+ * Colony Location
+ * Colony Latitude
+ * Colony Longitude
+ * Colony Depth
+ * Reef Name
+ * Region
+ * Reef Latitude
+ * Reef Longitude
+ * GPS Coordinates Associated With
+ * Coral MLG Clonal ID
+ * Coral MLG Rep Sample ID
+ * Genetic Coral Species Call
+ * Spawning
+ * Sperm Motility
+ * TLE
+ * Disease Resist
+ * Bleach Resist
+ * Mortality
+ * Healing Time
+ * Sequencing Facility
+ * Array Version
+ * Plate Barcode
+ * Collector Last Name
+ * First Name
+ * Organization
+ * Email
+ * Collection Date
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {Baums I},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {http://baumslab.org}
+        </citation>
+    </citations>
+</tool>
+