#!/usr/bin/env python
"""
Validate the metadata file associated with Affymetrix 96 well plate data.
"""
+ − 5 import argparse
+ − 6 import datetime
+ − 7 import decimal
+ − 8 import re
+ − 9 import shutil
+ − 10 import sys
+ − 11
# Command-line interface: the metadata file to validate and the path the
# validated file is copied to when no errors are found.
parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', help='Metadata file for Affymetrix 96 well plate data')
parser.add_argument('--output', dest='output', help='Output dataset')
args = parser.parse_args()
+ − 16
# Maximum number of characters accepted for an email address.
EMAIL_MAX_LEN = 255
# Loose shape check: something, "@", something, ".", something, with no
# further "@" in any of those parts.  Written as a raw string so that "\."
# reaches the regex engine untouched instead of being an invalid string escape.
VALID_EMAIL_RE = re.compile(r"[^@]+@[^@]+\.[^@]+")
+ − 19
+ − 20
def add_error_msg(accumulated_msgs, msg):
    """
    Return accumulated_msgs with msg appended on its own line.

    When nothing has been accumulated yet, msg is returned by itself so the
    final report does not begin with an empty line.
    """
    if not accumulated_msgs:
        return "%s" % msg
    return "%s\n%s" % (accumulated_msgs, msg)
+ − 23
+ − 24
def empty_value(line_no, label, accumulated_msgs):
    """
    Record that the required value named label is missing on line line_no
    and return the updated message accumulator.
    """
    missing_msg = "The required %s value is missing on line %d." % (label, line_no)
    return add_error_msg(accumulated_msgs, missing_msg)
+ − 27
+ − 28
def stop_error(msg):
    """
    Terminate the script, using msg as the exit status/message.
    """
    # sys.exit(msg) is exactly "raise SystemExit(msg)".
    raise SystemExit(msg)
+ − 31
+ − 32
def string_as_boolean_string(string):
    """
    Map a loosely formatted truth value to the string 'True' or 'False'.

    The comparison is case-insensitive; 'true', 'yes', 'on' and '1' yield
    'True', everything else yields 'False'.
    """
    truthy_spellings = ('true', 'yes', 'on', '1')
    normalized = str(string).lower()
    return 'True' if normalized in truthy_spellings else 'False'
+ − 38
+ − 39
def validate_date_string(line_no, date_string, column, accumulated_msgs):
    """
    Check that date_string, when not empty, parses with the format %Y-%m-%d.

    Returns accumulated_msgs unchanged for an empty or parseable value;
    otherwise returns it with an error message for line_no / column added.
    """
    if len(date_string) > 0:
        try:
            datetime.datetime.strptime(date_string, '%Y-%m-%d')
        except ValueError:
            bad_date_msg = "Line %d contains an incorrect date format (%s must be YYYY-MM-DD) for column %s." % (line_no, date_string, column)
            return add_error_msg(accumulated_msgs, bad_date_msg)
    # Empty values are not checked here; valid dates add nothing.
    return accumulated_msgs
+ − 48
+ − 49
def validate_decimal(line_no, decimal_string, column, accumulated_msgs):
    """
    Check that decimal_string can be converted by decimal.Decimal.

    Returns accumulated_msgs unchanged when the conversion succeeds;
    otherwise returns it with an error message for line_no / column added.
    """
    try:
        decimal.Decimal(decimal_string)
    except Exception:
        bad_decimal_msg = "Line %d contains an incorrect decimal value (%s) for column %s." % (line_no, decimal_string, column)
        return add_error_msg(accumulated_msgs, bad_decimal_msg)
    return accumulated_msgs
+ − 56
+ − 57
def validate_email(line_no, email, accumulated_msgs):
    """
    Check that email matches VALID_EMAIL_RE and is at most EMAIL_MAX_LEN
    characters long.

    Returns accumulated_msgs unchanged for an acceptable address; otherwise
    returns it with an error message for line_no added.
    """
    if not VALID_EMAIL_RE.match(email):
        return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email))
    if len(email) > EMAIL_MAX_LEN:
        # Three placeholders, three arguments: line number, the offending
        # address, and the limit it exceeded.
        return add_error_msg(accumulated_msgs, "Line %d contains an email address (%s) that is longer than the maximum length, %d characters." % (line_no, email, EMAIL_MAX_LEN))
    return accumulated_msgs
+ − 64
+ − 65
# Every problem found is appended here so that a single run reports all
# of them; the input is copied to the output only if this stays empty.
accumulated_msgs = ""
# Parse the input file, skipping the header, and validating
# that each data line consists of 32 tab-separated items.
with open(args.input, "r") as ih:
    for i, line in enumerate(ih):
        if i == 0:
            # Skip the header.
            continue
        # Keep 1-based line value for error messages.
        line_no = i + 1
        line = line.rstrip("\r\n")
        items = line.split("\t")
        if len(items) != 32:
            # The column positions used below are meaningless for a row of
            # the wrong width, so report what was collected so far and stop.
            accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (line_no, len(items)))
            stop_error(accumulated_msgs)
        # Required.
        user_specimen_id = items[0]
        if len(user_specimen_id) == 0:
            accumulated_msgs = empty_value(line_no, "user_specimen_id", accumulated_msgs)
        # Optional.
        field_call = items[1]
        # Optional.
        bcoral_genet_id = items[2]
        # Optional.
        bsym_genet_id = items[3]
        # Required.
        reef = items[4]
        if len(reef) == 0:
            accumulated_msgs = empty_value(line_no, "reef", accumulated_msgs)
        # Required.
        region = items[5]
        if len(region) == 0:
            accumulated_msgs = empty_value(line_no, "region", accumulated_msgs)
        # Required and validated.
        latitude = items[6]
        accumulated_msgs = validate_decimal(line_no, latitude, "latitude", accumulated_msgs)
        # Required and validated.
        longitude = items[7]
        accumulated_msgs = validate_decimal(line_no, longitude, "longitude", accumulated_msgs)
        # Optional.
        geographic_origin = items[8]
        # Optional.
        colony_location = items[9]
        # Optional.
        depth = items[10]
        # Optional.
        disease_resist = items[11]
        # Optional.
        bleach_resist = items[12]
        # Optional.
        mortality = items[13]
        # Optional.
        tle = items[14]
        # Optional.
        spawning = string_as_boolean_string(items[15])
        # Required.
        collector_last_name = items[16]
        if len(collector_last_name) == 0:
            accumulated_msgs = empty_value(line_no, "collector_last_name", accumulated_msgs)
        # Required.
        collector_first_name = items[17]
        if len(collector_first_name) == 0:
            accumulated_msgs = empty_value(line_no, "collector_first_name", accumulated_msgs)
        # Required.
        org = items[18]
        if len(org) == 0:
            accumulated_msgs = empty_value(line_no, "org", accumulated_msgs)
        # Required and validated.
        # NOTE(review): validate_date_string is called without a separate
        # emptiness check here, unlike the other required columns - confirm
        # whether an empty collection_date should be reported as missing.
        collection_date = items[19]
        accumulated_msgs = validate_date_string(line_no, collection_date, "collection_date", accumulated_msgs)
        # Required and validated.
        contact_email = items[20]
        accumulated_msgs = validate_email(line_no, contact_email, accumulated_msgs)
        # Required.
        seq_facility = items[21]
        if len(seq_facility) == 0:
            accumulated_msgs = empty_value(line_no, "seq_facility", accumulated_msgs)
        # Optional.
        array_version = items[22]
        # Optional.
        public = string_as_boolean_string(items[23])
        # Optional, but validated when present.
        public_after_date = items[24]
        accumulated_msgs = validate_date_string(line_no, public_after_date, "public_after_date", accumulated_msgs)
        # Required and validated.
        sperm_motility = items[25]
        accumulated_msgs = validate_decimal(line_no, sperm_motility, "sperm_motility", accumulated_msgs)
        # Required and validated.
        healing_time = items[26]
        accumulated_msgs = validate_decimal(line_no, healing_time, "healing_time", accumulated_msgs)
        # Optional.
        dna_extraction_method = items[27]
        # Optional.
        dna_concentration = items[28]
        # If dna_concentration has a value, then it must be decimal.
        if len(dna_concentration) > 0:
            accumulated_msgs = validate_decimal(line_no, dna_concentration, "dna_concentration", accumulated_msgs)
        # Optional.
        registry_id = items[29]
        # Optional.
        result_folder_name = items[30]
        # Optional.
        plate_barcode = items[31]


# Any recorded problem aborts the run before the output is written.
if len(accumulated_msgs) > 0:
    stop_error(accumulated_msgs)

# The file is valid: pass it through unchanged as the output dataset.
shutil.copyfile(args.input, args.output)