Mercurial > repos > greg > validate_affy_metadata
comparison validate_affy_metadata.py @ 0:80d672b3e6dd draft
Uploaded
author | greg |
---|---|
date | Thu, 15 Aug 2019 13:17:41 -0400 |
parents | |
children | b5f5c3d0f349 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:80d672b3e6dd |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Validate the metadata file associated with Affymetrix 96 well plate data. | |
4 """ | |
5 import argparse | |
6 import datetime | |
7 import decimal | |
8 import re | |
9 import shutil | |
10 import sys | |
11 | |
# Command-line interface: --input names the metadata file to validate and
# --output names the dataset the script writes to.
parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', help='Metadata file for Affymetrix 96 well plate data')
# No trailing comma here: it would turn the statement into a throwaway tuple.
parser.add_argument('--output', dest='output', help='Output dataset')
args = parser.parse_args()
16 | |
# Longest email address accepted by validate_email.
EMAIL_MAX_LEN = 255
# Loose shape check: <no-@ text>@<no-@ text>.<no-@ text>.
# A raw string is used so that "\." reaches the regex engine as an escaped
# dot instead of being an invalid string escape sequence.
VALID_EMAIL_RE = re.compile(r"[^@]+@[^@]+\.[^@]+")
19 | |
20 | |
def add_error_msg(accumulated_msgs, msg):
    """
    Return the accumulated messages with msg appended on a new line.

    A newline separator is always inserted, even when accumulated_msgs
    is empty.
    """
    return "{0}\n{1}".format(accumulated_msgs, msg)
23 | |
24 | |
def empty_value(line_no, label, accumulated_msgs):
    """
    Record that a required value is missing.

    line_no: 1-based line number reported in the message.
    label: name of the missing value.
    accumulated_msgs: error messages collected so far.

    Returns the accumulated messages with the new message appended.
    """
    missing_msg = "The required %s value is missing on line %d." % (label, line_no)
    return add_error_msg(accumulated_msgs, missing_msg)
27 | |
28 | |
def stop_error(msg):
    """
    Terminate the program, passing msg as the exit argument.
    """
    # sys.exit(msg) is defined as raising SystemExit(msg).
    raise SystemExit(msg)
31 | |
32 | |
def string_as_boolean_string(string):
    """
    Map a value to the string 'True' or 'False'.

    The value is converted with str() and lower-cased; 'true', 'yes',
    'on' and '1' yield 'True', anything else yields 'False'.
    """
    truthy_spellings = ('true', 'yes', 'on', '1')
    normalized = str(string).lower()
    return 'True' if normalized in truthy_spellings else 'False'
38 | |
39 | |
def validate_date_string(line_no, date_string, column, accumulated_msgs):
    """
    Check that a non-empty date string parses as YYYY-MM-DD.

    line_no: 1-based line number reported in the message.
    date_string: the value to check; an empty value is accepted as is.
    column: column name reported in the message.
    accumulated_msgs: error messages collected so far.

    Returns accumulated_msgs unchanged when the value is empty or parses,
    otherwise accumulated_msgs with one message appended.
    """
    if len(date_string) > 0:
        try:
            datetime.datetime.strptime(date_string, '%Y-%m-%d')
        except ValueError:
            bad_date_msg = "Line %d contains an incorrect date format (%s must be YYYY-MM-DD) for column %s." % (line_no, date_string, column)
            return add_error_msg(accumulated_msgs, bad_date_msg)
    return accumulated_msgs
48 | |
49 | |
def validate_decimal(line_no, decimal_string, column, accumulated_msgs):
    """
    Check that a value can be converted by decimal.Decimal.

    line_no: 1-based line number reported in the message.
    decimal_string: the value to check.
    column: column name reported in the message.
    accumulated_msgs: error messages collected so far.

    Returns accumulated_msgs unchanged when the conversion succeeds,
    otherwise accumulated_msgs with one message appended.
    """
    try:
        decimal.Decimal(decimal_string)
    except Exception:
        bad_decimal_msg = "Line %d contains an incorrect decimal value (%s) for column %s." % (line_no, decimal_string, column)
        return add_error_msg(accumulated_msgs, bad_decimal_msg)
    return accumulated_msgs
56 | |
57 | |
def validate_email(line_no, email, accumulated_msgs):
    """
    Check that an email address is well formed and not too long.

    line_no: 1-based line number reported in the message.
    email: the address to check.
    accumulated_msgs: error messages collected so far.

    Returns accumulated_msgs unchanged when the address is acceptable,
    otherwise accumulated_msgs with one message appended.
    """
    matched = VALID_EMAIL_RE.match(email)
    # match() only anchors at the start of the string, so also require that
    # the pattern consumed all of it; otherwise "a@b.c@d" would be accepted
    # on the strength of its "a@b.c" prefix.
    if matched is None or matched.end() != len(email):
        return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s).  " % (line_no, email))
    if len(email) > EMAIL_MAX_LEN:
        # Three placeholders, three arguments: line number, address, limit.
        return add_error_msg(accumulated_msgs, "Line %d contains an email address (%s) that is longer than the maximum length, %d characters." % (line_no, email, EMAIL_MAX_LEN))
    return accumulated_msgs
64 | |
65 | |
accumulated_msgs = ""
# Parse the input file, skipping the header, and validating
# that each data line consists of 32 tab-separated items.
# Values that are read but not checked are still bound to names so the
# column layout stays documented in one place.
with open(args.input, "r") as ih:
    for i, line in enumerate(ih):
        if i == 0:
            # Skip the header.
            continue
        # Keep 1-based line value for error messages.
        line_no = i + 1
        line = line.rstrip("\r\n")
        # 1 header line + 96 data lines = 97 lines at most, so the 98th
        # line (line_no 98) is the first one that is too many.
        if line_no > 97:
            accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 97 lines (must be 1 header line and no more than 96 data lines).")
            stop_error(accumulated_msgs)
        items = line.split("\t")
        # Stop immediately on a wrong column count: the positional
        # lookups below would otherwise read the wrong values or fail.
        if len(items) != 32:
            accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (line_no, len(items)))
            stop_error(accumulated_msgs)
        # Required.
        user_specimen_id = items[0]
        if len(user_specimen_id) == 0:
            accumulated_msgs = empty_value(line_no, "user_specimen_id", accumulated_msgs)
        # Optional.
        field_call = items[1]
        # Optional.
        bcoral_genet_id = items[2]
        # Optional.
        bsym_genet_id = items[3]
        # Required.
        reef = items[4]
        if len(reef) == 0:
            accumulated_msgs = empty_value(line_no, "reef", accumulated_msgs)
        # Required.
        region = items[5]
        if len(region) == 0:
            accumulated_msgs = empty_value(line_no, "region", accumulated_msgs)
        # Required and validated.
        latitude = items[6]
        accumulated_msgs = validate_decimal(line_no, latitude, "latitude", accumulated_msgs)
        # Required and validated.
        longitude = items[7]
        accumulated_msgs = validate_decimal(line_no, longitude, "longitude", accumulated_msgs)
        # Optional.
        geographic_origin = items[8]
        # Optional.
        colony_location = items[9]
        # Optional.
        depth = items[10]
        # Optional.
        disease_resist = items[11]
        # Optional.
        bleach_resist = items[12]
        # Optional.
        mortality = items[13]
        # Optional.
        tle = items[14]
        # Optional.
        spawning = string_as_boolean_string(items[15])
        # Required.
        collector_last_name = items[16]
        if len(collector_last_name) == 0:
            accumulated_msgs = empty_value(line_no, "collector_last_name", accumulated_msgs)
        # Required.
        collector_first_name = items[17]
        if len(collector_first_name) == 0:
            accumulated_msgs = empty_value(line_no, "collector_first_name", accumulated_msgs)
        # Required.
        org = items[18]
        if len(org) == 0:
            accumulated_msgs = empty_value(line_no, "org", accumulated_msgs)
        # Required and validated.
        # NOTE(review): validate_date_string accepts an empty value, so a
        # missing collection_date is not reported here - confirm intent.
        collection_date = items[19]
        accumulated_msgs = validate_date_string(line_no, collection_date, "collection_date", accumulated_msgs)
        # Required and validated.
        contact_email = items[20]
        accumulated_msgs = validate_email(line_no, contact_email, accumulated_msgs)
        # Required.
        seq_facility = items[21]
        if len(seq_facility) == 0:
            accumulated_msgs = empty_value(line_no, "seq_facility", accumulated_msgs)
        # Optional.
        array_version = items[22]
        # Optional.
        public = string_as_boolean_string(items[23])
        # Optional, but validated when present.  The result must be stored
        # back into accumulated_msgs or the error would be lost.
        public_after_date = items[24]
        accumulated_msgs = validate_date_string(line_no, public_after_date, "public_after_date", accumulated_msgs)
        # Required and validated.
        sperm_motility = items[25]
        accumulated_msgs = validate_decimal(line_no, sperm_motility, "sperm_motility", accumulated_msgs)
        # Required and validated.
        healing_time = items[26]
        accumulated_msgs = validate_decimal(line_no, healing_time, "healing_time", accumulated_msgs)
        # Optional.
        dna_extraction_method = items[27]
        # Optional.
        dna_concentration = items[28]
        # If dna_concentration has a value, then it must be decimal.
        if len(dna_concentration) > 0:
            accumulated_msgs = validate_decimal(line_no, dna_concentration, "dna_concentration", accumulated_msgs)
        # Optional.
        registry_id = items[29]
        # Optional.
        result_folder_name = items[30]
        # Optional.
        plate_barcode = items[31]


# Report every problem collected above in a single exit message.
if len(accumulated_msgs) > 0:
    stop_error(accumulated_msgs)

# No problems were found, so copy the input to the output unchanged.
shutil.copyfile(args.input, args.output)