Mercurial > repos > iuc > query_tabular
comparison load_db.py @ 1:8a33b442ecd9 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 81f69ad5f39223059c40501e55ac777d3feca845
| author | iuc |
|---|---|
| date | Fri, 18 Aug 2017 16:48:09 -0400 |
| parents | 3708ff0198b7 |
| children | fb8484ee54d8 |
comparison
equal
deleted
inserted
replaced
| 0:3708ff0198b7 | 1:8a33b442ecd9 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 from __future__ import print_function | 3 from __future__ import print_function |
| 4 | 4 |
| 5 import re | |
| 5 import sys | 6 import sys |
| 6 | 7 |
| 7 from filters import TabularReader | 8 from filters import TabularReader |
| 9 | |
| 10 | |
| 11 SQLITE_KEYWORDS = [ | |
| 12 'ABORT', | |
| 13 'ACTION', | |
| 14 'ADD', | |
| 15 'AFTER', | |
| 16 'ALL', | |
| 17 'ALTER', | |
| 18 'ANALYZE', | |
| 19 'AND', | |
| 20 'AS', | |
| 21 'ASC', | |
| 22 'ATTACH', | |
| 23 'AUTOINCREMENT', | |
| 24 'BEFORE', | |
| 25 'BEGIN', | |
| 26 'BETWEEN', | |
| 27 'BY', | |
| 28 'CASCADE', | |
| 29 'CASE', | |
| 30 'CAST', | |
| 31 'CHECK', | |
| 32 'COLLATE', | |
| 33 'COLUMN', | |
| 34 'COMMIT', | |
| 35 'CONFLICT', | |
| 36 'CONSTRAINT', | |
| 37 'CREATE', | |
| 38 'CROSS', | |
| 39 'CURRENT_DATE', | |
| 40 'CURRENT_TIME', | |
| 41 'CURRENT_TIMESTAMP', | |
| 42 'DATABASE', | |
| 43 'DEFAULT', | |
| 44 'DEFERRABLE', | |
| 45 'DEFERRED', | |
| 46 'DELETE', | |
| 47 'DESC', | |
| 48 'DETACH', | |
| 49 'DISTINCT', | |
| 50 'DROP', | |
| 51 'EACH', | |
| 52 'ELSE', | |
| 53 'END', | |
| 54 'ESCAPE', | |
| 55 'EXCEPT', | |
| 56 'EXCLUSIVE', | |
| 57 'EXISTS', | |
| 58 'EXPLAIN', | |
| 59 'FAIL', | |
| 60 'FOR', | |
| 61 'FOREIGN', | |
| 62 'FROM', | |
| 63 'FULL', | |
| 64 'GLOB', | |
| 65 'GROUP', | |
| 66 'HAVING', | |
| 67 'IF', | |
| 68 'IGNORE', | |
| 69 'IMMEDIATE', | |
| 70 'IN', | |
| 71 'INDEX', | |
| 72 'INDEXED', | |
| 73 'INITIALLY', | |
| 74 'INNER', | |
| 75 'INSERT', | |
| 76 'INSTEAD', | |
| 77 'INTERSECT', | |
| 78 'INTO', | |
| 79 'IS', | |
| 80 'ISNULL', | |
| 81 'JOIN', | |
| 82 'KEY', | |
| 83 'LEFT', | |
| 84 'LIKE', | |
| 85 'LIMIT', | |
| 86 'MATCH', | |
| 87 'NATURAL', | |
| 88 'NO', | |
| 89 'NOT', | |
| 90 'NOTNULL', | |
| 91 'NULL', | |
| 92 'OF', | |
| 93 'OFFSET', | |
| 94 'ON', | |
| 95 'OR', | |
| 96 'ORDER', | |
| 97 'OUTER', | |
| 98 'PLAN', | |
| 99 'PRAGMA', | |
| 100 'PRIMARY', | |
| 101 'QUERY', | |
| 102 'RAISE', | |
| 103 'RECURSIVE', | |
| 104 'REFERENCES', | |
| 105 'REGEXP', | |
| 106 'REINDEX', | |
| 107 'RELEASE', | |
| 108 'RENAME', | |
| 109 'REPLACE', | |
| 110 'RESTRICT', | |
| 111 'RIGHT', | |
| 112 'ROLLBACK', | |
| 113 'ROW', | |
| 114 'SAVEPOINT', | |
| 115 'SELECT', | |
| 116 'SET', | |
| 117 'TABLE', | |
| 118 'TEMP', | |
| 119 'TEMPORARY', | |
| 120 'THEN', | |
| 121 'TO', | |
| 122 'TRANSACTION', | |
| 123 'TRIGGER', | |
| 124 'UNION', | |
| 125 'UNIQUE', | |
| 126 'UPDATE', | |
| 127 'USING', | |
| 128 'VACUUM', | |
| 129 'VALUES', | |
| 130 'VIEW', | |
| 131 'VIRTUAL', | |
| 132 'WHEN', | |
| 133 'WHERE', | |
| 134 'WITH', | |
| 135 'WITHOUT' | |
| 136 ] | |
| 137 | |
| 138 | |
| 139 def get_valid_column_name(name): | |
| 140 valid_name = name | |
| 141 if not name or not name.strip(): | |
| 142 return None | |
| 143 elif name.upper() in SQLITE_KEYWORDS: | |
| 144 valid_name = '"%s"' % name | |
| 145 elif re.match('^[a-zA-Z]\w*$', name): | |
| 146 pass | |
| 147 elif re.match('^"[^"]+"$', name): | |
| 148 pass | |
| 149 elif re.match('^\[[^\[\]]*\]$', name): | |
| 150 pass | |
| 151 elif re.match("^`[^`]+`$", name): | |
| 152 pass | |
| 153 elif name.find('"') < 0: | |
| 154 valid_name = '"%s"' % name | |
| 155 elif name.find('[') < 0 and name.find(']') < 0: | |
| 156 valid_name = '[%s]' % name | |
| 157 elif name.find('`') < 0: | |
| 158 valid_name = '`%s`' % name | |
| 159 elif name.find("'") < 0: | |
| 160 valid_name = "'%s'" % name | |
| 161 return valid_name | |
| 8 | 162 |
| 9 | 163 |
| 10 def getValueType(val): | 164 def getValueType(val): |
| 11 if val or 0. == val: | 165 if val or 0. == val: |
| 12 try: | 166 try: |
| 21 return None | 175 return None |
| 22 | 176 |
| 23 | 177 |
| 24 def get_column_def(file_path, table_name, skip=0, comment_char='#', | 178 def get_column_def(file_path, table_name, skip=0, comment_char='#', |
| 25 column_names=None, max_lines=100, load_named_columns=False, | 179 column_names=None, max_lines=100, load_named_columns=False, |
| 26 filters=None): | 180 firstlinenames=False, filters=None): |
| 27 col_pref = ['TEXT', 'REAL', 'INTEGER', None] | 181 col_pref = ['TEXT', 'REAL', 'INTEGER', None] |
| 28 col_types = [] | 182 col_types = [] |
| 29 col_idx = None | 183 col_idx = None |
| 184 col_names = [] | |
| 30 try: | 185 try: |
| 31 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, | 186 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, |
| 32 col_idx=None, filters=filters) | 187 col_idx=None, filters=filters) |
| 33 for linenum, fields in enumerate(tr): | 188 for linenum, fields in enumerate(tr): |
| 189 if linenum == 0 and firstlinenames: | |
| 190 col_names = [get_valid_column_name(name) or 'c%d' % (i + 1) | |
| 191 for i, name in enumerate(fields)] | |
| 192 continue | |
| 34 if linenum > max_lines: | 193 if linenum > max_lines: |
| 35 break | 194 break |
| 36 try: | 195 try: |
| 37 while len(col_types) < len(fields): | 196 while len(col_types) < len(fields): |
| 38 col_types.append(None) | 197 col_types.append(None) |
| 46 except Exception as e: | 205 except Exception as e: |
| 47 print('Failed: %s' % (e), file=sys.stderr) | 206 print('Failed: %s' % (e), file=sys.stderr) |
| 48 for i, col_type in enumerate(col_types): | 207 for i, col_type in enumerate(col_types): |
| 49 if not col_type: | 208 if not col_type: |
| 50 col_types[i] = 'TEXT' | 209 col_types[i] = 'TEXT' |
| 210 if not col_names: | |
| 211 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] | |
| 51 if column_names: | 212 if column_names: |
| 52 col_names = [] | |
| 53 if load_named_columns: | 213 if load_named_columns: |
| 54 col_idx = [] | 214 col_idx = [] |
| 215 cnames = [] | |
| 55 for i, cname in enumerate( | 216 for i, cname in enumerate( |
| 56 [cn.strip() for cn in column_names.split(',')]): | 217 [cn.strip() for cn in column_names.split(',')]): |
| 57 if cname != '': | 218 if cname != '': |
| 58 col_idx.append(i) | 219 col_idx.append(i) |
| 59 col_names.append(cname) | 220 cnames.append(cname) |
| 60 col_types = [col_types[i] for i in col_idx] | 221 col_types = [col_types[i] for i in col_idx] |
| 222 col_names = cnames | |
| 61 else: | 223 else: |
| 62 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] | |
| 63 for i, cname in enumerate( | 224 for i, cname in enumerate( |
| 64 [cn.strip() for cn in column_names.split(',')]): | 225 [cn.strip() for cn in column_names.split(',')]): |
| 65 if cname and i < len(col_names): | 226 if cname and i < len(col_names): |
| 66 col_names[i] = cname | 227 col_names[i] = cname |
| 67 else: | |
| 68 col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] | |
| 69 col_def = [] | 228 col_def = [] |
| 70 for i, col_name in enumerate(col_names): | 229 for i, col_name in enumerate(col_names): |
| 71 col_def.append('%s %s' % (col_names[i], col_types[i])) | 230 col_def.append('%s %s' % (col_names[i], col_types[i])) |
| 72 return col_names, col_types, col_def, col_idx | 231 return col_names, col_types, col_def, col_idx |
| 73 | 232 |
| 74 | 233 |
| 75 def create_table(conn, file_path, table_name, skip=0, comment_char='#', | 234 def create_table(conn, file_path, table_name, skip=0, comment_char='#', |
| 76 pkey_autoincr=None, column_names=None, | 235 pkey_autoincr=None, column_names=None, |
| 77 load_named_columns=False, filters=None, | 236 load_named_columns=False, firstlinenames=False, |
| 78 unique_indexes=[], indexes=[]): | 237 filters=None, unique_indexes=[], indexes=[]): |
| 79 col_names, col_types, col_def, col_idx = \ | 238 col_names, col_types, col_def, col_idx = \ |
| 80 get_column_def(file_path, table_name, skip=skip, | 239 get_column_def(file_path, table_name, skip=skip, |
| 81 comment_char=comment_char, column_names=column_names, | 240 comment_char=comment_char, column_names=column_names, |
| 82 load_named_columns=load_named_columns, filters=filters) | 241 load_named_columns=load_named_columns, |
| 242 firstlinenames=firstlinenames, | |
| 243 filters=filters) | |
| 83 col_func = [float if t == 'REAL' else int | 244 col_func = [float if t == 'REAL' else int |
| 84 if t == 'INTEGER' else str for t in col_types] | 245 if t == 'INTEGER' else str for t in col_types] |
| 85 table_def = 'CREATE TABLE %s (\n %s%s\n);' % ( | 246 table_def = 'CREATE TABLE %s (\n %s%s\n);' % ( |
| 86 table_name, | 247 table_name, |
| 87 '%s INTEGER PRIMARY KEY AUTOINCREMENT,' % | 248 '%s INTEGER PRIMARY KEY AUTOINCREMENT,' % |
| 96 try: | 257 try: |
| 97 c = conn.cursor() | 258 c = conn.cursor() |
| 98 c.execute(table_def) | 259 c.execute(table_def) |
| 99 conn.commit() | 260 conn.commit() |
| 100 c.close() | 261 c.close() |
| 101 for i, index in enumerate(unique_indexes): | 262 |
| 102 index_name = 'idx_uniq_%s_%d' % (table_name, i) | |
| 103 index_columns = index.split(',') | |
| 104 create_index(conn, table_name, index_name, index_columns, | |
| 105 unique=True) | |
| 106 for i, index in enumerate(indexes): | |
| 107 index_name = 'idx_%s_%d' % (table_name, i) | |
| 108 index_columns = index.split(',') | |
| 109 create_index(conn, table_name, index_name, index_columns) | |
| 110 c = conn.cursor() | 263 c = conn.cursor() |
| 111 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, | 264 tr = TabularReader(file_path, skip=skip, comment_char=comment_char, |
| 112 col_idx=col_idx, filters=filters) | 265 col_idx=col_idx, filters=filters) |
| 113 for linenum, fields in enumerate(tr): | 266 for linenum, fields in enumerate(tr): |
| 267 if linenum == 0 and firstlinenames: | |
| 268 continue | |
| 114 data_lines += 1 | 269 data_lines += 1 |
| 115 try: | 270 try: |
| 116 vals = [col_func[i](x) | 271 vals = [col_func[i](x) |
| 117 if x else None for i, x in enumerate(fields)] | 272 if x else None for i, x in enumerate(fields)] |
| 118 c.execute(insert_stmt, vals) | 273 c.execute(insert_stmt, vals) |
| 119 except Exception as e: | 274 except Exception as e: |
| 120 print('Failed at line: %d err: %s' % (linenum, e), | 275 print('Failed at line: %d err: %s' % (linenum, e), |
| 121 file=sys.stderr) | 276 file=sys.stderr) |
| 122 conn.commit() | 277 conn.commit() |
| 123 c.close() | 278 c.close() |
| 279 for i, index in enumerate(unique_indexes): | |
| 280 index_name = 'idx_uniq_%s_%d' % (table_name, i) | |
| 281 index_columns = index.split(',') | |
| 282 create_index(conn, table_name, index_name, index_columns, | |
| 283 unique=True) | |
| 284 for i, index in enumerate(indexes): | |
| 285 index_name = 'idx_%s_%d' % (table_name, i) | |
| 286 index_columns = index.split(',') | |
| 287 create_index(conn, table_name, index_name, index_columns) | |
| 124 except Exception as e: | 288 except Exception as e: |
| 125 exit('Error: %s' % (e)) | 289 exit('Error: %s' % (e)) |
| 126 | 290 |
| 127 | 291 |
| 128 def create_index(conn, table_name, index_name, index_columns, unique=False): | 292 def create_index(conn, table_name, index_name, index_columns, unique=False): |
| 129 index_def = "CREATE %s INDEX %s on %s(%s)" % ( | 293 index_def = 'CREATE %s INDEX %s on %s(%s)' % ( |
| 130 'UNIQUE' if unique else '', index_name, | 294 'UNIQUE' if unique else '', index_name, |
| 131 table_name, ','.join(index_columns)) | 295 table_name, ','.join(index_columns)) |
| 132 c = conn.cursor() | 296 try: |
| 133 c.execute(index_def) | 297 c = conn.cursor() |
| 134 conn.commit() | 298 c.execute(index_def) |
| 135 c.close() | 299 conn.commit() |
| 300 c.close() | |
| 301 except Exception as e: | |
| 302 print('Failed: %s err: %s' % (index_def, e), file=sys.stderr) | |
| 303 raise(e) |
