Mercurial > repos > cstrittmatter > test_galtrakr_eurl_vtec_wgs_pt_23
comparison scripts/modules/utils.py @ 0:e37910d2c794 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Mon, 20 Jan 2020 15:11:03 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e37910d2c794 |
|---|---|
| 1 import pickle | |
| 2 import traceback | |
| 3 import shlex | |
| 4 import subprocess | |
| 5 from threading import Timer | |
| 6 import shutil | |
| 7 import time | |
| 8 import functools | |
| 9 import os.path | |
| 10 import sys | |
| 11 import argparse | |
| 12 | |
| 13 | |
def start_logger(workdir):
    """Redirect ``sys.stdout`` through a ``Logger`` writing inside *workdir*.

    Returns a ``(logfile, time_str)`` tuple: the path reported by the logger
    and the ``%Y%m%d-%H%M%S`` timestamp used to build the log file name.
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    tee = Logger(workdir, timestamp)
    sys.stdout = tee
    return tee.getLogFile(), timestamp
| 19 | |
| 20 | |
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message is written both to the stream that was ``sys.stdout`` when
    the instance was created and to ``run.<time_str>.log`` inside
    *out_directory*.
    """

    def __init__(self, out_directory, time_str):
        self.logfile = os.path.join(out_directory, str('run.' + time_str + '.log'))
        # Keep the stream that was active at construction time so output is
        # still shown there once sys.stdout is replaced by this object.
        self.terminal = sys.stdout
        self.log = open(self.logfile, "w")

    def write(self, message):
        """Write *message* to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)
        # Flush on every write so the log file is current if the run dies.
        self.log.flush()

    def flush(self):
        """Flush both underlying streams.

        This used to be a no-op, which meant an explicit
        ``sys.stdout.flush()`` never reached the real terminal stream.
        """
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

    def getLogFile(self):
        """Return the path of the log file."""
        return self.logfile
| 37 | |
| 38 | |
def _leading_int(component):
    """Return the integer spelled by the leading digits of *component* (0 if none)."""
    digits = ''
    for character in str(component):
        if not character.isdigit():
            break
        digits += character
    return int(digits) if digits else 0


def _version_is_older(found_version, required_version):
    """Tell whether dotted *found_version* sorts before *required_version*."""
    required = [_leading_int(part) for part in required_version.split('.')]
    found = [_leading_int(part) for part in found_version.split('.')]
    # Missing trailing components count as 0, so '3.2' satisfies '3.2.0'.
    found.extend([0] * (len(required) - len(found)))
    # List comparison is lexicographic: 3.1 is newer than 2.5 even though
    # its minor number is smaller.
    return found[:len(required)] < required


def checkPrograms(programs_version_dictionary):
    """Check that every listed program is on PATH with an acceptable version.

    *programs_version_dictionary* maps a program name to a list
    ``[version_flag, operator, required_version]``. When ``version_flag`` is
    ``None`` only the presence of the program is checked. When ``operator``
    is ``'>='`` the found version must not be older than
    ``required_version``; any other operator demands an exact match. For
    names ending in ``.jar`` the located path is appended to the program's
    list.

    Returns a list of human-readable problems (empty when all checks pass).
    """
    print('\n' + 'Checking dependencies...')
    programs = programs_version_dictionary
    which_program = ['which', '']
    listMissings = []
    for program in programs:
        which_program[1] = program
        run_successfully, stdout, stderr = runCommandPopenCommunicate(which_program, False, None, False)
        if not run_successfully:
            listMissings.append(program + ' not found in PATH.')
            continue

        program_path = stdout.splitlines()[0]
        print(program_path)
        if programs[program][0] is None:
            print(program + ' (impossible to determine programme version) found at: ' + program_path)
            continue

        if program.endswith('.jar'):
            check_version = ['java', '-jar', program_path, programs[program][0]]
            programs[program].append(program_path)
        else:
            check_version = [program_path, programs[program][0]]
        run_successfully, stdout, stderr = runCommandPopenCommunicate(check_version, False, None, False)
        # Some programs report their version on stderr.
        if stdout == '':
            stdout = stderr

        # Guard against programs that print nothing at all.
        output_lines = stdout.splitlines()
        first_line = output_lines[0] if output_lines else ''
        if program == 'wget':
            fields = first_line.split(' ', 3)
            version_line = fields[2] if len(fields) > 2 else ''
        else:
            version_line = first_line.split(' ')[-1]
        for character in ('"', 'v', 'V', '+'):
            version_line = version_line.replace(character, '')
        print(program + ' (' + version_line + ') found')

        if programs[program][1] == '>=':
            # The previous check only ran for int components, but the
            # components are strings, so an outdated program was never
            # reported.
            version_ok = not _version_is_older(version_line, programs[program][2])
        else:
            version_ok = version_line == programs[program][2]
        if not version_ok:
            listMissings.append('It is required ' + program + ' with version ' + programs[program][1] + ' ' + programs[program][2])
    return listMissings
| 86 | |
| 87 | |
def requiredPrograms():
    """Exit with an error report unless the required external programs are usable.

    The only requirement listed here is ``rematch.py`` with version >= 3.2.
    """
    requirements = {'rematch.py': ['--version', '>=', '3.2']}
    problems = checkPrograms(requirements)
    if problems:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(problems))
| 94 | |
| 95 | |
def general_information(logfile, version, outdir, time_str):
    """Print the run banner and environment details, then check dependencies.

    Reports the start time, the log file, the command line, the current
    directory and the program version, and finally calls
    ``requiredPrograms()``. *outdir* and *time_str* are accepted but not
    used by this function.

    Returns the absolute path of the running script.
    """
    def _titled(title):
        # Every section starts with an empty line followed by its title.
        print('\n' + title)

    _titled('==========> patho_typing <==========')
    _titled('Program start: ' + time.ctime())

    # Where the log file is stored
    _titled('LOGFILE:')
    print(logfile)

    # The command line used for this run
    _titled('COMMAND:')
    script_path = os.path.abspath(sys.argv[0])
    arguments = ' '.join(sys.argv[1:])
    print(sys.executable + ' ' + script_path + ' ' + arguments)

    # The directory from which the program was launched
    _titled('PRESENT DIRECTORY:')
    present_directory = os.path.abspath(os.getcwd())
    print(present_directory)

    # The program version (plus git details when available)
    _titled('VERSION:')
    scriptVersionGit(version, present_directory, script_path)

    # External programs
    requiredPrograms()

    return script_path
| 124 | |
| 125 | |
def setPATHvariable(doNotUseProvidedSoftware, script_path):
    """Optionally prepend the bundled tool directories to ``PATH`` and print it.

    Unless *doNotUseProvidedSoftware* is true, the ``src/bowtie2-2.2.9``,
    ``src/samtools-1.3.1/bin`` and ``src/bcftools-1.3.1/bin`` directories
    next to *script_path* are placed in front of the current ``PATH``.
    """
    # A missing PATH is treated as empty instead of raising KeyError.
    path_variable = os.environ.get('PATH', '')
    script_folder = os.path.dirname(script_path)
    # Set path to use the provided software
    if not doNotUseProvidedSoftware:
        entries = [
            os.path.join(script_folder, 'src', 'bowtie2-2.2.9'),
            os.path.join(script_folder, 'src', 'samtools-1.3.1', 'bin'),
            os.path.join(script_folder, 'src', 'bcftools-1.3.1', 'bin'),
        ]
        # Skip an empty PATH: a trailing separator would add an empty entry.
        if path_variable:
            entries.append(path_variable)
        # os.pathsep is ':' on POSIX, so the result is unchanged there.
        os.environ['PATH'] = str(os.pathsep.join(entries))

    # Print PATH variable
    print('\n' + 'PATH variable:')
    print(os.environ.get('PATH', ''))
| 140 | |
| 141 | |
def scriptVersionGit(version, directory, script_path):
    """Print *version* and, when possible, git details of the script's folder.

    The git commands are run from the directory containing *script_path*;
    afterwards the working directory is set to *directory*. Any failure is
    reported as a harmless warning instead of being raised.
    """
    print('Version ' + version)

    try:
        try:
            os.chdir(os.path.dirname(script_path))
            command = ['git', 'log', '-1', '--date=local', '--pretty=format:"%h (%H) - Commit by %cn, %cd) : %s"']
            run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
            command = ['git', 'remote', 'show', 'origin']
            run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
        finally:
            # Always go back: previously a failing git call left the process
            # in the script's folder for the rest of the run.
            os.chdir(directory)
    except Exception:
        # Not a bare except, so Ctrl-C during the git calls is not swallowed.
        print('HARMLESS WARNING: git command possibly not found. The GitHub repository information will not be obtained.')
| 156 | |
| 157 | |
def runTime(start_time):
    """Print the time elapsed since *start_time* and return it in seconds.

    *start_time* is a ``time.time()`` value. The printed form is split into
    hours, minutes and seconds; the returned value is the total number of
    seconds rounded to two decimals.
    """
    elapsed = time.time() - start_time
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    pieces = ['Runtime :', str(hours), 'h:', str(minutes), 'm:', str(round(seconds, 2)), 's']
    print(''.join(pieces))
    return round(elapsed, 2)
| 165 | |
| 166 | |
def timer(function, name):
    """Wrap *function* so that its run time is reported and returned.

    The wrapper prints ``RUNNING <name>`` / ``END <name>`` around the call
    and returns a list whose first item is the elapsed time (as returned by
    ``runTime``) followed by the items *function* returned.
    """
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        print('\n' + 'RUNNING {0}\n'.format(name))
        started = time.time()

        # Materialise the result before timing stops, as the call may
        # return any iterable.
        outcome = list(function(*args, **kwargs))

        elapsed = runTime(started)
        print('END {0}'.format(name))

        return [elapsed] + outcome
    return wrapper
| 181 | |
| 182 | |
def removeDirectory(directory):
    """Delete *directory* and its contents; do nothing if it is not a directory."""
    if not os.path.isdir(directory):
        return
    shutil.rmtree(directory)
| 186 | |
| 187 | |
def saveVariableToPickle(variableToStore, pickleFile):
    """Serialise *variableToStore* with pickle into the file *pickleFile*."""
    with open(pickleFile, 'wb') as handle:
        pickle.dump(variableToStore, handle)
| 191 | |
| 192 | |
def extractVariableFromPickle(pickleFile):
    """Return the object stored with pickle in the file *pickleFile*."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickleFile, 'rb') as handle:
        return pickle.load(handle)
| 197 | |
| 198 | |
def trace_unhandled_exceptions(func):
    """Decorate *func* so that any exception is printed instead of raised.

    The wrapper returns whatever *func* returns. If *func* raises, the
    function name and the traceback are printed and ``None`` is returned.
    """
    @functools.wraps(func)
    def wrapped_func(*args, **kwargs):
        try:
            # The result used to be dropped, so decorated functions always
            # returned None.
            return func(*args, **kwargs)
        except BaseException:
            # Deliberately as broad as the bare ``except`` it replaces, so
            # failures that call sys.exit() are still traced, not raised.
            print('Exception in ' + func.__name__)
            traceback.print_exc()
    return wrapped_func
| 208 | |
| 209 | |
def kill_subprocess_Popen(subprocess_Popen, command):
    """Report that *command* ran out of time and kill its process."""
    message = 'Command run out of time: ' + str(command)
    print(message)
    subprocess_Popen.kill()
| 213 | |
| 214 | |
def runCommandPopenCommunicate(command, shell_True, timeout_sec_None, print_comand_True):
    """Run *command* and capture its output.

    *command* may be a string or a sequence of strings. Either way it is
    joined with spaces and re-split with ``shlex.split``, so quotes embedded
    in an item are interpreted and an item containing unquoted spaces is
    split into several arguments. When *shell_True* is true the command is
    run through the shell. When *timeout_sec_None* is not ``None`` the
    process is killed after that many seconds. *print_comand_True* controls
    whether the command is echoed before running.

    Returns ``(run_successfully, stdout, stderr)``, where
    ``run_successfully`` is true only for a zero exit status and the two
    outputs are whatever ``Popen.communicate()`` produced. On failure the
    captured output is printed.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str

    run_successfully = False
    if not isinstance(command, string_types):
        command = ' '.join(command)
    command = shlex.split(command)

    if print_comand_True:
        print('Running: ' + ' '.join(command))

    if shell_True:
        command = ' '.join(command)
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    else:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Filled in by the watchdog when it fires. This replaces the
    # ``Timer.isAlive()`` probe, which was racy right after ``cancel()`` and
    # no longer exists in Python 3.9+.
    killed_by_timer = []
    if timeout_sec_None is None:
        stdout, stderr = proc.communicate()
    else:
        def _kill_on_timeout():
            killed_by_timer.append(True)
            kill_subprocess_Popen(proc, command)

        watchdog = Timer(timeout_sec_None, _kill_on_timeout)
        watchdog.start()
        try:
            stdout, stderr = proc.communicate()
        finally:
            watchdog.cancel()

    if proc.returncode == 0:
        run_successfully = True
    else:
        # The kill handler already named the command, so do not repeat it.
        if not print_comand_True and not killed_by_timer:
            print('Running: ' + str(command))
        if len(stdout) > 0:
            print('STDOUT')
            print(stdout.decode("utf-8"))
        if len(stderr) > 0:
            print('STDERR')
            print(stderr.decode("utf-8"))
    return run_successfully, stdout, stderr
| 252 | |
| 253 | |
def required_length(tuple_length_options, argument_name):
    """Build an argparse action that restricts how many values an option takes.

    The returned ``argparse.Action`` subclass stores the parsed values on
    the namespace when their count is one of *tuple_length_options* and
    raises ``argparse.ArgumentTypeError`` naming *argument_name* otherwise.
    """
    class RequiredLength(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if len(values) not in tuple_length_options:
                # Use the enclosing function's argument_name: the action has
                # no such attribute, so the old ``self.argument_name`` lookup
                # raised AttributeError instead of this error.
                msg = 'Option {argument_name} requires one of the following number of arguments: {tuple_length_options}'.format(
                    argument_name=argument_name, tuple_length_options=tuple_length_options)
                raise argparse.ArgumentTypeError(msg)
            setattr(args, self.dest, values)
    return RequiredLength
| 263 | |
| 264 | |
def get_sequence_information(fasta_file, length_extra_seq):
    """Read a fasta file into a dictionary of sequence records.

    Returns ``(sequence_dict, headers)``. ``sequence_dict`` maps a 1-based
    sequence counter to ``{'header', 'sequence', 'length'}``, with the
    header lower-cased and the sequence upper-cased. ``headers`` maps each
    stored header to its counter. A sequence whose length is not greater
    than ``2 * length_extra_seq`` is reported and left out, although its
    counter value is still consumed.

    Exits through ``sys.exit`` on duplicated headers, on a blank line
    followed by more content, or on sequence data before the first header.
    """
    sequence_dict = {}
    headers = {}

    def _store(counter, record, sequence_parts):
        """Finish *record* and keep it when it is long enough."""
        record['sequence'] = ''.join(sequence_parts)
        if record['length'] - 2 * length_extra_seq > 0:
            sequence_dict[counter] = record
            headers[record['header'].lower()] = counter
        else:
            print(record['header'] + ' sequence ignored due to length <= 0')

    # The 'U' flag is needed for universal newlines on Python 2, but it is
    # rejected by Python 3.11+, where universal newlines are the default.
    open_mode = 'rtU' if sys.version_info[0] < 3 else 'rt'
    with open(fasta_file, open_mode) as reader:
        blank_line_found = False
        sequence_counter = 0
        record = None
        sequence_parts = []
        for line in reader:
            line = line.splitlines()[0]
            if len(line) == 0:
                # Blank lines are only tolerated at the end of the file.
                blank_line_found = True
                continue
            if blank_line_found:
                sys.exit('It was found a blank line between the fasta file above line ' + line)

            if line.startswith('>'):
                if record is not None:
                    _store(sequence_counter, record, sequence_parts)

                header = line[1:].lower()
                if header in headers:
                    sys.exit('Found duplicated sequence headers')

                sequence_counter += 1
                record = {'header': header, 'sequence': '', 'length': 0}
                sequence_parts = []
            elif record is None:
                # Previously this surfaced as an unexplained ``KeyError: 0``.
                sys.exit('Found sequence data before the first fasta header: ' + line)
            else:
                sequence_parts.append(line.upper())
                record['length'] += len(line)

    if record is not None:
        _store(sequence_counter, record, sequence_parts)

    return sequence_dict, headers
| 307 | |
| 308 | |
def simplify_sequence_dict(sequence_dict):
    """Re-key sequence records by their header.

    *sequence_dict* maps a counter to a record dictionary that has a
    ``'header'`` entry. The result maps each header to a copy of its record
    without the ``'header'`` entry. The records in *sequence_dict* are left
    untouched; previously the header was deleted from the caller's own
    dictionaries.
    """
    simple_sequence_dict = {}
    for info in sequence_dict.values():
        simple_sequence_dict[info['header']] = {
            field: value for field, value in info.items() if field != 'header'
        }
    return simple_sequence_dict
| 315 | |
| 316 | |
def chunkstring(string, length):
    """Return a generator over consecutive pieces of *string* of size *length*.

    The last piece is shorter when ``len(string)`` is not a multiple of
    *length*.
    """
    offsets = range(0, len(string), length)
    return (string[offset:offset + length] for offset in offsets)
| 319 | |
| 320 | |
def clean_headers_sequences(sequence_dict):
    """Replace problematic characters in sequence headers with ``_``.

    The records of *sequence_dict* are modified in place. Returns
    ``(sequence_dict, new_headers)``, where ``new_headers`` maps each
    lower-cased header to its key in *sequence_dict*.

    Exits through ``sys.exit`` when two records end up with the same
    header, because the header map would otherwise silently keep only one
    of them.
    """
    problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]

    headers_changed = False
    new_headers = {}
    for i in sequence_dict:
        header = sequence_dict[i]['header']
        if any(x in header for x in problematic_characters):
            for x in problematic_characters:
                header = header.replace(x, '_')
            sequence_dict[i]['header'] = header
            headers_changed = True

        # Distinct headers such as 'a|b' and 'a b' collapse to the same name.
        if header.lower() in new_headers:
            sys.exit('Found duplicated sequence headers after cleaning: ' + header)
        new_headers[header.lower()] = i

    if headers_changed:
        # Name the characters: the old message referred to "those
        # characters" without ever printing them.
        print('At least one of the characters ' + str(problematic_characters) + ' was found. Replacing those with _' + '\n')

    return sequence_dict, new_headers
