view utils.py @ 3:910ebd2a6826

fix
author pieter.lukasse@wur.nl
date Thu, 22 Jan 2015 16:59:59 +0100
parents cce6989ed423
children 8c20185752da
line wrap: on
line source

'''
Created on 31 dec. 2014

@author: lukas007
'''
import csv
import io
import shutil
import subprocess
import sys
from collections import OrderedDict
    
def copy_dir(src, dst):
    '''
    Recursively copies the directory tree rooted at src to dst.
    Thin wrapper around shutil.copytree; nothing is returned.
    @param src: path of the directory to copy
    @param dst: destination path, handed unchanged to shutil.copytree
    '''
    shutil.copytree(src=src, dst=dst)

    
def copy_file(src, dst):
    '''
    Copies a single file.
    Thin wrapper around shutil.copy; nothing is returned.
    @param src: path of the file to copy
    @param dst: destination file or directory, handed unchanged to shutil.copy
    '''
    shutil.copy(src=src, dst=dst)

def get_process_list():
    '''
    Runs "ps -A" and returns what it wrote to standard output,
    split into a list with one item per output line.
    '''
    ps_process = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
    # Only stdout is piped, so the second element of communicate() carries nothing.
    stdout_data = ps_process.communicate()[0]
    return stdout_data.splitlines()

def get_process_pid(process_name):
    '''
    Looks up a process id in the listing returned by get_process_list().
    @param process_name: text to search for in each line of the listing
    @return: the first whitespace-separated token (as int) of the LAST line that
             contains process_name, or -1 when no line contains it
    '''
    matching_pids = [int(entry.split(None, 1)[0])
                     for entry in get_process_list()
                     if process_name in entry]
    if not matching_pids:
        return -1
    return matching_pids[-1]


def get_as_dict(in_tsv):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns.
    The first row is taken as the header; every header field becomes a key whose value is
    the list of (string) values found in that column, in file order.
    @param in_tsv: input filename to be parsed
    @return: OrderedDict mapping column name -> list of values, keys in header order;
             empty when the file contains no rows at all
    @raise IndexError: when a data row has fewer fields than the header
    '''
    # Universal newlines need the 'U' flag on Python 2 only; on Python 3 they are the
    # default and the flag itself is rejected from Python 3.11 onwards.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_tsv, read_mode) as tsv_file:
        rows = list(csv.reader(tsv_file, delimiter='\t'))

    # A file without even a header row has no columns to report
    if not rows:
        return OrderedDict()

    header, records = rows[0], rows[1:]
    # Create dictionary with column name as key (ordered, so the column order of the
    # file survives a round trip)
    output = OrderedDict()
    for index, column_name in enumerate(header):
        output[column_name] = [record[index] for record in records]
    return output

def save_dict_as_tsv(dict, out_tsv):
    '''
    Writes tab-separated data to file: one header row holding the dictionary keys,
    followed by one row per record. The number of records is taken from the first column.
    @param dict: dictionary mapping column name -> list of column values
                 (the parameter name shadows the builtin but is kept for existing callers)
    @param out_tsv: output tsv file
    @raise IndexError: when a column holds fewer values than the first column
    '''
    column_names = list(dict)

    # Open output file for writing: the csv module wants a binary file on Python 2 and
    # a text file opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        out_file = open(out_tsv, 'wb')
    else:
        out_file = open(out_tsv, 'w', newline='')

    # The with-block guarantees the data is flushed and the file closed on return
    with out_file:
        output_writer = csv.writer(out_file, delimiter="\t")

        # Write headers
        output_writer.writerow(column_names)

        # Write records (nothing to write when there are no columns at all)
        if column_names:
            for record_index in range(len(dict[column_names[0]])):
                output_writer.writerow([dict[name][record_index] for name in column_names])
            
            
            

# (prefix of a field inside a NIST hit line, output column the value is stored in)
_NIST_HIT_FIELDS = (
    (' MF: ', 'mf'),
    (' RMF: ', 'rmf'),
    (' Prob: ', 'prob'),
    (' CAS:', 'cas'),
    (' Mw: ', 'mw'),
    (' Id: ', 'id_in_lib'),
)


def _parse_nist_hit_fields(fields_text):
    '''
    Splits the ';'-separated "key: value" part of a NIST hit line into a dictionary
    keyed by output column name.
    @param fields_text: text such as '; MF: 900; RMF: 950; ...; Lib: ; Id: 1234.'
    @raise Exception: on an unknown field, or when not every expected field occurs exactly once
    '''
    parsed = {}
    found = 0
    for field in fields_text.split(';'):
        for prefix, column in _NIST_HIT_FIELDS:
            if field.startswith(prefix):
                parsed[column] = field[len(prefix):]
                found += 1
                break
        else:
            # the leading empty piece and the (emptied) ' Lib: ' piece are expected leftovers
            if field != '' and field != ' Lib: ':
                raise Exception('Error: unexpected field in NIST output: ' + field)

    # found counts every match; len(parsed) additionally exposes a duplicated field
    # that would otherwise mask a missing one and misalign the output columns
    expected = len(_NIST_HIT_FIELDS)
    if found != expected or len(parsed) != expected:
        raise Exception('Error: did not find all expected fields in NIST output')
    return parsed


def get_nist_out_as_dict(nist_result_file):
    '''
    Method to parse NIST specific output into a dictionary.
    Lines starting with 'Unknown' set the id used for the 'Hit' lines that follow them;
    every 'Hit' line adds one value to each column. Other lines are ignored.
    @param nist_result_file: result file as produced by NIST nistms$.exe
    @return: OrderedDict with the columns id, compound_name, formula, lib_name, id_in_lib,
             mf, rmf, prob, cas and mw, each holding a list with one string per hit
    @raise Exception: when a hit line precedes the first 'Unknown' line, holds an
                      unexpected field or lacks one of the expected fields
    '''
    # Create dictionary with column name as key
    output = OrderedDict()
    for column in ('id', 'compound_name', 'formula', 'lib_name', 'id_in_lib',
                   'mf', 'rmf', 'prob', 'cas', 'mw'):
        output[column] = []

    title_row = None
    # Decode once at the file boundary; undecodable bytes (seen in compound names) are
    # replaced instead of aborting the parse. io.open behaves the same on Python 2 and 3
    # and normalises '\r\n' line ends to '\n'.
    with io.open(nist_result_file, encoding='utf-8', errors='replace') as nist_file:
        for line in nist_file:
            row = line.split('<<')
            if row[0].startswith('Unknown'):
                title_row = row[0]
                continue
            if not row[0].startswith('Hit'):
                continue
            if title_row is None:
                raise Exception('Error: found a Hit line before any Unknown line in NIST output')

            hit = row
            # the remaining fields are spread over the text following formula and lib name
            other_fields = _parse_nist_hit_fields(hit[2].split('>>')[1] + hit[3].split('>>')[1])

            # Id is the last item on the line: drop the line end and the closing '.'
            # (also correct for a last line without newline)
            id_in_lib = other_fields['id_in_lib'].strip()
            if id_in_lib.endswith('.'):
                id_in_lib = id_in_lib[:-1]

            # append only after the whole line parsed, so all columns stay equally long
            output['id'].append(title_row.split(': ')[1].split('  ')[0])
            output['compound_name'].append(hit[1].split('>>')[0])
            output['formula'].append(hit[2].split('>>')[0])
            output['lib_name'].append(hit[3].split('>>')[0])
            output['id_in_lib'].append(id_in_lib)
            for column in ('mf', 'rmf', 'prob', 'cas', 'mw'):
                output[column].append(other_fields[column])

    return output

def get_spectra_file_as_dict(spectrum_file):
    '''
    Method to parse spectra file in NIST MSP input format into a dictionary.
    The idea is to parse the following :
        
        Name: spectrum1
        DB#: 1
        Num Peaks: 87
        14 8; 15 15; 27 18; 28 15; 29 15; 
        30 11; 32 19; 39 32; 40 12; 41 68;
    
    into: 
    
        dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15; 30 11; 32 19; 39 32; 40 12; 41 68;"
    
    Only lines starting with 'Name: ' or with a digit are used; all other lines are skipped.
    
    @param spectrum_file: spectra file in MSP format (e.g. also the format returned by MsClust)
    @return: OrderedDict mapping spectrum name -> concatenated peak lines, in file order;
             empty when the file holds no 'Name: ' line
    '''
    name_prefix = 'Name: '
    output = OrderedDict()
    name = ''
    peak_lines = []
    with open(spectrum_file) as spectra:
        for line in spectra:
            # drop the line end, whether '\n' or '\r\n'
            line = line.rstrip('\r\n')
            if line.startswith(name_prefix):
                if name != '':
                    # store the spectrum collected for the previous name:
                    output[name] = ''.join(peak_lines)
                name = line[len(name_prefix):]
                peak_lines = []
            elif line[:1].isdigit():
                # peak lines are the ones starting with a digit:
                peak_lines.append(line)

    # store also last spectrum (there is none when no name was ever seen):
    if name != '':
        output[name] = ''.join(peak_lines)

    return output