view rankfilter_GCMS/pdfread.py @ 17:94b62c8be01e

restored pdfread module
author linda.bakker@wur.nl <linda.bakker@wur.nl>
date Thu, 26 Mar 2015 09:27:51 +0100
parents
children
line wrap: on
line source

"""
Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University 

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

import sys
import csv

def _parse_hit_cells(cell):
    '''
    Extracts the fields of a single hit from its ';'-separated cells
    @param cell: cells of one hit: 7 when the hit has a CAS number, 6 when it has not
    @return: tuple (name, formula, forward, reverse, prob, cas, lib_id, nist_id),
             or None when the cells do not have the expected layout
    '''
    if len(cell) == 7:  # the compound has CAS number
        cas_cell, lib_cell, id_cell = cell[4], cell[5], cell[6]
    elif len(cell) == 6:  # the compound has no CAS number
        cas_cell, lib_cell, id_cell = None, cell[4], cell[5]
    else:  # missing (or too many) columns
        return None
    if len(cell[1].split(':')) != 2:
        return None

    try:
        # The first cell is "<hit number> : <name>". The name itself may
        # contain ':', so only the first ':' is treated as the separator.
        name_tmp = cell[0].split(':', 1)[1]
        forward = cell[1].split(':')[1].strip()
        reverse = cell[2].split(':')[1].strip()
        prob = cell[3].split(' ')[2].replace('%', '')
        if cas_cell is None:
            cas = 'undef'
        else:
            cas = cas_cell.split(':')[1].strip()
        lib_id = lib_cell.split(':')[1].strip()
        nist_id = id_cell.split(':')[1].replace('.', '').strip()
    except IndexError:
        # One of the cells lacks its "label: value" layout. Nothing has been
        # stored yet, so the caller can report the whole hit as missed.
        return None

    name = name_tmp.replace("  ", " ").strip()
    # If the name ends with a word that starts with C, F or H, then assume this
    # last word is a formula. An empty name has an empty last word, which
    # matches nothing and therefore yields 'not_def'.
    last_word = name_tmp.strip().split(' ')[-1]
    if last_word.startswith(('C', 'F', 'H')):
        formula = last_word
    else:
        formula = 'not_def'
    return name, formula, forward, reverse, prob, cas, lib_id, nist_id


def getPDF(filename, print_progress):
    '''
    Parses the text of a NIST search report
    (presumably the plain text produced from the NIST PDF by pdftotext - TODO confirm)
    @param filename: file to parse
    @param print_progress: when true, every hit is printed while it is processed
    @return: tuple of two dicts:
             - column name -> list of values (one entry per parsed hit)
             - 'Missed Compounds' / 'ID missed Compounds' -> hits that could
               not be parsed and the id of the spectrum they belong to
    '''
    with open(filename, 'r') as input_fh:
        nist_input = input_fh.read()

    hitid = []
    rt = []
    name = []
    forward = []
    cas = []
    reverse = []
    prob = []
    lib_id = []
    nist_id = []
    missed_compounds = []
    id_missed_compounds = []
    formula = []

    # Every report starts with this marker; the text in front of the first
    # marker does not belong to any report.
    hit_list = nist_input.split('** Search Report Page 1 of 1 **')
    hit_list.pop(0)
    line_id = 0
    for report in hit_list:
        report = report.strip().replace('\r', '')
        if report == '':
            continue
        # Put the report on one line, drop form feeds and cut it at every 'Hit'.
        # NOTE: because '\n' becomes ' ', the line structure of the report is
        # lost here; the formula is therefore guessed from the last word of
        # the name (see _parse_hit_cells). See test/data/download.pdf
        hits = report.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
        # The text before the first hit is the report header; its second word
        # is the id of the searched spectrum.
        spec_id = hits.pop(0).split(' ')[1]
        parsed_hits = 0
        for hh in hits:
            cell = hh.split(';')
            if print_progress:
                print('Processing line:  %s  with length:  %s :\n\t%s' % (line_id, len(cell), cell))
            line_id += 1
            fields = _parse_hit_cells(cell)
            if fields is None:
                missed_compounds.append(hh)
                id_missed_compounds.append(spec_id)
                continue
            name.append(fields[0])
            formula.append(fields[1])
            forward.append(fields[2])
            reverse.append(fields[3])
            prob.append(fields[4])
            cas.append(fields[5])
            lib_id.append(fields[6])
            nist_id.append(fields[7])
            parsed_hits += 1

        if parsed_hits:
            # NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit:
            spec_rt = str(float(spec_id.split('-')[3]) / 1e+06)
            hitid.extend([spec_id] * parsed_hits)
            rt.extend([spec_rt] * parsed_hits)

    NistInput = {}
    NistInput['ID'] = hitid
    NistInput['R.T.'] = rt
    NistInput['Name'] = name
    NistInput['CAS'] = cas
    NistInput['Formula'] = formula
    NistInput['Forward'] = forward
    NistInput['Reverse'] = reverse
    NistInput['Probability'] = prob
    NistInput['Library'] = lib_id
    NistInput['Library ID'] = nist_id
    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['ID missed Compounds'] = id_missed_compounds

    return NistInput, NistInput_missed


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF; when true every hit is printed while it is processed
    '''
    HitList, HitList_missed = getPDF(filename, print_progress)

    # Save Hitlist as tab separated file: a header row followed by one row per
    # hit. The keys are listed once so that header and columns are guaranteed
    # to be in the same order.
    colnames = list(HitList.keys())
    hit_rows = zip(*[HitList[colname] for colname in colnames])
    Hitlist_as_text = "\t".join(colnames) + "\n"
    Hitlist_as_text += "\n".join(["\t".join(row) for row in hit_rows])
    # 'with' closes the file even when writing fails
    with open(output_file, 'wb') as output_fh:
        output_fh.write(Hitlist_as_text)

    # Every hit that could not be parsed is reported on two lines: a fixed
    # message, then the spectrum id and the raw hit text.
    with open(error_file, 'wb') as out_missed_pdf:
        for compound, spec_id in zip(HitList_missed['Missed Compounds'], HitList_missed['ID missed Compounds']):
            out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
            out_missed_pdf.write('%s\n' % '\t'.join([spec_id, compound]))





def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as tuple
    @param filename: tabular file to read
    @return: dict: column name -> tuple with the values of that column;
             the tuples are empty when the file holds a header only
    '''
    with open(filename, 'r') as input_fh:
        # Only the line terminator is removed: a plain strip() would also eat
        # leading/trailing tabs and thereby drop empty first/last fields.
        colnames = input_fh.readline().rstrip('\r\n').split('\t')
        rows = [line.rstrip('\r\n').split('\t') for line in input_fh]

    # transform from row oriented structure to column oriented structure
    columns = list(zip(*rows))
    if not columns:
        # no data rows at all: every column is simply empty
        columns = [()] * len(colnames)

    # store the columns in form of final output
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = columns[colnumber]
    return RankFilterGC_format


if __name__ == '__main__':
    # Script usage: <input file> <hits output file> <failed hits output file>
    # Progress printing is always switched on when run as a script.
    input_path = sys.argv[1]
    hits_path = sys.argv[2]
    errors_path = sys.argv[3]
    convert_pdftotext2tabular(input_path, hits_path, errors_path, True)