Mercurial > repos > pieterlukasse > prims_metabolomics2

--- a/rankfilter_GCMS/pdfread.py	Fri Mar 20 17:10:04 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,214 +0,0 @@
-"""
-Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-import sys
-import csv
-
-def getPDF(filename, print_progress):
-    '''
-    Parses NIST PDF file
-    @param filename: PDF file to parse
-    '''
-    NistInput = {}
-    NistInput_missed = {}
-    nist_input = open(filename, 'r').read()
-
-    hitid = []
-    rt = []
-    name = []
-    forward = []
-    cas = []
-    reverse = []
-    prob = []
-    lib_id = []
-    nist_id = []
-    missed_compounds = []
-    id_missed_compounds = []
-    formula = []
-
-    hit_list = nist_input.split('** Search Report Page 1 of 1 **')
-    hit_list.pop(0)
-    #number_hits = range(10)
-    line_id = 0
-    for line in hit_list:
-        line = line.strip().translate(None, '\r')
-        if line != '':
-            hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')  #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler!
-                                                                                                #strange....code seems fine actually...debug! See test/data/download.pdf
-                                                                                                # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux?
-            spec_id = hits.pop(0).split(' ')[1]
-            j = 0
-            for hh in hits:
-                cell = hh.split(';')
-                if print_progress == True:
-                    print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
-                line_id += 1
-                if len(cell) == 7:  # the compound has CAS number
-                    if len(cell[1].split(':')) == 2:
-                        forward.append((cell[1].split(':')[1]).strip())
-                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
-                        if len(cell[0].split(':')) > 2:
-                            name_tmp = ':'.join(cell[0].split(':')[1:])
-                        else:
-                            name_tmp = cell[0].split(':')[1]
-
-                        name.append(name_tmp.replace("  ", " ").strip())
-                        name_tmp = name_tmp.strip().split(' ')
-                        if name_tmp:
-                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
-                            if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
-                                formule = (name_tmp[-1])
-                            else:
-                                formule = ('not_def')
-                        else:
-                            formule = ('not_def')
-                        formula.append(formule.replace("  ", " "))
-                        reverse.append((cell[2].split(':')[1]).strip())
-                        prob.append(cell[3].split(' ')[2].replace('%', ''))
-                        cas.append((cell[4].split(':')[1]).strip())
-                        lib_id.append((cell[5].split(':')[1]).strip())
-                        nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
-                        j = j + 1
-                    else:
-                        missed_compounds.append(hh)
-                        id_missed_compounds.append(spec_id)
-
-                elif len(cell) == 6:  # the compound has no CAS number
-                    if len(cell[1].split(':')) == 2:
-
-                        forward.append((cell[1].split(':')[1]).strip())
-                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
-                        if len(cell[0].split(':')) > 2:
-                            name_tmp = ':'.join(cell[0].split(':')[1:])
-                        else:
-                            name_tmp = cell[0].split(':')[1]
-
-                        name.append(name_tmp.replace("  ", " ").strip())
-                        name_tmp = name_tmp.strip().split(' ')
-                        if name_tmp:
-                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
-                            if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
-                                formule = (name_tmp[-1])
-                            else:
-                                formule = ('not_def')
-                        else:
-                            formule = ('not_def')
-                        formula.append(formule.replace("  ", " "))
-                        reverse.append((cell[2].split(':')[1]).strip())
-                        prob.append(cell[3].split(' ')[2].replace('%', ''))
-                        cas.append('undef')
-                        lib_id.append((cell[4].split(':')[1]).strip())
-                        nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
-                        j = j + 1
-
-                    else:
-                        missed_compounds.append(hh)
-                        id_missed_compounds.append(spec_id)
-
-                else: # Missing columns, report and quit
-                    missed_compounds.append(hh)
-                    id_missed_compounds.append(spec_id)
-
-            for _ in range(j):
-                hitid.append(str(spec_id.replace("  ", " ")))
-                #NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit:
-                rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
-
-    NistInput['ID'] = hitid
-    NistInput['R.T.'] = rt
-    NistInput['Name'] = name
-    NistInput['CAS'] = cas
-    NistInput['Formula'] = formula
-    NistInput['Forward'] = forward
-    NistInput['Reverse'] = reverse
-    NistInput['Probability'] = prob
-    NistInput['Library'] = lib_id
-    NistInput['Library ID'] = nist_id
-    NistInput_missed['Missed Compounds'] = missed_compounds
-    NistInput_missed['ID missed Compounds'] = id_missed_compounds
-
-    return NistInput, NistInput_missed
-
-
-def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
-    '''
-    Converts NIST PDF file to tabular format
-    @param filename: PDF file to parse
-    @param output_file: output file for the hits
-    @param error_file: output file for failed hits
-    '''
-    [HitList, HitList_missed] = getPDF(filename, print_progress)
-    # save Hitlist as tab seperate file
-    Hitlist_as_text = "\t".join(HitList.keys()) + "\n"
-    Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])
-    Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)]))
-    output_fh = open(output_file, 'wb')
-    output_fh.write(Hitlist_as_text)
-    output_fh.close()
-
-    out_missed_pdf = open(error_file, 'wb')
-    for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['ID missed Compounds']):
-        out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
-        out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
-    out_missed_pdf.close()
-
-
-def read_tabular(in_csv):
-    '''
-    Parses a tab-separated file returning a dictionary with named columns
-    @param in_csv: input filename to be parsed
-    '''
-    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
-    header = data.pop(0)
-    # Create dictionary with column name as key
-    output = {}
-    for index in xrange(len(header)):
-        output[header[index]] = [row[index] for row in data]
-    return output
-
-
-def read_tabular_old(filename):
-    '''
-    Function to read tabular format (created by convert_pdftotext2tabular)
-    and output a dict with header of columns as key and value is columns of tabular as list
-    @param filename: tabular file to read
-    '''
-    input_fh = None
-    try:
-        input_fh = open(filename, 'r')
-    except IOError, error:
-        raise error
-    colnames = input_fh.readline().strip().split('\t')
-    cells = []
-    for line in input_fh.readlines():
-        cells.append(line.strip().split('\t'))
-    #transform from row oriented structure to column oriented structure
-    cells = zip(*cells)
-    #store the list of list in form of final output
-    RankFilterGC_format = {}
-    for colnumber in range(len(colnames)):
-        RankFilterGC_format[colnames[colnumber]] = cells[colnumber]
-    return RankFilterGC_format
-
-
-if __name__ == '__main__':
-    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)