Mercurial > repos > pieterlukasse > prims_metabolomics2
changeset 18:cc2f31d1bac0
restored pdfread module
author | linda.bakker@wur.nl <linda.bakker@wur.nl> |
---|---|
date | Thu, 26 Mar 2015 09:39:41 +0100 |
parents | 94b62c8be01e (diff) fe4682eb938c (current diff) |
children | 1cfe2b57d7f4 |
files | |
diffstat | 1 files changed, 203 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# File: rankfilter_GCMS/pdfread.py
"""
Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

import sys
import csv

# Text on which the input is split into individual search reports.
_REPORT_SEPARATOR = '** Search Report Page 1 of 1 **'

# Column names of the hit table, in the order they are stored in the result.
_HIT_COLUMNS = ('ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward', 'Reverse',
                'Probability', 'Library', 'Library ID')


def _field_value(cell):
    '''
    Returns the stripped second ':'-separated piece of a "label: value" field
    @param cell: one ';'-separated field of a hit
    '''
    return cell.split(':')[1].strip()


def _split_name_and_formula(name_cell):
    '''
    Extracts the compound name and (if present) its formula from the first
    field of a hit
    @param name_cell: first ';'-separated field of a hit ("<nr> : <name>")
    @return: tuple (name, formula); formula is 'not_def' when none is found
    '''
    # Everything after the first ':' is the name, so names that themselves
    # contain ':' are kept intact.
    raw_name = name_cell.split(':', 1)[1]
    # NOTE(review): the published source shows replace(" ", " "), a no-op that
    # looks like whitespace collapsed by the page rendering; double -> single
    # space is assumed here - confirm against the repository file.
    name = raw_name.replace("  ", " ").strip()
    last_word = raw_name.strip().split(' ')[-1]
    # If the name ends with a word that starts with C, F or H, assume this
    # last word is a formula. startswith() is also safe for an empty name,
    # where indexing the first character would raise IndexError.
    if last_word.startswith(('C', 'F', 'H')):
        formula = last_word
    else:
        formula = 'not_def'
    return name, formula


def _parse_hit(cell):
    '''
    Converts the ';'-separated fields of one hit into a record
    @param cell: list of fields of one hit
    @return: dict with the per-hit columns, or None when the hit does not have
             the expected layout (6 fields without CAS, 7 fields with CAS)
    '''
    if len(cell) == 7:      # the compound has a CAS number
        cas = None          # read below, once the layout has been validated
        library_cell, library_id_cell = cell[5], cell[6]
    elif len(cell) == 6:    # the compound has no CAS number
        cas = 'undef'
        library_cell, library_id_cell = cell[4], cell[5]
    else:                   # missing or extra columns
        return None
    if len(cell[1].split(':')) != 2:
        return None

    name, formula = _split_name_and_formula(cell[0])
    if cas is None:
        cas = _field_value(cell[4])
    return {
        'Name': name,
        'CAS': cas,
        'Formula': formula,
        'Forward': _field_value(cell[1]),
        'Reverse': _field_value(cell[2]),
        'Probability': cell[3].split(' ')[2].replace('%', ''),
        'Library': _field_value(library_cell),
        'Library ID': library_id_cell.split(':')[1].replace('.', '').strip(),
    }


def _write_text(path, text):
    '''
    Writes text to path in binary mode, so line endings are never translated
    @param path: file to (over)write
    @param text: content to write
    '''
    # On Python 2 a str already is a byte string; on Python 3 it has to be
    # encoded before it can be written to a binary handle.
    data = text if isinstance(text, bytes) else text.encode('utf-8')
    with open(path, 'wb') as handle:
        handle.write(data)


def getPDF(filename, print_progress):
    '''
    Parses NIST PDF file (text form)
    @param filename: PDF file to parse
    @param print_progress: when true, every processed hit is printed
    @return: tuple (hits, missed). hits maps each column name ('ID', 'R.T.',
             'Name', 'CAS', 'Formula', 'Forward', 'Reverse', 'Probability',
             'Library', 'Library ID') to a list of strings, one entry per
             parsed hit. missed maps 'Missed Compounds' and
             'ID missed Compounds' to lists describing the hits that could
             not be parsed.
    '''
    with open(filename, 'r') as handle:
        nist_input = handle.read()

    NistInput = {}
    for column in _HIT_COLUMNS:
        NistInput[column] = []
    missed_compounds = []
    id_missed_compounds = []

    # Whatever precedes the first separator is not a report and is dropped.
    reports = nist_input.split(_REPORT_SEPARATOR)[1:]
    line_id = 0
    for report in reports:
        report = report.strip().replace('\r', '')
        if not report:
            continue
        # Line breaks become spaces, so a hit that is wrapped over several
        # lines is handled as one piece of text.
        hits = (report.replace('\n', ' ').replace('\x0c', '')
                .replace('^L', '').split('Hit'))
        # The text in front of the first hit carries the spectrum id as its
        # second space-separated word.
        spec_id = hits.pop(0).split(' ')[1]
        parsed_count = 0
        for hit in hits:
            cell = hit.split(';')
            if print_progress:
                print('Processing line:  %s  with length:  %s :\n\t%s'
                      % (line_id, len(cell), cell))
            line_id += 1
            record = _parse_hit(cell)
            if record is None:
                missed_compounds.append(hit)
                id_missed_compounds.append(spec_id)
                continue
            for column, value in record.items():
                NistInput[column].append(value)
            parsed_count += 1

        if parsed_count:
            # NB: this is the RT as found in the "id" generated by e.g.
            # msclust, so NOT the RT of the library hit:
            retention_time = str(float(spec_id.split('-')[3]) / 1e+06)
            NistInput['ID'].extend([spec_id] * parsed_count)
            NistInput['R.T.'].extend([retention_time] * parsed_count)

    NistInput_missed = {
        'Missed Compounds': missed_compounds,
        'ID missed Compounds': id_missed_compounds,
    }
    return NistInput, NistInput_missed


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: when true, every processed hit is printed
    '''
    hit_list, hit_list_missed = getPDF(filename, print_progress)

    # Save the hit list as a tab separated file: one header line followed by
    # one line per hit (no trailing newline).
    header = list(hit_list.keys())
    rows = zip(*[hit_list[column] for column in header])
    table = "\t".join(header) + "\n"
    table += "\n".join("\t".join(row) for row in rows)
    _write_text(output_file, table)

    error_lines = []
    for compound, spec_id in zip(hit_list_missed['Missed Compounds'],
                                 hit_list_missed['ID missed Compounds']):
        error_lines.append("Line with incorrect format or unexpected number of fields:\n")
        error_lines.append('%s\n' % '\t'.join([spec_id, compound]))
    _write_text(error_file, ''.join(error_lines))


def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dict mapping each column header to a tuple with the values of
             that column; the tuples are empty when the file only has a header
    '''
    with open(filename, 'r') as input_fh:
        colnames = input_fh.readline().strip().split('\t')
        rows = [line.strip().split('\t') for line in input_fh]
    # transform from row oriented structure to column oriented structure
    columns = list(zip(*rows))
    if not rows:
        # A header without data rows yields empty columns, not an IndexError.
        columns = [()] * len(colnames)
    # store the list of list in form of final output
    return {colname: columns[index] for index, colname in enumerate(colnames)}


if __name__ == '__main__':
    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)