Mercurial > repos > devteam > blast_datatypes
comparison xml.py @ 0:e1c29f302301 draft
Uploaded
| author | devteam |
|---|---|
| date | Fri, 17 Aug 2012 09:10:31 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e1c29f302301 |
|---|---|
| 1 """ | |
| 2 BlastXml class | |
| 3 """ | |
| 4 | |
| 5 from galaxy.datatypes.data import get_file_peek | |
| 6 from galaxy.datatypes.data import Text | |
| 7 from galaxy.datatypes.xml import GenericXml | |
| 8 | |
class BlastXml(GenericXml):
    """NCBI Blast XML Output data"""
    file_ext = "blastxml"

    # First line expected in every BLAST XML file.
    _XML_DECLARATION = '<?xml version="1.0"?>'
    # Accepted second lines: the DOCTYPE with either the full NCBI URL or the
    # bare DTD file name.
    _DOCTYPE_LINES = (
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
    )

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``.

        If the underlying dataset is not purged, the peek is taken from the
        file via ``get_file_peek``; otherwise fixed placeholder texts are
        stored for both peek and blurb.
        """
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is blastxml

        Only the first three lines are inspected: the XML declaration, one of
        the accepted BlastOutput DOCTYPE lines, and the ``<BlastOutput>`` root
        tag (each compared after stripping surrounding whitespace).

        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> BlastXml().sniff( fname )
        False
        """
        handle = open(filename)
        # try/finally guarantees the handle is closed on every path, including
        # read errors. NOTE(review): plain try/finally is used instead of
        # ``with`` because the original TODO hinted at old-interpreter
        # support - switch to ``with`` if that is no longer a concern.
        try:
            if handle.readline().strip() != self._XML_DECLARATION:
                return False
            if handle.readline().strip() not in self._DOCTYPE_LINES:
                return False
            return handle.readline().strip() == '<BlastOutput>'
        finally:
            handle.close()

    @staticmethod
    def merge(split_files, output_file):
        """Merge several BLAST XML files into ``output_file``.

        A single input file is delegated to ``Text.merge``. Otherwise the
        header (everything up to and including the first ``<Iteration>`` line)
        of the first file is written once, followed by the iteration content
        of every file up to its ``</BlastOutput_iterations>`` line, and finally
        the closing tags.

        Raises ValueError if an input is empty, does not start with the
        expected XML declaration / DOCTYPE, ends before an ``<Iteration>``
        line, has a header longer than 10000 characters, lacks
        ``<BlastOutput>`` in its header, or if the first 300 characters of its
        header differ from those of the first file. In several of these cases
        the offending header is written to ``output_file`` first, for
        diagnosis. Both file handles are always closed, also on error.
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)
        out = open(output_file, "w")
        try:
            old_header = None
            # The position in the list (not the path) identifies the first
            # file, so a repeated path cannot cause the header to be emitted
            # a second time in the middle of the output.
            for index, f in enumerate(split_files):
                h = open(f)
                try:
                    header = h.readline()
                    if not header:
                        raise ValueError("BLAST XML file %s was empty" % f)
                    if header.strip() != BlastXml._XML_DECLARATION:
                        out.write(header)  # for diagnosis
                        raise ValueError("%s is not an XML file!" % f)
                    line = h.readline()
                    header += line
                    if line.strip() not in BlastXml._DOCTYPE_LINES:
                        out.write(header)  # for diagnosis
                        raise ValueError("%s is not a BLAST XML file!" % f)
                    # Accumulate the header up to and including the first
                    # <Iteration> line.
                    while True:
                        line = h.readline()
                        if not line:
                            out.write(header)  # for diagnosis
                            raise ValueError("BLAST XML file %s ended prematurely" % f)
                        header += line
                        if "<Iteration>" in line:
                            break
                        if len(header) > 10000:
                            # Something has gone wrong, don't load too much into memory!
                            # Write what we have to the merged file for diagnostics
                            out.write(header)
                            raise ValueError("BLAST XML file %s has too long a header!" % f)
                    if "<BlastOutput>" not in header:
                        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
                    if index == 0:
                        out.write(header)
                        old_header = header
                    elif old_header[:300] != header[:300]:
                        # Enough to check <BlastOutput_program> and <BlastOutput_version> match
                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                                         % (split_files[0], f, old_header[:300], header[:300]))
                    else:
                        # The header (which ended with the <Iteration> line)
                        # is dropped for later files, so re-open the iteration.
                        out.write(" <Iteration>\n")
                    for line in h:
                        if "</BlastOutput_iterations>" in line:
                            break
                        # TODO - Increment <Iteration_iter-num> and if required automatic query names
                        # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                        out.write(line)
                finally:
                    h.close()
            out.write(" </BlastOutput_iterations>\n")
            out.write("</BlastOutput>\n")
        finally:
            out.close()
| 124 |
