Mercurial > repos > devteam > blast_datatypes
comparison xml.py @ 0:e1c29f302301 draft
Uploaded
author | devteam |
---|---|
date | Fri, 17 Aug 2012 09:10:31 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e1c29f302301 |
---|---|
1 """ | |
2 BlastXml class | |
3 """ | |
4 | |
5 from galaxy.datatypes.data import get_file_peek | |
6 from galaxy.datatypes.data import Text | |
7 from galaxy.datatypes.xml import GenericXml | |
8 | |
class BlastXml(GenericXml):
    """NCBI Blast XML Output data"""
    file_ext = "blastxml"

    # DOCTYPE declarations accepted on the second line of a BLAST XML file.
    # Shared by sniff() and merge() so the two checks cannot drift apart.
    _DOCTYPE_LINES = (
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
    )

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``.

        When the underlying dataset has not been purged, the peek is taken
        from the file via ``get_file_peek``; otherwise fixed placeholder
        strings are stored instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is blastxml

        Only the first three lines are inspected (after stripping
        surrounding whitespace): the XML declaration, one of the accepted
        BlastOutput DOCTYPE declarations, and the ``<BlastOutput>`` opening
        tag. Returns True only if all three match.

        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> BlastXml().sniff( fname )
        False
        """
        handle = open(filename)
        # try/finally guarantees the handle is closed on every exit path,
        # including an unexpected read error.
        try:
            if handle.readline().strip() != '<?xml version="1.0"?>':
                return False
            if handle.readline().strip() not in self._DOCTYPE_LINES:
                return False
            return handle.readline().strip() == '<BlastOutput>'
        finally:
            handle.close()

    def merge(split_files, output_file):
        """Merge several BLAST XML files into a single BLAST XML file.

        A single input is delegated to ``Text.merge``. Otherwise the header
        of the first file (everything up to and including its first
        ``<Iteration>`` line) is written once, followed by the iteration
        content of every file up to (not including) its
        ``</BlastOutput_iterations>`` line, and finally the closing tags.

        split_files -- sequence of paths of the BLAST XML files to merge
        output_file -- path of the merged file to write

        Raises ValueError if an input is empty, is not (BLAST) XML, ends
        before its first ``<Iteration>``, has a header longer than 10000
        characters, or if the first 300 characters of its header differ
        from those of the first file. In several of these cases the header
        read so far is written to ``output_file`` to aid diagnosis.
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)
        out = open(output_file, "w")
        # Nested try/finally blocks close both handles on every exit path,
        # including exceptions not raised explicitly below (e.g. IOError
        # while opening or reading one of the later split files).
        try:
            old_header = None
            # Identify the first file by position rather than by comparing
            # paths, so a path listed twice is not treated as "first" again.
            for index, f in enumerate(split_files):
                h = open(f)
                try:
                    header = h.readline()
                    if not header:
                        raise ValueError("BLAST XML file %s was empty" % f)
                    if header.strip() != '<?xml version="1.0"?>':
                        out.write(header)  # for diagnosis
                        raise ValueError("%s is not an XML file!" % f)
                    line = h.readline()
                    header += line
                    if line.strip() not in BlastXml._DOCTYPE_LINES:
                        out.write(header)  # for diagnosis
                        raise ValueError("%s is not a BLAST XML file!" % f)
                    # Accumulate the header up to and including the first
                    # <Iteration> line.
                    while True:
                        line = h.readline()
                        if not line:
                            out.write(header)  # for diagnosis
                            raise ValueError("BLAST XML file %s ended prematurely" % f)
                        header += line
                        if "<Iteration>" in line:
                            break
                        if len(header) > 10000:
                            # Something has gone wrong, don't load too much into memory!
                            # Write what we have to the merged file for diagnostics
                            out.write(header)
                            raise ValueError("BLAST XML file %s has too long a header!" % f)
                    if "<BlastOutput>" not in header:
                        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
                    if index == 0:
                        out.write(header)
                        old_header = header
                    elif old_header[:300] != header[:300]:
                        # Enough to check <BlastOutput_program> and <BlastOutput_version> match
                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                                         % (split_files[0], f, old_header[:300], header[:300]))
                    else:
                        # Later files contribute only their iterations; their
                        # header was consumed above, so re-open the tag here.
                        out.write(" <Iteration>\n")
                    # NOTE(review): a file that lacks </BlastOutput_iterations>
                    # is copied to its end without complaint, as before.
                    for line in h:
                        if "</BlastOutput_iterations>" in line:
                            break
                        # TODO - Increment <Iteration_iter-num> and if required automatic query names
                        # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                        out.write(line)
                finally:
                    h.close()
            out.write(" </BlastOutput_iterations>\n")
            out.write("</BlastOutput>\n")
        finally:
            out.close()
    merge = staticmethod(merge)
124 |