Mercurial > repos > basfplant > interproscan_to_excel
comparison export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author | basfplant |
---|---|
date | Tue, 05 Mar 2013 04:00:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a9762cd6e2e3 |
---|---|
1 package be.cropdesign.iprscan; | |
2 /** | |
3 * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx) | |
4 * @author: Katrien Bernaerts and Domantas Motiejunas | |
5 * @date: 21/06/2012 | |
6 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium | |
7 */ | |
8 import java.awt.Color; | |
9 import java.io.File; | |
10 import java.io.IOException; | |
11 | |
12 import javax.xml.parsers.DocumentBuilder; | |
13 import javax.xml.parsers.DocumentBuilderFactory; | |
14 import javax.xml.parsers.FactoryConfigurationError; | |
15 import javax.xml.parsers.ParserConfigurationException; | |
16 | |
17 import org.apache.poi.ss.usermodel.CellStyle; | |
18 import org.apache.poi.xssf.usermodel.XSSFCell; | |
19 import org.apache.poi.xssf.usermodel.XSSFCellStyle; | |
20 import org.apache.poi.xssf.usermodel.XSSFColor; | |
21 import org.apache.poi.xssf.usermodel.XSSFFont; | |
22 import org.apache.poi.xssf.usermodel.XSSFRow; | |
23 import org.apache.poi.xssf.usermodel.XSSFSheet; | |
24 import org.apache.poi.xssf.usermodel.XSSFWorkbook; | |
25 import org.w3c.dom.Attr; | |
26 import org.w3c.dom.Document; | |
27 import org.w3c.dom.Element; | |
28 import org.w3c.dom.Node; | |
29 import org.w3c.dom.NodeList; | |
30 import org.xml.sax.SAXException; | |
31 | |
32 public class XMLToExcel { | |
33 /** | |
34 * fields needed to generate Excel | |
35 */ | |
36 private XSSFWorkbook wb; | |
37 private XSSFSheet sheet; | |
38 private XSSFRow myRow; | |
39 private XSSFCell myCell; | |
40 private String XMLFile; | |
41 | |
42 /** | |
43 * Fields/counters | |
44 */ | |
45 private int rownr; | |
46 private int colnr; | |
47 private int maxColnr; | |
48 | |
49 /** | |
50 * constructor | |
51 */ | |
52 public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){ | |
53 this.wb = wb; | |
54 this.sheet = sheet; | |
55 this.myRow = myRow; | |
56 this.myCell = myCell; | |
57 this.XMLFile = XMLFile; | |
58 rownr = 0; | |
59 colnr = 0; | |
60 makeDOMParser(); | |
61 setColumnWidth(); | |
62 } | |
63 | |
64 /** | |
65 * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes. | |
66 */ | |
67 public void makeDOMParser(){ | |
68 try { | |
69 File file = new File(XMLFile); | |
70 if (file.exists()) { | |
71 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); | |
72 DocumentBuilder db = dbf.newDocumentBuilder(); | |
73 Document doc = db.parse(file); | |
74 doc.getDocumentElement().normalize(); | |
75 | |
76 // parse protein nodes, the first level subnodes of root node interpro_matches | |
77 parseProteins(doc); | |
78 } else { | |
79 System.err.println("The XML file you try to convert to Excel does not exist."); | |
80 } | |
81 } catch (IOException ioe) { | |
82 System.err.println("Input/output exception: " + ioe); | |
83 } catch (SAXException saxe) { | |
84 System.err.println("SAX parsing exception: " + saxe); | |
85 } catch (FactoryConfigurationError fce) { | |
86 System.err.println("Factory configuration error: " + fce); | |
87 } catch (ParserConfigurationException pce) { | |
88 System.err.println("Parser configuration exception: " + pce); | |
89 } | |
90 } | |
91 | |
92 /************************** | |
93 * parse nodes and subnodes | |
94 *************************/ | |
95 /** | |
96 * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches | |
97 */ | |
98 public void parseProteins(Document doc){ | |
99 // parse through the protein nodes | |
100 NodeList proteinLst = doc.getElementsByTagName("protein"); | |
101 if (proteinLst != null && proteinLst.getLength() > 0) { | |
102 for (int i = 0; i < proteinLst.getLength(); i++) { | |
103 Node proteinNode = proteinLst.item(i); | |
104 if (proteinNode.getNodeType() == Node.ELEMENT_NODE ) { | |
105 Element proteinElement = (Element) proteinNode; | |
106 Attr proteinId = proteinElement.getAttributeNode("id"); | |
107 | |
108 myRow = sheet.createRow((short)rownr); | |
109 makeCellText(colnr, "PROTEIN"); | |
110 formatTitle(); | |
111 makeCellText(colnr+1, proteinId.getValue()); | |
112 formatTitle(); | |
113 makeRow(); | |
114 | |
115 // parse the child nodes of the protein nodes, namely interpro | |
116 parseInterpro(proteinNode); | |
117 }// end if proteinNode | |
118 }// end if proteinLst | |
119 } else { | |
120 System.out.print("XML file empty."); | |
121 } | |
122 } | |
123 | |
124 /** | |
125 * parse interpro nodes, which are child nodes of the protein nodes | |
126 */ | |
127 public void parseInterpro(Node proteinNode){ | |
128 // parse through the interpro nodes, which are subnodes of the protein node | |
129 NodeList interproLst = proteinNode.getChildNodes(); | |
130 if (interproLst != null && interproLst.getLength() > 0) { | |
131 for (int j = 0; j < interproLst.getLength(); j++) { | |
132 Node interproNode = interproLst.item(j); | |
133 if (interproNode.getNodeType() == Node.ELEMENT_NODE && interproNode.getNodeName().equals("interpro")) { | |
134 Element interproElement = (Element) interproNode; | |
135 Attr interproId = interproElement.getAttributeNode("id"); | |
136 makeCellText(colnr, interproId.getValue()); | |
137 makeBold(); | |
138 Attr interproName = interproElement.getAttributeNode("name"); | |
139 makeCellText(colnr+1, interproName.getValue()); | |
140 makeBold(); | |
141 Attr interproParentId = interproElement.getAttributeNode("parent_id"); | |
142 makeRow(); | |
143 | |
144 /* | |
145 * parse match node, a child node of the interpro node | |
146 */ | |
147 parseMatchInterproNode(interproNode); | |
148 | |
149 /* | |
150 * parent information from interpro attribute | |
151 */ | |
152 makeCellText(colnr, "parent "); | |
153 makeItalicRight(); | |
154 if (interproParentId != null) { | |
155 makeCellText(colnr+1, interproParentId.getValue()); | |
156 setUpperBorder(); | |
157 } | |
158 else { | |
159 makeCellText(colnr+1, "No parent"); | |
160 setUpperBorder(); | |
161 } | |
162 makeRow(); | |
163 | |
164 /* | |
165 * child_list subnode from interpro | |
166 */ | |
167 parseInterproChild("child_list ", "No children", interproNode); | |
168 | |
169 /* | |
170 * found_in subnode of interpro | |
171 */ | |
172 parseInterproChild("found_in ", "No entries", interproNode); | |
173 | |
174 /* | |
175 * contains subnode of interpro | |
176 */ | |
177 parseInterproChild("contains ", "No entries", interproNode); | |
178 | |
179 /* | |
180 * GO-terms subnode of interpro | |
181 */ | |
182 parseGOterms(interproNode); | |
183 | |
184 }// end if interproNode | |
185 }// end for interproLst, counter j | |
186 makeRow(); | |
187 }// end if interproLst | |
188 } | |
189 | |
190 /** | |
191 * method parseMatch to parse through the match nodes. | |
192 * The match nodes are child nodes of the interpro nodes | |
193 */ | |
194 public void parseMatchInterproNode(Node interproNode){ | |
195 makeTableHeaders(); | |
196 // parse through the child nodes of a node | |
197 NodeList interproChildLst = interproNode.getChildNodes(); | |
198 if (interproChildLst != null && interproChildLst.getLength() > 0) { | |
199 for (int k = 0; k < interproChildLst.getLength(); k++) { | |
200 Node interproChildNode = interproChildLst.item(k); | |
201 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("match")) { | |
202 Element matchElement = (Element) interproChildNode; | |
203 getMatchAttributes(matchElement); | |
204 // parse the location information of the matches. Location nodes are subnodes of match nodes. | |
205 parseLocationMatch(interproChildNode); | |
206 }// end if interproChildNode match | |
207 }// end for interproChildLst, counter k | |
208 }// end if interproChildLst | |
209 /* | |
210 * border line under table | |
211 */ | |
212 for (int i = 1; i < 8; i++){ | |
213 makeCellText(i, " "); | |
214 setUpperBorder(); | |
215 } | |
216 } | |
217 | |
218 /** | |
219 * method parseLocation to parse through the location nodes, which are child nodes of the match node. | |
220 * The match nodes can be one of the childnode types of the interpro node or childnodes of the | |
221 * protein node. | |
222 */ | |
223 public void parseLocationMatch(Node node){ | |
224 NodeList locationLst = node.getChildNodes(); | |
225 if (locationLst != null && locationLst.getLength() > 0) { | |
226 for (int l = 0; l < locationLst.getLength(); l++) { | |
227 Node locationNode = locationLst.item(l); | |
228 if (locationNode.getNodeType() == Node.ELEMENT_NODE) { | |
229 Element locationElement = (Element) locationNode; | |
230 getLocationAttributes(locationElement); | |
231 }// end if locationNode | |
232 }// end for locationLst, counter l | |
233 } // end if locationLst | |
234 } | |
235 | |
236 /** | |
237 * method parseInterProChild to parse the nodes child_list, found_in, contains, GO_terms and their contents. These nodes are childnodes of the interpro node. | |
238 */ | |
239 public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){ | |
240 makeCellText(0, nodeName); | |
241 makeItalicRight(); | |
242 makeCellText(1, emptyNode); | |
243 | |
244 // parse through the child nodes of an interpro node | |
245 NodeList interproChildLst = interproNode.getChildNodes(); | |
246 if (interproChildLst != null && interproChildLst.getLength() > 0) { | |
247 for (int k = 0; k < interproChildLst.getLength(); k++) { | |
248 Node interproChildNode = interproChildLst.item(k); | |
249 // parse the child node of the interpro node with as name nodeName (parameter of the method) | |
250 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals(nodeName.trim())) { | |
251 // parse through the rel_ref nodes which are childnodes of contains, found_in and child_list | |
252 NodeList rel_refLst = interproChildNode.getChildNodes(); | |
253 if (rel_refLst != null && rel_refLst.getLength() > 0) { | |
254 for (int s = 0; s < rel_refLst.getLength(); s++) { | |
255 Node ref_relNode = rel_refLst.item(s); | |
256 if (ref_relNode.getNodeType() == Node.ELEMENT_NODE) { | |
257 Element ref_relElement = (Element) ref_relNode; | |
258 Attr ipr_ref = ref_relElement.getAttributeNode("ipr_ref"); | |
259 colnr++; | |
260 makeCellText(colnr, ipr_ref.getValue()); | |
261 }// end if ref_relNode | |
262 }//end for ref_relLst | |
263 getMaxColumns(colnr + rel_refLst.getLength() + 1); | |
264 colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0 | |
265 }//end if relf_refLst | |
266 } // end if interproChildNode childnode | |
267 }// end for interproChildLst childnode | |
268 }// end if interproChildLst childnode | |
269 makeRow(); | |
270 } | |
271 | |
272 /** | |
273 * parse classification node, which is a subnode of the interpro node | |
274 */ | |
275 public void parseGOterms(Node interproNode){ | |
276 makeCellText(0, "GO terms "); | |
277 makeItalicRight(); | |
278 makeCellText(1, "no GO terms"); | |
279 int rowGO = rownr; | |
280 | |
281 // parse through the child nodes of an interpro node | |
282 NodeList interproChildLst = interproNode.getChildNodes(); | |
283 if (interproChildLst != null && interproChildLst.getLength() > 0) { | |
284 for (int k = 0; k < interproChildLst.getLength(); k++) { | |
285 Node interproChildNode = interproChildLst.item(k); | |
286 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("classification")) { | |
287 Element classificationElement = (Element) interproChildNode; | |
288 Attr classificationId = classificationElement.getAttributeNode("id"); | |
289 makeCellText(colnr+2, classificationId.getValue()); | |
290 // parse through the child nodes of a classification node | |
291 NodeList classificationLst = classificationElement.getChildNodes(); | |
292 if (classificationLst != null && classificationLst.getLength() > 0) { | |
293 for (int l=0; l < classificationLst.getLength(); l++) { | |
294 Node classificationChildNode = classificationLst.item(l); | |
295 if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("category")) { | |
296 Element categoryElement = (Element) classificationChildNode; | |
297 makeCellText(colnr+1, categoryElement.getTextContent()); | |
298 } | |
299 if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("description")) { | |
300 Element descriptionElement = (Element) classificationChildNode; | |
301 makeCellText(colnr+3, descriptionElement.getTextContent()); | |
302 } | |
303 } | |
304 }// end if classificationLst | |
305 makeRow(); | |
306 }// end if interproChildNode classification | |
307 }// end for interproChildLst, counter k | |
308 getMaxColumns(colnr + interproChildLst.getLength() + 1); | |
309 }// end if interproChildLst | |
310 /* | |
311 * insert empty row after the row GO term | |
312 */ | |
313 if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){ | |
314 makeRow(); | |
315 } | |
316 makeRow(); | |
317 } | |
318 | |
319 /** | |
320 * Get the attribute values of the match elements and fill them in in Excel | |
321 * @param matchElement | |
322 */ | |
323 public void getMatchAttributes(Element matchElement){ | |
324 Attr matchDbname = matchElement.getAttributeNode("dbname"); | |
325 makeCellText(colnr+1, matchDbname.getValue()); | |
326 Attr matchId = matchElement.getAttributeNode("id"); | |
327 makeCellText(colnr+2, matchId.getValue()); | |
328 Attr matchName = matchElement.getAttributeNode("name"); | |
329 makeCellText(colnr+3, matchName.getValue()); | |
330 wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped | |
331 } | |
332 | |
333 /** | |
334 * Get the attribute values of the location elements and fill them in in Excel | |
335 * @param locationElement | |
336 */ | |
337 public void getLocationAttributes(Element locationElement){ | |
338 // get the desired attribute values of the location node in the desired order | |
339 Attr locationScore = locationElement.getAttributeNode("score"); | |
340 Attr locationStart = locationElement.getAttributeNode("start"); | |
341 Attr locationEnd = locationElement.getAttributeNode("end"); | |
342 Attr locationStatus = locationElement.getAttributeNode("status"); | |
343 makeCellText(colnr+4, locationScore.getValue()); | |
344 makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue())); | |
345 makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue())); | |
346 makeCellText(colnr+7, locationStatus.getValue()); | |
347 makeRow(); | |
348 } | |
349 | |
350 /****************************** | |
351 * make rows and cells in Excel | |
352 ******************************/ | |
353 /** | |
354 * method to fill cells of a row in an Excel sheet with text content | |
355 */ | |
356 public void makeCellText(int colnr, String content){ | |
357 myCell = myRow.createCell((short)colnr); | |
358 myCell.setCellValue(content); | |
359 } | |
360 | |
361 /** | |
362 * method to fill cells of a row in an Excel sheet with number formatted content | |
363 */ | |
364 public void makeCellNumber(int colnr, int content){ | |
365 myCell = myRow.createCell((short)colnr); | |
366 myCell.setCellValue(content); | |
367 } | |
368 | |
369 /** | |
370 * method to make a new row in an Excel sheet | |
371 */ | |
372 public void makeRow(){ | |
373 rownr++; | |
374 myRow = sheet.createRow((short)rownr); | |
375 } | |
376 | |
377 /*********************************** | |
378 * table headers and titles in Excel | |
379 ***********************************/ | |
380 /** | |
381 * method to make table headers for the matches table | |
382 */ | |
383 public void makeTableHeaders(){ | |
384 // headers | |
385 makeCellText(0, "matches "); | |
386 makeItalicRight(); | |
387 //headers of table | |
388 String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"}; | |
389 for (int i =0; i < headers.length; i++){ | |
390 makeCellText(colnr+i+1, headers[i]); | |
391 setTitleBorder(); | |
392 } | |
393 getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth() | |
394 makeRow(); | |
395 // end headers | |
396 } | |
397 | |
398 /** | |
399 * format the protein titles | |
400 */ | |
401 public void formatTitle(){ | |
402 XSSFCellStyle style = wb.createCellStyle(); | |
403 XSSFFont font = wb.createFont(); | |
404 font.setColor(new XSSFColor(Color.BLUE)); | |
405 font.setBold(true); | |
406 font.setFontHeightInPoints((short)12); | |
407 style.setFont(font); | |
408 myCell.setCellStyle(style); | |
409 } | |
410 | |
411 /** | |
412 * provide the table headers of the matches part with border lines | |
413 */ | |
414 public void setTitleBorder(){ | |
415 XSSFCellStyle border = wb.createCellStyle(); | |
416 border.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE); | |
417 border.setBorderTop(XSSFCellStyle.BORDER_THIN); | |
418 myCell.setCellStyle(border); | |
419 } | |
420 | |
421 /** | |
422 * make a border | |
423 */ | |
424 public void setUpperBorder(){ | |
425 XSSFCellStyle border = wb.createCellStyle(); | |
426 border.setBorderTop(XSSFCellStyle.BORDER_THIN); | |
427 myCell.setCellStyle(border); | |
428 } | |
429 | |
430 /************ | |
431 * formatting | |
432 ***********/ | |
433 /** | |
434 * format text in bold and align text right | |
435 */ | |
436 public void makeBold(){ | |
437 XSSFCellStyle style = wb.createCellStyle(); | |
438 XSSFFont font = wb.createFont(); | |
439 font.setBold(true); | |
440 style.setFont(font); | |
441 myCell.setCellStyle(style); | |
442 } | |
443 | |
444 /** | |
445 * format text as italic and align right | |
446 */ | |
447 public void makeItalicRight(){ | |
448 XSSFCellStyle style = wb.createCellStyle(); | |
449 XSSFFont font = wb.createFont(); | |
450 font.setItalic(true); | |
451 style.setFont(font); | |
452 style.setAlignment(CellStyle.ALIGN_RIGHT); | |
453 myCell.setCellStyle(style); | |
454 } | |
455 | |
456 /** | |
457 * adapt column width depending on the content, except for some cases where the column width would become too big | |
458 */ | |
459 public void setColumnWidth(){ | |
460 for (int col = 0; col < maxColnr; col++){ | |
461 sheet.autoSizeColumn(col); | |
462 } | |
463 //for some columns, fixed size is desired above autosize | |
464 sheet.setColumnWidth(1, 256*15); | |
465 sheet.setColumnWidth(3, 256*27); | |
466 } | |
467 | |
468 /** | |
469 * Helper method for setColumnWitdt() | |
470 * Method determines the number of columns which is filled with content | |
471 * This number is needed to loop through the columns and autofit their width with method setColumnWidth() | |
472 * @return | |
473 */ | |
474 public void getMaxColumns(int colnr){ | |
475 if (colnr > maxColnr){ | |
476 maxColnr = colnr; | |
477 } | |
478 } | |
479 | |
480 /** | |
481 * if text does not fit on one line, this method wraps the text | |
482 */ | |
483 public void wrapText(){ | |
484 XSSFCellStyle wrap=wb.createCellStyle(); | |
485 wrap.setWrapText(true); | |
486 myCell.setCellStyle(wrap); | |
487 } | |
488 } |