0
|
1 package be.cropdesign.iprscan;
|
|
2 /**
|
|
3 * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx)
|
|
4 * @author: Katrien Bernaerts and Domantas Motiejunas
|
|
5 * @date: 21/06/2012
|
|
6 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
|
|
7 */
|
|
8 import java.awt.Color;
|
|
9 import java.io.File;
|
|
10 import java.io.IOException;
|
|
11
|
|
12 import javax.xml.parsers.DocumentBuilder;
|
|
13 import javax.xml.parsers.DocumentBuilderFactory;
|
|
14 import javax.xml.parsers.FactoryConfigurationError;
|
|
15 import javax.xml.parsers.ParserConfigurationException;
|
|
16
|
|
17 import org.apache.poi.ss.usermodel.CellStyle;
|
|
18 import org.apache.poi.xssf.usermodel.XSSFCell;
|
|
19 import org.apache.poi.xssf.usermodel.XSSFCellStyle;
|
|
20 import org.apache.poi.xssf.usermodel.XSSFColor;
|
|
21 import org.apache.poi.xssf.usermodel.XSSFFont;
|
|
22 import org.apache.poi.xssf.usermodel.XSSFRow;
|
|
23 import org.apache.poi.xssf.usermodel.XSSFSheet;
|
|
24 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
25 import org.w3c.dom.Attr;
|
|
26 import org.w3c.dom.Document;
|
|
27 import org.w3c.dom.Element;
|
|
28 import org.w3c.dom.Node;
|
|
29 import org.w3c.dom.NodeList;
|
|
30 import org.xml.sax.SAXException;
|
|
31
|
|
32 public class XMLToExcel {
|
|
33 /**
|
|
34 * fields needed to generate Excel
|
|
35 */
|
|
36 private XSSFWorkbook wb;
|
|
37 private XSSFSheet sheet;
|
|
38 private XSSFRow myRow;
|
|
39 private XSSFCell myCell;
|
|
40 private String XMLFile;
|
|
41
|
|
42 /**
|
|
43 * Fields/counters
|
|
44 */
|
|
45 private int rownr;
|
|
46 private int colnr;
|
|
47 private int maxColnr;
|
|
48
|
|
49 /**
|
|
50 * constructor
|
|
51 */
|
|
52 public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){
|
|
53 this.wb = wb;
|
|
54 this.sheet = sheet;
|
|
55 this.myRow = myRow;
|
|
56 this.myCell = myCell;
|
|
57 this.XMLFile = XMLFile;
|
|
58 rownr = 0;
|
|
59 colnr = 0;
|
|
60 makeDOMParser();
|
|
61 setColumnWidth();
|
|
62 }
|
|
63
|
|
64 /**
|
|
65 * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes.
|
|
66 */
|
|
67 public void makeDOMParser(){
|
|
68 try {
|
|
69 File file = new File(XMLFile);
|
|
70 if (file.exists()) {
|
|
71 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
|
72 DocumentBuilder db = dbf.newDocumentBuilder();
|
|
73 Document doc = db.parse(file);
|
|
74 doc.getDocumentElement().normalize();
|
|
75
|
|
76 // parse protein nodes, the first level subnodes of root node interpro_matches
|
|
77 parseProteins(doc);
|
|
78 } else {
|
|
79 System.err.println("The XML file you try to convert to Excel does not exist.");
|
|
80 }
|
|
81 } catch (IOException ioe) {
|
|
82 System.err.println("Input/output exception: " + ioe);
|
|
83 } catch (SAXException saxe) {
|
|
84 System.err.println("SAX parsing exception: " + saxe);
|
|
85 } catch (FactoryConfigurationError fce) {
|
|
86 System.err.println("Factory configuration error: " + fce);
|
|
87 } catch (ParserConfigurationException pce) {
|
|
88 System.err.println("Parser configuration exception: " + pce);
|
|
89 }
|
|
90 }
|
|
91
|
|
92 /**************************
|
|
93 * parse nodes and subnodes
|
|
94 *************************/
|
|
95 /**
|
|
96 * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches
|
|
97 */
|
|
98 public void parseProteins(Document doc){
|
|
99 // parse through the protein nodes
|
|
100 NodeList proteinLst = doc.getElementsByTagName("protein");
|
|
101 if (proteinLst != null && proteinLst.getLength() > 0) {
|
|
102 for (int i = 0; i < proteinLst.getLength(); i++) {
|
|
103 Node proteinNode = proteinLst.item(i);
|
|
104 if (proteinNode.getNodeType() == Node.ELEMENT_NODE ) {
|
|
105 Element proteinElement = (Element) proteinNode;
|
|
106 Attr proteinId = proteinElement.getAttributeNode("id");
|
|
107
|
|
108 myRow = sheet.createRow((short)rownr);
|
|
109 makeCellText(colnr, "PROTEIN");
|
|
110 formatTitle();
|
|
111 makeCellText(colnr+1, proteinId.getValue());
|
|
112 formatTitle();
|
|
113 makeRow();
|
|
114
|
|
115 // parse the child nodes of the protein nodes, namely interpro
|
|
116 parseInterpro(proteinNode);
|
|
117 }// end if proteinNode
|
|
118 }// end if proteinLst
|
|
119 } else {
|
|
120 System.out.print("XML file empty.");
|
|
121 }
|
|
122 }
|
|
123
|
|
124 /**
|
|
125 * parse interpro nodes, which are child nodes of the protein nodes
|
|
126 */
|
|
127 public void parseInterpro(Node proteinNode){
|
|
128 // parse through the interpro nodes, which are subnodes of the protein node
|
|
129 NodeList interproLst = proteinNode.getChildNodes();
|
|
130 if (interproLst != null && interproLst.getLength() > 0) {
|
|
131 for (int j = 0; j < interproLst.getLength(); j++) {
|
|
132 Node interproNode = interproLst.item(j);
|
|
133 if (interproNode.getNodeType() == Node.ELEMENT_NODE && interproNode.getNodeName().equals("interpro")) {
|
|
134 Element interproElement = (Element) interproNode;
|
|
135 Attr interproId = interproElement.getAttributeNode("id");
|
|
136 makeCellText(colnr, interproId.getValue());
|
|
137 makeBold();
|
|
138 Attr interproName = interproElement.getAttributeNode("name");
|
|
139 makeCellText(colnr+1, interproName.getValue());
|
|
140 makeBold();
|
|
141 Attr interproParentId = interproElement.getAttributeNode("parent_id");
|
|
142 makeRow();
|
|
143
|
|
144 /*
|
|
145 * parse match node, a child node of the interpro node
|
|
146 */
|
|
147 parseMatchInterproNode(interproNode);
|
|
148
|
|
149 /*
|
|
150 * parent information from interpro attribute
|
|
151 */
|
|
152 makeCellText(colnr, "parent ");
|
|
153 makeItalicRight();
|
|
154 if (interproParentId != null) {
|
|
155 makeCellText(colnr+1, interproParentId.getValue());
|
|
156 setUpperBorder();
|
|
157 }
|
|
158 else {
|
|
159 makeCellText(colnr+1, "No parent");
|
|
160 setUpperBorder();
|
|
161 }
|
|
162 makeRow();
|
|
163
|
|
164 /*
|
|
165 * child_list subnode from interpro
|
|
166 */
|
|
167 parseInterproChild("child_list ", "No children", interproNode);
|
|
168
|
|
169 /*
|
|
170 * found_in subnode of interpro
|
|
171 */
|
|
172 parseInterproChild("found_in ", "No entries", interproNode);
|
|
173
|
|
174 /*
|
|
175 * contains subnode of interpro
|
|
176 */
|
|
177 parseInterproChild("contains ", "No entries", interproNode);
|
|
178
|
|
179 /*
|
|
180 * GO-terms subnode of interpro
|
|
181 */
|
|
182 parseGOterms(interproNode);
|
|
183
|
|
184 }// end if interproNode
|
|
185 }// end for interproLst, counter j
|
|
186 makeRow();
|
|
187 }// end if interproLst
|
|
188 }
|
|
189
|
|
190 /**
|
|
191 * method parseMatch to parse through the match nodes.
|
|
192 * The match nodes are child nodes of the interpro nodes
|
|
193 */
|
|
194 public void parseMatchInterproNode(Node interproNode){
|
|
195 makeTableHeaders();
|
|
196 // parse through the child nodes of a node
|
|
197 NodeList interproChildLst = interproNode.getChildNodes();
|
|
198 if (interproChildLst != null && interproChildLst.getLength() > 0) {
|
|
199 for (int k = 0; k < interproChildLst.getLength(); k++) {
|
|
200 Node interproChildNode = interproChildLst.item(k);
|
|
201 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("match")) {
|
|
202 Element matchElement = (Element) interproChildNode;
|
|
203 getMatchAttributes(matchElement);
|
|
204 // parse the location information of the matches. Location nodes are subnodes of match nodes.
|
|
205 parseLocationMatch(interproChildNode);
|
|
206 }// end if interproChildNode match
|
|
207 }// end for interproChildLst, counter k
|
|
208 }// end if interproChildLst
|
|
209 /*
|
|
210 * border line under table
|
|
211 */
|
|
212 for (int i = 1; i < 8; i++){
|
|
213 makeCellText(i, " ");
|
|
214 setUpperBorder();
|
|
215 }
|
|
216 }
|
|
217
|
|
218 /**
|
|
219 * method parseLocation to parse through the location nodes, which are child nodes of the match node.
|
|
220 * The match nodes can be one of the childnode types of the interpro node or childnodes of the
|
|
221 * protein node.
|
|
222 */
|
|
223 public void parseLocationMatch(Node node){
|
|
224 NodeList locationLst = node.getChildNodes();
|
|
225 if (locationLst != null && locationLst.getLength() > 0) {
|
|
226 for (int l = 0; l < locationLst.getLength(); l++) {
|
|
227 Node locationNode = locationLst.item(l);
|
|
228 if (locationNode.getNodeType() == Node.ELEMENT_NODE) {
|
|
229 Element locationElement = (Element) locationNode;
|
|
230 getLocationAttributes(locationElement);
|
|
231 }// end if locationNode
|
|
232 }// end for locationLst, counter l
|
|
233 } // end if locationLst
|
|
234 }
|
|
235
|
|
236 /**
|
|
237 * method parseInterProChild to parse the nodes child_list, found_in, contains, GO_terms and their contents. These nodes are childnodes of the interpro node.
|
|
238 */
|
|
239 public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){
|
|
240 makeCellText(0, nodeName);
|
|
241 makeItalicRight();
|
|
242 makeCellText(1, emptyNode);
|
|
243
|
|
244 // parse through the child nodes of an interpro node
|
|
245 NodeList interproChildLst = interproNode.getChildNodes();
|
|
246 if (interproChildLst != null && interproChildLst.getLength() > 0) {
|
|
247 for (int k = 0; k < interproChildLst.getLength(); k++) {
|
|
248 Node interproChildNode = interproChildLst.item(k);
|
|
249 // parse the child node of the interpro node with as name nodeName (parameter of the method)
|
|
250 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals(nodeName.trim())) {
|
|
251 // parse through the rel_ref nodes which are childnodes of contains, found_in and child_list
|
|
252 NodeList rel_refLst = interproChildNode.getChildNodes();
|
|
253 if (rel_refLst != null && rel_refLst.getLength() > 0) {
|
|
254 for (int s = 0; s < rel_refLst.getLength(); s++) {
|
|
255 Node ref_relNode = rel_refLst.item(s);
|
|
256 if (ref_relNode.getNodeType() == Node.ELEMENT_NODE) {
|
|
257 Element ref_relElement = (Element) ref_relNode;
|
|
258 Attr ipr_ref = ref_relElement.getAttributeNode("ipr_ref");
|
|
259 colnr++;
|
|
260 makeCellText(colnr, ipr_ref.getValue());
|
|
261 }// end if ref_relNode
|
|
262 }//end for ref_relLst
|
|
263 getMaxColumns(colnr + rel_refLst.getLength() + 1);
|
|
264 colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0
|
|
265 }//end if relf_refLst
|
|
266 } // end if interproChildNode childnode
|
|
267 }// end for interproChildLst childnode
|
|
268 }// end if interproChildLst childnode
|
|
269 makeRow();
|
|
270 }
|
|
271
|
|
272 /**
|
|
273 * parse classification node, which is a subnode of the interpro node
|
|
274 */
|
|
275 public void parseGOterms(Node interproNode){
|
|
276 makeCellText(0, "GO terms ");
|
|
277 makeItalicRight();
|
|
278 makeCellText(1, "no GO terms");
|
|
279 int rowGO = rownr;
|
|
280
|
|
281 // parse through the child nodes of an interpro node
|
|
282 NodeList interproChildLst = interproNode.getChildNodes();
|
|
283 if (interproChildLst != null && interproChildLst.getLength() > 0) {
|
|
284 for (int k = 0; k < interproChildLst.getLength(); k++) {
|
|
285 Node interproChildNode = interproChildLst.item(k);
|
|
286 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("classification")) {
|
|
287 Element classificationElement = (Element) interproChildNode;
|
|
288 Attr classificationId = classificationElement.getAttributeNode("id");
|
|
289 makeCellText(colnr+2, classificationId.getValue());
|
|
290 // parse through the child nodes of a classification node
|
|
291 NodeList classificationLst = classificationElement.getChildNodes();
|
|
292 if (classificationLst != null && classificationLst.getLength() > 0) {
|
|
293 for (int l=0; l < classificationLst.getLength(); l++) {
|
|
294 Node classificationChildNode = classificationLst.item(l);
|
|
295 if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("category")) {
|
|
296 Element categoryElement = (Element) classificationChildNode;
|
|
297 makeCellText(colnr+1, categoryElement.getTextContent());
|
|
298 }
|
|
299 if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("description")) {
|
|
300 Element descriptionElement = (Element) classificationChildNode;
|
|
301 makeCellText(colnr+3, descriptionElement.getTextContent());
|
|
302 }
|
|
303 }
|
|
304 }// end if classificationLst
|
|
305 makeRow();
|
|
306 }// end if interproChildNode classification
|
|
307 }// end for interproChildLst, counter k
|
|
308 getMaxColumns(colnr + interproChildLst.getLength() + 1);
|
|
309 }// end if interproChildLst
|
|
310 /*
|
|
311 * insert empty row after the row GO term
|
|
312 */
|
|
313 if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){
|
|
314 makeRow();
|
|
315 }
|
|
316 makeRow();
|
|
317 }
|
|
318
|
|
319 /**
|
|
320 * Get the attribute values of the match elements and fill them in in Excel
|
|
321 * @param matchElement
|
|
322 */
|
|
323 public void getMatchAttributes(Element matchElement){
|
|
324 Attr matchDbname = matchElement.getAttributeNode("dbname");
|
|
325 makeCellText(colnr+1, matchDbname.getValue());
|
|
326 Attr matchId = matchElement.getAttributeNode("id");
|
|
327 makeCellText(colnr+2, matchId.getValue());
|
|
328 Attr matchName = matchElement.getAttributeNode("name");
|
|
329 makeCellText(colnr+3, matchName.getValue());
|
|
330 wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped
|
|
331 }
|
|
332
|
|
333 /**
|
|
334 * Get the attribute values of the location elements and fill them in in Excel
|
|
335 * @param locationElement
|
|
336 */
|
|
337 public void getLocationAttributes(Element locationElement){
|
|
338 // get the desired attribute values of the location node in the desired order
|
|
339 Attr locationScore = locationElement.getAttributeNode("score");
|
|
340 Attr locationStart = locationElement.getAttributeNode("start");
|
|
341 Attr locationEnd = locationElement.getAttributeNode("end");
|
|
342 Attr locationStatus = locationElement.getAttributeNode("status");
|
|
343 makeCellText(colnr+4, locationScore.getValue());
|
|
344 makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue()));
|
|
345 makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue()));
|
|
346 makeCellText(colnr+7, locationStatus.getValue());
|
|
347 makeRow();
|
|
348 }
|
|
349
|
|
350 /******************************
|
|
351 * make rows and cells in Excel
|
|
352 ******************************/
|
|
353 /**
|
|
354 * method to fill cells of a row in an Excel sheet with text content
|
|
355 */
|
|
356 public void makeCellText(int colnr, String content){
|
|
357 myCell = myRow.createCell((short)colnr);
|
|
358 myCell.setCellValue(content);
|
|
359 }
|
|
360
|
|
361 /**
|
|
362 * method to fill cells of a row in an Excel sheet with number formatted content
|
|
363 */
|
|
364 public void makeCellNumber(int colnr, int content){
|
|
365 myCell = myRow.createCell((short)colnr);
|
|
366 myCell.setCellValue(content);
|
|
367 }
|
|
368
|
|
369 /**
|
|
370 * method to make a new row in an Excel sheet
|
|
371 */
|
|
372 public void makeRow(){
|
|
373 rownr++;
|
|
374 myRow = sheet.createRow((short)rownr);
|
|
375 }
|
|
376
|
|
377 /***********************************
|
|
378 * table headers and titles in Excel
|
|
379 ***********************************/
|
|
380 /**
|
|
381 * method to make table headers for the matches table
|
|
382 */
|
|
383 public void makeTableHeaders(){
|
|
384 // headers
|
|
385 makeCellText(0, "matches ");
|
|
386 makeItalicRight();
|
|
387 //headers of table
|
|
388 String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"};
|
|
389 for (int i =0; i < headers.length; i++){
|
|
390 makeCellText(colnr+i+1, headers[i]);
|
|
391 setTitleBorder();
|
|
392 }
|
|
393 getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth()
|
|
394 makeRow();
|
|
395 // end headers
|
|
396 }
|
|
397
|
|
398 /**
|
|
399 * format the protein titles
|
|
400 */
|
|
401 public void formatTitle(){
|
|
402 XSSFCellStyle style = wb.createCellStyle();
|
|
403 XSSFFont font = wb.createFont();
|
|
404 font.setColor(new XSSFColor(Color.BLUE));
|
|
405 font.setBold(true);
|
|
406 font.setFontHeightInPoints((short)12);
|
|
407 style.setFont(font);
|
|
408 myCell.setCellStyle(style);
|
|
409 }
|
|
410
|
|
411 /**
|
|
412 * provide the table headers of the matches part with border lines
|
|
413 */
|
|
414 public void setTitleBorder(){
|
|
415 XSSFCellStyle border = wb.createCellStyle();
|
|
416 border.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE);
|
|
417 border.setBorderTop(XSSFCellStyle.BORDER_THIN);
|
|
418 myCell.setCellStyle(border);
|
|
419 }
|
|
420
|
|
421 /**
|
|
422 * make a border
|
|
423 */
|
|
424 public void setUpperBorder(){
|
|
425 XSSFCellStyle border = wb.createCellStyle();
|
|
426 border.setBorderTop(XSSFCellStyle.BORDER_THIN);
|
|
427 myCell.setCellStyle(border);
|
|
428 }
|
|
429
|
|
430 /************
|
|
431 * formatting
|
|
432 ***********/
|
|
433 /**
|
|
434 * format text in bold and align text right
|
|
435 */
|
|
436 public void makeBold(){
|
|
437 XSSFCellStyle style = wb.createCellStyle();
|
|
438 XSSFFont font = wb.createFont();
|
|
439 font.setBold(true);
|
|
440 style.setFont(font);
|
|
441 myCell.setCellStyle(style);
|
|
442 }
|
|
443
|
|
444 /**
|
|
445 * format text as italic and align right
|
|
446 */
|
|
447 public void makeItalicRight(){
|
|
448 XSSFCellStyle style = wb.createCellStyle();
|
|
449 XSSFFont font = wb.createFont();
|
|
450 font.setItalic(true);
|
|
451 style.setFont(font);
|
|
452 style.setAlignment(CellStyle.ALIGN_RIGHT);
|
|
453 myCell.setCellStyle(style);
|
|
454 }
|
|
455
|
|
456 /**
|
|
457 * adapt column width depending on the content, except for some cases where the column width would become too big
|
|
458 */
|
|
459 public void setColumnWidth(){
|
|
460 for (int col = 0; col < maxColnr; col++){
|
|
461 sheet.autoSizeColumn(col);
|
|
462 }
|
|
463 //for some columns, fixed size is desired above autosize
|
|
464 sheet.setColumnWidth(1, 256*15);
|
|
465 sheet.setColumnWidth(3, 256*27);
|
|
466 }
|
|
467
|
|
468 /**
|
|
469 * Helper method for setColumnWitdt()
|
|
470 * Method determines the number of columns which is filled with content
|
|
471 * This number is needed to loop through the columns and autofit their width with method setColumnWidth()
|
|
472 * @return
|
|
473 */
|
|
474 public void getMaxColumns(int colnr){
|
|
475 if (colnr > maxColnr){
|
|
476 maxColnr = colnr;
|
|
477 }
|
|
478 }
|
|
479
|
|
480 /**
|
|
481 * if text does not fit on one line, this method wraps the text
|
|
482 */
|
|
483 public void wrapText(){
|
|
484 XSSFCellStyle wrap=wb.createCellStyle();
|
|
485 wrap.setWrapText(true);
|
|
486 myCell.setCellStyle(wrap);
|
|
487 }
|
|
488 }
|