comparison export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java @ 0:a9762cd6e2e3 draft default tip

Uploaded
author basfplant
date Tue, 05 Mar 2013 04:00:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a9762cd6e2e3
1 package be.cropdesign.iprscan;
2 /**
3 * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx)
4 * @author: Katrien Bernaerts and Domantas Motiejunas
5 * @date: 21/06/2012
6 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
7 */
8 import java.awt.Color;
9 import java.io.File;
10 import java.io.IOException;
11
12 import javax.xml.parsers.DocumentBuilder;
13 import javax.xml.parsers.DocumentBuilderFactory;
14 import javax.xml.parsers.FactoryConfigurationError;
15 import javax.xml.parsers.ParserConfigurationException;
16
17 import org.apache.poi.ss.usermodel.CellStyle;
18 import org.apache.poi.xssf.usermodel.XSSFCell;
19 import org.apache.poi.xssf.usermodel.XSSFCellStyle;
20 import org.apache.poi.xssf.usermodel.XSSFColor;
21 import org.apache.poi.xssf.usermodel.XSSFFont;
22 import org.apache.poi.xssf.usermodel.XSSFRow;
23 import org.apache.poi.xssf.usermodel.XSSFSheet;
24 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
25 import org.w3c.dom.Attr;
26 import org.w3c.dom.Document;
27 import org.w3c.dom.Element;
28 import org.w3c.dom.Node;
29 import org.w3c.dom.NodeList;
30 import org.xml.sax.SAXException;
31
32 public class XMLToExcel {
33 /**
34 * fields needed to generate Excel
35 */
36 private XSSFWorkbook wb;
37 private XSSFSheet sheet;
38 private XSSFRow myRow;
39 private XSSFCell myCell;
40 private String XMLFile;
41
42 /**
43 * Fields/counters
44 */
45 private int rownr;
46 private int colnr;
47 private int maxColnr;
48
49 /**
50 * constructor
51 */
52 public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){
53 this.wb = wb;
54 this.sheet = sheet;
55 this.myRow = myRow;
56 this.myCell = myCell;
57 this.XMLFile = XMLFile;
58 rownr = 0;
59 colnr = 0;
60 makeDOMParser();
61 setColumnWidth();
62 }
63
64 /**
65 * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes.
66 */
67 public void makeDOMParser(){
68 try {
69 File file = new File(XMLFile);
70 if (file.exists()) {
71 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
72 DocumentBuilder db = dbf.newDocumentBuilder();
73 Document doc = db.parse(file);
74 doc.getDocumentElement().normalize();
75
76 // parse protein nodes, the first level subnodes of root node interpro_matches
77 parseProteins(doc);
78 } else {
79 System.err.println("The XML file you try to convert to Excel does not exist.");
80 }
81 } catch (IOException ioe) {
82 System.err.println("Input/output exception: " + ioe);
83 } catch (SAXException saxe) {
84 System.err.println("SAX parsing exception: " + saxe);
85 } catch (FactoryConfigurationError fce) {
86 System.err.println("Factory configuration error: " + fce);
87 } catch (ParserConfigurationException pce) {
88 System.err.println("Parser configuration exception: " + pce);
89 }
90 }
91
92 /**************************
93 * parse nodes and subnodes
94 *************************/
95 /**
96 * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches
97 */
98 public void parseProteins(Document doc){
99 // parse through the protein nodes
100 NodeList proteinLst = doc.getElementsByTagName("protein");
101 if (proteinLst != null && proteinLst.getLength() > 0) {
102 for (int i = 0; i < proteinLst.getLength(); i++) {
103 Node proteinNode = proteinLst.item(i);
104 if (proteinNode.getNodeType() == Node.ELEMENT_NODE ) {
105 Element proteinElement = (Element) proteinNode;
106 Attr proteinId = proteinElement.getAttributeNode("id");
107
108 myRow = sheet.createRow((short)rownr);
109 makeCellText(colnr, "PROTEIN");
110 formatTitle();
111 makeCellText(colnr+1, proteinId.getValue());
112 formatTitle();
113 makeRow();
114
115 // parse the child nodes of the protein nodes, namely interpro
116 parseInterpro(proteinNode);
117 }// end if proteinNode
118 }// end if proteinLst
119 } else {
120 System.out.print("XML file empty.");
121 }
122 }
123
124 /**
125 * parse interpro nodes, which are child nodes of the protein nodes
126 */
127 public void parseInterpro(Node proteinNode){
128 // parse through the interpro nodes, which are subnodes of the protein node
129 NodeList interproLst = proteinNode.getChildNodes();
130 if (interproLst != null && interproLst.getLength() > 0) {
131 for (int j = 0; j < interproLst.getLength(); j++) {
132 Node interproNode = interproLst.item(j);
133 if (interproNode.getNodeType() == Node.ELEMENT_NODE && interproNode.getNodeName().equals("interpro")) {
134 Element interproElement = (Element) interproNode;
135 Attr interproId = interproElement.getAttributeNode("id");
136 makeCellText(colnr, interproId.getValue());
137 makeBold();
138 Attr interproName = interproElement.getAttributeNode("name");
139 makeCellText(colnr+1, interproName.getValue());
140 makeBold();
141 Attr interproParentId = interproElement.getAttributeNode("parent_id");
142 makeRow();
143
144 /*
145 * parse match node, a child node of the interpro node
146 */
147 parseMatchInterproNode(interproNode);
148
149 /*
150 * parent information from interpro attribute
151 */
152 makeCellText(colnr, "parent ");
153 makeItalicRight();
154 if (interproParentId != null) {
155 makeCellText(colnr+1, interproParentId.getValue());
156 setUpperBorder();
157 }
158 else {
159 makeCellText(colnr+1, "No parent");
160 setUpperBorder();
161 }
162 makeRow();
163
164 /*
165 * child_list subnode from interpro
166 */
167 parseInterproChild("child_list ", "No children", interproNode);
168
169 /*
170 * found_in subnode of interpro
171 */
172 parseInterproChild("found_in ", "No entries", interproNode);
173
174 /*
175 * contains subnode of interpro
176 */
177 parseInterproChild("contains ", "No entries", interproNode);
178
179 /*
180 * GO-terms subnode of interpro
181 */
182 parseGOterms(interproNode);
183
184 }// end if interproNode
185 }// end for interproLst, counter j
186 makeRow();
187 }// end if interproLst
188 }
189
190 /**
191 * method parseMatch to parse through the match nodes.
192 * The match nodes are child nodes of the interpro nodes
193 */
194 public void parseMatchInterproNode(Node interproNode){
195 makeTableHeaders();
196 // parse through the child nodes of a node
197 NodeList interproChildLst = interproNode.getChildNodes();
198 if (interproChildLst != null && interproChildLst.getLength() > 0) {
199 for (int k = 0; k < interproChildLst.getLength(); k++) {
200 Node interproChildNode = interproChildLst.item(k);
201 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("match")) {
202 Element matchElement = (Element) interproChildNode;
203 getMatchAttributes(matchElement);
204 // parse the location information of the matches. Location nodes are subnodes of match nodes.
205 parseLocationMatch(interproChildNode);
206 }// end if interproChildNode match
207 }// end for interproChildLst, counter k
208 }// end if interproChildLst
209 /*
210 * border line under table
211 */
212 for (int i = 1; i < 8; i++){
213 makeCellText(i, " ");
214 setUpperBorder();
215 }
216 }
217
218 /**
219 * method parseLocation to parse through the location nodes, which are child nodes of the match node.
220 * The match nodes can be one of the childnode types of the interpro node or childnodes of the
221 * protein node.
222 */
223 public void parseLocationMatch(Node node){
224 NodeList locationLst = node.getChildNodes();
225 if (locationLst != null && locationLst.getLength() > 0) {
226 for (int l = 0; l < locationLst.getLength(); l++) {
227 Node locationNode = locationLst.item(l);
228 if (locationNode.getNodeType() == Node.ELEMENT_NODE) {
229 Element locationElement = (Element) locationNode;
230 getLocationAttributes(locationElement);
231 }// end if locationNode
232 }// end for locationLst, counter l
233 } // end if locationLst
234 }
235
236 /**
237 * method parseInterProChild to parse the nodes child_list, found_in, contains, GO_terms and their contents. These nodes are childnodes of the interpro node.
238 */
239 public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){
240 makeCellText(0, nodeName);
241 makeItalicRight();
242 makeCellText(1, emptyNode);
243
244 // parse through the child nodes of an interpro node
245 NodeList interproChildLst = interproNode.getChildNodes();
246 if (interproChildLst != null && interproChildLst.getLength() > 0) {
247 for (int k = 0; k < interproChildLst.getLength(); k++) {
248 Node interproChildNode = interproChildLst.item(k);
249 // parse the child node of the interpro node with as name nodeName (parameter of the method)
250 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals(nodeName.trim())) {
251 // parse through the rel_ref nodes which are childnodes of contains, found_in and child_list
252 NodeList rel_refLst = interproChildNode.getChildNodes();
253 if (rel_refLst != null && rel_refLst.getLength() > 0) {
254 for (int s = 0; s < rel_refLst.getLength(); s++) {
255 Node ref_relNode = rel_refLst.item(s);
256 if (ref_relNode.getNodeType() == Node.ELEMENT_NODE) {
257 Element ref_relElement = (Element) ref_relNode;
258 Attr ipr_ref = ref_relElement.getAttributeNode("ipr_ref");
259 colnr++;
260 makeCellText(colnr, ipr_ref.getValue());
261 }// end if ref_relNode
262 }//end for ref_relLst
263 getMaxColumns(colnr + rel_refLst.getLength() + 1);
264 colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0
265 }//end if relf_refLst
266 } // end if interproChildNode childnode
267 }// end for interproChildLst childnode
268 }// end if interproChildLst childnode
269 makeRow();
270 }
271
272 /**
273 * parse classification node, which is a subnode of the interpro node
274 */
275 public void parseGOterms(Node interproNode){
276 makeCellText(0, "GO terms ");
277 makeItalicRight();
278 makeCellText(1, "no GO terms");
279 int rowGO = rownr;
280
281 // parse through the child nodes of an interpro node
282 NodeList interproChildLst = interproNode.getChildNodes();
283 if (interproChildLst != null && interproChildLst.getLength() > 0) {
284 for (int k = 0; k < interproChildLst.getLength(); k++) {
285 Node interproChildNode = interproChildLst.item(k);
286 if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("classification")) {
287 Element classificationElement = (Element) interproChildNode;
288 Attr classificationId = classificationElement.getAttributeNode("id");
289 makeCellText(colnr+2, classificationId.getValue());
290 // parse through the child nodes of a classification node
291 NodeList classificationLst = classificationElement.getChildNodes();
292 if (classificationLst != null && classificationLst.getLength() > 0) {
293 for (int l=0; l < classificationLst.getLength(); l++) {
294 Node classificationChildNode = classificationLst.item(l);
295 if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("category")) {
296 Element categoryElement = (Element) classificationChildNode;
297 makeCellText(colnr+1, categoryElement.getTextContent());
298 }
299 if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("description")) {
300 Element descriptionElement = (Element) classificationChildNode;
301 makeCellText(colnr+3, descriptionElement.getTextContent());
302 }
303 }
304 }// end if classificationLst
305 makeRow();
306 }// end if interproChildNode classification
307 }// end for interproChildLst, counter k
308 getMaxColumns(colnr + interproChildLst.getLength() + 1);
309 }// end if interproChildLst
310 /*
311 * insert empty row after the row GO term
312 */
313 if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){
314 makeRow();
315 }
316 makeRow();
317 }
318
319 /**
320 * Get the attribute values of the match elements and fill them in in Excel
321 * @param matchElement
322 */
323 public void getMatchAttributes(Element matchElement){
324 Attr matchDbname = matchElement.getAttributeNode("dbname");
325 makeCellText(colnr+1, matchDbname.getValue());
326 Attr matchId = matchElement.getAttributeNode("id");
327 makeCellText(colnr+2, matchId.getValue());
328 Attr matchName = matchElement.getAttributeNode("name");
329 makeCellText(colnr+3, matchName.getValue());
330 wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped
331 }
332
333 /**
334 * Get the attribute values of the location elements and fill them in in Excel
335 * @param locationElement
336 */
337 public void getLocationAttributes(Element locationElement){
338 // get the desired attribute values of the location node in the desired order
339 Attr locationScore = locationElement.getAttributeNode("score");
340 Attr locationStart = locationElement.getAttributeNode("start");
341 Attr locationEnd = locationElement.getAttributeNode("end");
342 Attr locationStatus = locationElement.getAttributeNode("status");
343 makeCellText(colnr+4, locationScore.getValue());
344 makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue()));
345 makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue()));
346 makeCellText(colnr+7, locationStatus.getValue());
347 makeRow();
348 }
349
350 /******************************
351 * make rows and cells in Excel
352 ******************************/
353 /**
354 * method to fill cells of a row in an Excel sheet with text content
355 */
356 public void makeCellText(int colnr, String content){
357 myCell = myRow.createCell((short)colnr);
358 myCell.setCellValue(content);
359 }
360
361 /**
362 * method to fill cells of a row in an Excel sheet with number formatted content
363 */
364 public void makeCellNumber(int colnr, int content){
365 myCell = myRow.createCell((short)colnr);
366 myCell.setCellValue(content);
367 }
368
369 /**
370 * method to make a new row in an Excel sheet
371 */
372 public void makeRow(){
373 rownr++;
374 myRow = sheet.createRow((short)rownr);
375 }
376
377 /***********************************
378 * table headers and titles in Excel
379 ***********************************/
380 /**
381 * method to make table headers for the matches table
382 */
383 public void makeTableHeaders(){
384 // headers
385 makeCellText(0, "matches ");
386 makeItalicRight();
387 //headers of table
388 String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"};
389 for (int i =0; i < headers.length; i++){
390 makeCellText(colnr+i+1, headers[i]);
391 setTitleBorder();
392 }
393 getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth()
394 makeRow();
395 // end headers
396 }
397
398 /**
399 * format the protein titles
400 */
401 public void formatTitle(){
402 XSSFCellStyle style = wb.createCellStyle();
403 XSSFFont font = wb.createFont();
404 font.setColor(new XSSFColor(Color.BLUE));
405 font.setBold(true);
406 font.setFontHeightInPoints((short)12);
407 style.setFont(font);
408 myCell.setCellStyle(style);
409 }
410
411 /**
412 * provide the table headers of the matches part with border lines
413 */
414 public void setTitleBorder(){
415 XSSFCellStyle border = wb.createCellStyle();
416 border.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE);
417 border.setBorderTop(XSSFCellStyle.BORDER_THIN);
418 myCell.setCellStyle(border);
419 }
420
421 /**
422 * make a border
423 */
424 public void setUpperBorder(){
425 XSSFCellStyle border = wb.createCellStyle();
426 border.setBorderTop(XSSFCellStyle.BORDER_THIN);
427 myCell.setCellStyle(border);
428 }
429
430 /************
431 * formatting
432 ***********/
433 /**
434 * format text in bold and align text right
435 */
436 public void makeBold(){
437 XSSFCellStyle style = wb.createCellStyle();
438 XSSFFont font = wb.createFont();
439 font.setBold(true);
440 style.setFont(font);
441 myCell.setCellStyle(style);
442 }
443
444 /**
445 * format text as italic and align right
446 */
447 public void makeItalicRight(){
448 XSSFCellStyle style = wb.createCellStyle();
449 XSSFFont font = wb.createFont();
450 font.setItalic(true);
451 style.setFont(font);
452 style.setAlignment(CellStyle.ALIGN_RIGHT);
453 myCell.setCellStyle(style);
454 }
455
456 /**
457 * adapt column width depending on the content, except for some cases where the column width would become too big
458 */
459 public void setColumnWidth(){
460 for (int col = 0; col < maxColnr; col++){
461 sheet.autoSizeColumn(col);
462 }
463 //for some columns, fixed size is desired above autosize
464 sheet.setColumnWidth(1, 256*15);
465 sheet.setColumnWidth(3, 256*27);
466 }
467
468 /**
469 * Helper method for setColumnWitdt()
470 * Method determines the number of columns which is filled with content
471 * This number is needed to loop through the columns and autofit their width with method setColumnWidth()
472 * @return
473 */
474 public void getMaxColumns(int colnr){
475 if (colnr > maxColnr){
476 maxColnr = colnr;
477 }
478 }
479
480 /**
481 * if text does not fit on one line, this method wraps the text
482 */
483 public void wrapText(){
484 XSSFCellStyle wrap=wb.createCellStyle();
485 wrap.setWrapText(true);
486 myCell.setCellStyle(wrap);
487 }
488 }