Mercurial > repos > basfplant > interproscan_to_excel
view export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author | basfplant |
---|---|
date | Tue, 05 Mar 2013 04:00:19 -0500 |
parents | |
children |
line wrap: on
line source
package be.cropdesign.iprscan; /** * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx) * @author: Katrien Bernaerts and Domantas Motiejunas * @date: 21/06/2012 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium */ import java.awt.Color; import java.io.File; import java.io.IOException; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.FactoryConfigurationError; import javax.xml.parsers.ParserConfigurationException; import org.apache.poi.ss.usermodel.CellStyle; import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFColor; import org.apache.poi.xssf.usermodel.XSSFFont; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; public class XMLToExcel { /** * fields needed to generate Excel */ private XSSFWorkbook wb; private XSSFSheet sheet; private XSSFRow myRow; private XSSFCell myCell; private String XMLFile; /** * Fields/counters */ private int rownr; private int colnr; private int maxColnr; /** * constructor */ public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){ this.wb = wb; this.sheet = sheet; this.myRow = myRow; this.myCell = myCell; this.XMLFile = XMLFile; rownr = 0; colnr = 0; makeDOMParser(); setColumnWidth(); } /** * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes. */ public void makeDOMParser(){ try { File file = new File(XMLFile); if (file.exists()) { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(file); doc.getDocumentElement().normalize(); // parse protein nodes, the first level subnodes of root node interpro_matches parseProteins(doc); } else { System.err.println("The XML file you try to convert to Excel does not exist."); } } catch (IOException ioe) { System.err.println("Input/output exception: " + ioe); } catch (SAXException saxe) { System.err.println("SAX parsing exception: " + saxe); } catch (FactoryConfigurationError fce) { System.err.println("Factory configuration error: " + fce); } catch (ParserConfigurationException pce) { System.err.println("Parser configuration exception: " + pce); } } /************************** * parse nodes and subnodes *************************/ /** * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches */ public void parseProteins(Document doc){ // parse through the protein nodes NodeList proteinLst = doc.getElementsByTagName("protein"); if (proteinLst != null && proteinLst.getLength() > 0) { for (int i = 0; i < proteinLst.getLength(); i++) { Node proteinNode = proteinLst.item(i); if (proteinNode.getNodeType() == Node.ELEMENT_NODE ) { Element proteinElement = (Element) proteinNode; Attr proteinId = proteinElement.getAttributeNode("id"); myRow = sheet.createRow((short)rownr); makeCellText(colnr, "PROTEIN"); formatTitle(); makeCellText(colnr+1, proteinId.getValue()); formatTitle(); makeRow(); // parse the child nodes of the protein nodes, namely interpro parseInterpro(proteinNode); }// end if proteinNode }// end if proteinLst } else { System.out.print("XML file empty."); } } /** * parse interpro nodes, which are child nodes of the protein nodes */ public void parseInterpro(Node proteinNode){ // parse through the interpro nodes, which are subnodes of the protein node NodeList interproLst = proteinNode.getChildNodes(); if (interproLst != null && interproLst.getLength() > 0) { for (int j = 0; j < interproLst.getLength(); j++) { Node interproNode = interproLst.item(j); if (interproNode.getNodeType() == Node.ELEMENT_NODE && interproNode.getNodeName().equals("interpro")) { Element interproElement = (Element) interproNode; Attr interproId = interproElement.getAttributeNode("id"); makeCellText(colnr, interproId.getValue()); makeBold(); Attr interproName = interproElement.getAttributeNode("name"); makeCellText(colnr+1, interproName.getValue()); makeBold(); Attr interproParentId = interproElement.getAttributeNode("parent_id"); makeRow(); /* * parse match node, a child node of the interpro node */ parseMatchInterproNode(interproNode); /* * parent information from interpro attribute */ makeCellText(colnr, "parent "); makeItalicRight(); if (interproParentId != null) { makeCellText(colnr+1, interproParentId.getValue()); setUpperBorder(); } else { makeCellText(colnr+1, "No parent"); setUpperBorder(); } makeRow(); /* * child_list subnode from interpro */ parseInterproChild("child_list ", "No children", interproNode); /* * found_in subnode of interpro */ parseInterproChild("found_in ", "No entries", interproNode); /* * contains subnode of interpro */ parseInterproChild("contains ", "No entries", interproNode); /* * GO-terms subnode of interpro */ parseGOterms(interproNode); }// end if interproNode }// end for interproLst, counter j makeRow(); }// end if interproLst } /** * method parseMatch to parse through the match nodes. * The match nodes are child nodes of the interpro nodes */ public void parseMatchInterproNode(Node interproNode){ makeTableHeaders(); // parse through the child nodes of a node NodeList interproChildLst = interproNode.getChildNodes(); if (interproChildLst != null && interproChildLst.getLength() > 0) { for (int k = 0; k < interproChildLst.getLength(); k++) { Node interproChildNode = interproChildLst.item(k); if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("match")) { Element matchElement = (Element) interproChildNode; getMatchAttributes(matchElement); // parse the location information of the matches. Location nodes are subnodes of match nodes. parseLocationMatch(interproChildNode); }// end if interproChildNode match }// end for interproChildLst, counter k }// end if interproChildLst /* * border line under table */ for (int i = 1; i < 8; i++){ makeCellText(i, " "); setUpperBorder(); } } /** * method parseLocation to parse through the location nodes, which are child nodes of the match node. * The match nodes can be one of the childnode types of the interpro node or childnodes of the * protein node. */ public void parseLocationMatch(Node node){ NodeList locationLst = node.getChildNodes(); if (locationLst != null && locationLst.getLength() > 0) { for (int l = 0; l < locationLst.getLength(); l++) { Node locationNode = locationLst.item(l); if (locationNode.getNodeType() == Node.ELEMENT_NODE) { Element locationElement = (Element) locationNode; getLocationAttributes(locationElement); }// end if locationNode }// end for locationLst, counter l } // end if locationLst } /** * method parseInterProChild to parse the nodes child_list, found_in, contains, GO_terms and their contents. These nodes are childnodes of the interpro node. */ public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){ makeCellText(0, nodeName); makeItalicRight(); makeCellText(1, emptyNode); // parse through the child nodes of an interpro node NodeList interproChildLst = interproNode.getChildNodes(); if (interproChildLst != null && interproChildLst.getLength() > 0) { for (int k = 0; k < interproChildLst.getLength(); k++) { Node interproChildNode = interproChildLst.item(k); // parse the child node of the interpro node with as name nodeName (parameter of the method) if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals(nodeName.trim())) { // parse through the rel_ref nodes which are childnodes of contains, found_in and child_list NodeList rel_refLst = interproChildNode.getChildNodes(); if (rel_refLst != null && rel_refLst.getLength() > 0) { for (int s = 0; s < rel_refLst.getLength(); s++) { Node ref_relNode = rel_refLst.item(s); if (ref_relNode.getNodeType() == Node.ELEMENT_NODE) { Element ref_relElement = (Element) ref_relNode; Attr ipr_ref = ref_relElement.getAttributeNode("ipr_ref"); colnr++; makeCellText(colnr, ipr_ref.getValue()); }// end if ref_relNode }//end for ref_relLst getMaxColumns(colnr + rel_refLst.getLength() + 1); colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0 }//end if relf_refLst } // end if interproChildNode childnode }// end for interproChildLst childnode }// end if interproChildLst childnode makeRow(); } /** * parse classification node, which is a subnode of the interpro node */ public void parseGOterms(Node interproNode){ makeCellText(0, "GO terms "); makeItalicRight(); makeCellText(1, "no GO terms"); int rowGO = rownr; // parse through the child nodes of an interpro node NodeList interproChildLst = interproNode.getChildNodes(); if (interproChildLst != null && interproChildLst.getLength() > 0) { for (int k = 0; k < interproChildLst.getLength(); k++) { Node interproChildNode = interproChildLst.item(k); if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("classification")) { Element classificationElement = (Element) interproChildNode; Attr classificationId = classificationElement.getAttributeNode("id"); makeCellText(colnr+2, classificationId.getValue()); // parse through the child nodes of a classification node NodeList classificationLst = classificationElement.getChildNodes(); if (classificationLst != null && classificationLst.getLength() > 0) { for (int l=0; l < classificationLst.getLength(); l++) { Node classificationChildNode = classificationLst.item(l); if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("category")) { Element categoryElement = (Element) classificationChildNode; makeCellText(colnr+1, categoryElement.getTextContent()); } if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("description")) { Element descriptionElement = (Element) classificationChildNode; makeCellText(colnr+3, descriptionElement.getTextContent()); } } }// end if classificationLst makeRow(); }// end if interproChildNode classification }// end for interproChildLst, counter k getMaxColumns(colnr + interproChildLst.getLength() + 1); }// end if interproChildLst /* * insert empty row after the row GO term */ if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){ makeRow(); } makeRow(); } /** * Get the attribute values of the match elements and fill them in in Excel * @param matchElement */ public void getMatchAttributes(Element matchElement){ Attr matchDbname = matchElement.getAttributeNode("dbname"); makeCellText(colnr+1, matchDbname.getValue()); Attr matchId = matchElement.getAttributeNode("id"); makeCellText(colnr+2, matchId.getValue()); Attr matchName = matchElement.getAttributeNode("name"); makeCellText(colnr+3, matchName.getValue()); wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped } /** * Get the attribute values of the location elements and fill them in in Excel * @param locationElement */ public void getLocationAttributes(Element locationElement){ // get the desired attribute values of the location node in the desired order Attr locationScore = locationElement.getAttributeNode("score"); Attr locationStart = locationElement.getAttributeNode("start"); Attr locationEnd = locationElement.getAttributeNode("end"); Attr locationStatus = locationElement.getAttributeNode("status"); makeCellText(colnr+4, locationScore.getValue()); makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue())); makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue())); makeCellText(colnr+7, locationStatus.getValue()); makeRow(); } /****************************** * make rows and cells in Excel ******************************/ /** * method to fill cells of a row in an Excel sheet with text content */ public void makeCellText(int colnr, String content){ myCell = myRow.createCell((short)colnr); myCell.setCellValue(content); } /** * method to fill cells of a row in an Excel sheet with number formatted content */ public void makeCellNumber(int colnr, int content){ myCell = myRow.createCell((short)colnr); myCell.setCellValue(content); } /** * method to make a new row in an Excel sheet */ public void makeRow(){ rownr++; myRow = sheet.createRow((short)rownr); } /*********************************** * table headers and titles in Excel ***********************************/ /** * method to make table headers for the matches table */ public void makeTableHeaders(){ // headers makeCellText(0, "matches "); makeItalicRight(); //headers of table String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"}; for (int i =0; i < headers.length; i++){ makeCellText(colnr+i+1, headers[i]); setTitleBorder(); } getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth() makeRow(); // end headers } /** * format the protein titles */ public void formatTitle(){ XSSFCellStyle style = wb.createCellStyle(); XSSFFont font = wb.createFont(); font.setColor(new XSSFColor(Color.BLUE)); font.setBold(true); font.setFontHeightInPoints((short)12); style.setFont(font); myCell.setCellStyle(style); } /** * provide the table headers of the matches part with border lines */ public void setTitleBorder(){ XSSFCellStyle border = wb.createCellStyle(); border.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE); border.setBorderTop(XSSFCellStyle.BORDER_THIN); myCell.setCellStyle(border); } /** * make a border */ public void setUpperBorder(){ XSSFCellStyle border = wb.createCellStyle(); border.setBorderTop(XSSFCellStyle.BORDER_THIN); myCell.setCellStyle(border); } /************ * formatting ***********/ /** * format text in bold and align text right */ public void makeBold(){ XSSFCellStyle style = wb.createCellStyle(); XSSFFont font = wb.createFont(); font.setBold(true); style.setFont(font); myCell.setCellStyle(style); } /** * format text as italic and align right */ public void makeItalicRight(){ XSSFCellStyle style = wb.createCellStyle(); XSSFFont font = wb.createFont(); font.setItalic(true); style.setFont(font); style.setAlignment(CellStyle.ALIGN_RIGHT); myCell.setCellStyle(style); } /** * adapt column width depending on the content, except for some cases where the column width would become too big */ public void setColumnWidth(){ for (int col = 0; col < maxColnr; col++){ sheet.autoSizeColumn(col); } //for some columns, fixed size is desired above autosize sheet.setColumnWidth(1, 256*15); sheet.setColumnWidth(3, 256*27); } /** * Helper method for setColumnWitdt() * Method determines the number of columns which is filled with content * This number is needed to loop through the columns and autofit their width with method setColumnWidth() * @return */ public void getMaxColumns(int colnr){ if (colnr > maxColnr){ maxColnr = colnr; } } /** * if text does not fit on one line, this method wraps the text */ public void wrapText(){ XSSFCellStyle wrap=wb.createCellStyle(); wrap.setWrapText(true); myCell.setCellStyle(wrap); } }