Mercurial > repos > basfplant > interproscan_to_excel
diff export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author | basfplant |
---|---|
date | Tue, 05 Mar 2013 04:00:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java Tue Mar 05 04:00:19 2013 -0500 @@ -0,0 +1,488 @@ +package be.cropdesign.iprscan; +/** + * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx) + * @author: Katrien Bernaerts and Domantas Motiejunas + * @date: 21/06/2012 + * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium + */ +import java.awt.Color; +import java.io.File; +import java.io.IOException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.FactoryConfigurationError; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.poi.ss.usermodel.CellStyle; +import org.apache.poi.xssf.usermodel.XSSFCell; +import org.apache.poi.xssf.usermodel.XSSFCellStyle; +import org.apache.poi.xssf.usermodel.XSSFColor; +import org.apache.poi.xssf.usermodel.XSSFFont; +import org.apache.poi.xssf.usermodel.XSSFRow; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.w3c.dom.Attr; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +public class XMLToExcel { + /** + * fields needed to generate Excel + */ + private XSSFWorkbook wb; + private XSSFSheet sheet; + private XSSFRow myRow; + private XSSFCell myCell; + private String XMLFile; + + /** + * Fields/counters + */ + private int rownr; + private int colnr; + private int maxColnr; + + /** + * constructor + */ + public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){ + this.wb = wb; + this.sheet = sheet; + this.myRow = myRow; + this.myCell = myCell; + this.XMLFile = XMLFile; + rownr = 0; + colnr = 0; + makeDOMParser(); + setColumnWidth(); + } + + /** + * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes. + */ + public void makeDOMParser(){ + try { + File file = new File(XMLFile); + if (file.exists()) { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + DocumentBuilder db = dbf.newDocumentBuilder(); + Document doc = db.parse(file); + doc.getDocumentElement().normalize(); + + // parse protein nodes, the first level subnodes of root node interpro_matches + parseProteins(doc); + } else { + System.err.println("The XML file you try to convert to Excel does not exist."); + } + } catch (IOException ioe) { + System.err.println("Input/output exception: " + ioe); + } catch (SAXException saxe) { + System.err.println("SAX parsing exception: " + saxe); + } catch (FactoryConfigurationError fce) { + System.err.println("Factory configuration error: " + fce); + } catch (ParserConfigurationException pce) { + System.err.println("Parser configuration exception: " + pce); + } + } + + /************************** + * parse nodes and subnodes + *************************/ + /** + * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches + */ + public void parseProteins(Document doc){ + // parse through the protein nodes + NodeList proteinLst = doc.getElementsByTagName("protein"); + if (proteinLst != null && proteinLst.getLength() > 0) { + for (int i = 0; i < proteinLst.getLength(); i++) { + Node proteinNode = proteinLst.item(i); + if (proteinNode.getNodeType() == Node.ELEMENT_NODE ) { + Element proteinElement = (Element) proteinNode; + Attr proteinId = proteinElement.getAttributeNode("id"); + + myRow = sheet.createRow((short)rownr); + makeCellText(colnr, "PROTEIN"); + formatTitle(); + makeCellText(colnr+1, proteinId.getValue()); + formatTitle(); + makeRow(); + + // parse the child nodes of the protein nodes, namely interpro + parseInterpro(proteinNode); + }// end if proteinNode + }// end if proteinLst + } else { + System.out.print("XML file empty."); + } + } + + /** + * parse interpro nodes, which are child nodes of the protein nodes + */ + public void parseInterpro(Node proteinNode){ + // parse through the interpro nodes, which are subnodes of the protein node + NodeList interproLst = proteinNode.getChildNodes(); + if (interproLst != null && interproLst.getLength() > 0) { + for (int j = 0; j < interproLst.getLength(); j++) { + Node interproNode = interproLst.item(j); + if (interproNode.getNodeType() == Node.ELEMENT_NODE && interproNode.getNodeName().equals("interpro")) { + Element interproElement = (Element) interproNode; + Attr interproId = interproElement.getAttributeNode("id"); + makeCellText(colnr, interproId.getValue()); + makeBold(); + Attr interproName = interproElement.getAttributeNode("name"); + makeCellText(colnr+1, interproName.getValue()); + makeBold(); + Attr interproParentId = interproElement.getAttributeNode("parent_id"); + makeRow(); + + /* + * parse match node, a child node of the interpro node + */ + parseMatchInterproNode(interproNode); + + /* + * parent information from interpro attribute + */ + makeCellText(colnr, "parent "); + makeItalicRight(); + if (interproParentId != null) { + makeCellText(colnr+1, interproParentId.getValue()); + setUpperBorder(); + } + else { + makeCellText(colnr+1, "No parent"); + setUpperBorder(); + } + makeRow(); + + /* + * child_list subnode from interpro + */ + parseInterproChild("child_list ", "No children", interproNode); + + /* + * found_in subnode of interpro + */ + parseInterproChild("found_in ", "No entries", interproNode); + + /* + * contains subnode of interpro + */ + parseInterproChild("contains ", "No entries", interproNode); + + /* + * GO-terms subnode of interpro + */ + parseGOterms(interproNode); + + }// end if interproNode + }// end for interproLst, counter j + makeRow(); + }// end if interproLst + } + + /** + * method parseMatch to parse through the match nodes. + * The match nodes are child nodes of the interpro nodes + */ + public void parseMatchInterproNode(Node interproNode){ + makeTableHeaders(); + // parse through the child nodes of a node + NodeList interproChildLst = interproNode.getChildNodes(); + if (interproChildLst != null && interproChildLst.getLength() > 0) { + for (int k = 0; k < interproChildLst.getLength(); k++) { + Node interproChildNode = interproChildLst.item(k); + if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("match")) { + Element matchElement = (Element) interproChildNode; + getMatchAttributes(matchElement); + // parse the location information of the matches. Location nodes are subnodes of match nodes. + parseLocationMatch(interproChildNode); + }// end if interproChildNode match + }// end for interproChildLst, counter k + }// end if interproChildLst + /* + * border line under table + */ + for (int i = 1; i < 8; i++){ + makeCellText(i, " "); + setUpperBorder(); + } + } + + /** + * method parseLocation to parse through the location nodes, which are child nodes of the match node. + * The match nodes can be one of the childnode types of the interpro node or childnodes of the + * protein node. + */ + public void parseLocationMatch(Node node){ + NodeList locationLst = node.getChildNodes(); + if (locationLst != null && locationLst.getLength() > 0) { + for (int l = 0; l < locationLst.getLength(); l++) { + Node locationNode = locationLst.item(l); + if (locationNode.getNodeType() == Node.ELEMENT_NODE) { + Element locationElement = (Element) locationNode; + getLocationAttributes(locationElement); + }// end if locationNode + }// end for locationLst, counter l + } // end if locationLst + } + + /** + * method parseInterProChild to parse the nodes child_list, found_in, contains, GO_terms and their contents. These nodes are childnodes of the interpro node. + */ + public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){ + makeCellText(0, nodeName); + makeItalicRight(); + makeCellText(1, emptyNode); + + // parse through the child nodes of an interpro node + NodeList interproChildLst = interproNode.getChildNodes(); + if (interproChildLst != null && interproChildLst.getLength() > 0) { + for (int k = 0; k < interproChildLst.getLength(); k++) { + Node interproChildNode = interproChildLst.item(k); + // parse the child node of the interpro node with as name nodeName (parameter of the method) + if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals(nodeName.trim())) { + // parse through the rel_ref nodes which are childnodes of contains, found_in and child_list + NodeList rel_refLst = interproChildNode.getChildNodes(); + if (rel_refLst != null && rel_refLst.getLength() > 0) { + for (int s = 0; s < rel_refLst.getLength(); s++) { + Node ref_relNode = rel_refLst.item(s); + if (ref_relNode.getNodeType() == Node.ELEMENT_NODE) { + Element ref_relElement = (Element) ref_relNode; + Attr ipr_ref = ref_relElement.getAttributeNode("ipr_ref"); + colnr++; + makeCellText(colnr, ipr_ref.getValue()); + }// end if ref_relNode + }//end for ref_relLst + getMaxColumns(colnr + rel_refLst.getLength() + 1); + colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0 + }//end if relf_refLst + } // end if interproChildNode childnode + }// end for interproChildLst childnode + }// end if interproChildLst childnode + makeRow(); + } + + /** + * parse classification node, which is a subnode of the interpro node + */ + public void parseGOterms(Node interproNode){ + makeCellText(0, "GO terms "); + makeItalicRight(); + makeCellText(1, "no GO terms"); + int rowGO = rownr; + + // parse through the child nodes of an interpro node + NodeList interproChildLst = interproNode.getChildNodes(); + if (interproChildLst != null && interproChildLst.getLength() > 0) { + for (int k = 0; k < interproChildLst.getLength(); k++) { + Node interproChildNode = interproChildLst.item(k); + if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("classification")) { + Element classificationElement = (Element) interproChildNode; + Attr classificationId = classificationElement.getAttributeNode("id"); + makeCellText(colnr+2, classificationId.getValue()); + // parse through the child nodes of a classification node + NodeList classificationLst = classificationElement.getChildNodes(); + if (classificationLst != null && classificationLst.getLength() > 0) { + for (int l=0; l < classificationLst.getLength(); l++) { + Node classificationChildNode = classificationLst.item(l); + if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("category")) { + Element categoryElement = (Element) classificationChildNode; + makeCellText(colnr+1, categoryElement.getTextContent()); + } + if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("description")) { + Element descriptionElement = (Element) classificationChildNode; + makeCellText(colnr+3, descriptionElement.getTextContent()); + } + } + }// end if classificationLst + makeRow(); + }// end if interproChildNode classification + }// end for interproChildLst, counter k + getMaxColumns(colnr + interproChildLst.getLength() + 1); + }// end if interproChildLst + /* + * insert empty row after the row GO term + */ + if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){ + makeRow(); + } + makeRow(); + } + + /** + * Get the attribute values of the match elements and fill them in in Excel + * @param matchElement + */ + public void getMatchAttributes(Element matchElement){ + Attr matchDbname = matchElement.getAttributeNode("dbname"); + makeCellText(colnr+1, matchDbname.getValue()); + Attr matchId = matchElement.getAttributeNode("id"); + makeCellText(colnr+2, matchId.getValue()); + Attr matchName = matchElement.getAttributeNode("name"); + makeCellText(colnr+3, matchName.getValue()); + wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped + } + + /** + * Get the attribute values of the location elements and fill them in in Excel + * @param locationElement + */ + public void getLocationAttributes(Element locationElement){ + // get the desired attribute values of the location node in the desired order + Attr locationScore = locationElement.getAttributeNode("score"); + Attr locationStart = locationElement.getAttributeNode("start"); + Attr locationEnd = locationElement.getAttributeNode("end"); + Attr locationStatus = locationElement.getAttributeNode("status"); + makeCellText(colnr+4, locationScore.getValue()); + makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue())); + makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue())); + makeCellText(colnr+7, locationStatus.getValue()); + makeRow(); + } + + /****************************** + * make rows and cells in Excel + ******************************/ + /** + * method to fill cells of a row in an Excel sheet with text content + */ + public void makeCellText(int colnr, String content){ + myCell = myRow.createCell((short)colnr); + myCell.setCellValue(content); + } + + /** + * method to fill cells of a row in an Excel sheet with number formatted content + */ + public void makeCellNumber(int colnr, int content){ + myCell = myRow.createCell((short)colnr); + myCell.setCellValue(content); + } + + /** + * method to make a new row in an Excel sheet + */ + public void makeRow(){ + rownr++; + myRow = sheet.createRow((short)rownr); + } + + /*********************************** + * table headers and titles in Excel + ***********************************/ + /** + * method to make table headers for the matches table + */ + public void makeTableHeaders(){ + // headers + makeCellText(0, "matches "); + makeItalicRight(); + //headers of table + String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"}; + for (int i =0; i < headers.length; i++){ + makeCellText(colnr+i+1, headers[i]); + setTitleBorder(); + } + getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth() + makeRow(); + // end headers + } + + /** + * format the protein titles + */ + public void formatTitle(){ + XSSFCellStyle style = wb.createCellStyle(); + XSSFFont font = wb.createFont(); + font.setColor(new XSSFColor(Color.BLUE)); + font.setBold(true); + font.setFontHeightInPoints((short)12); + style.setFont(font); + myCell.setCellStyle(style); + } + + /** + * provide the table headers of the matches part with border lines + */ + public void setTitleBorder(){ + XSSFCellStyle border = wb.createCellStyle(); + border.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE); + border.setBorderTop(XSSFCellStyle.BORDER_THIN); + myCell.setCellStyle(border); + } + + /** + * make a border + */ + public void setUpperBorder(){ + XSSFCellStyle border = wb.createCellStyle(); + border.setBorderTop(XSSFCellStyle.BORDER_THIN); + myCell.setCellStyle(border); + } + + /************ + * formatting + ***********/ + /** + * format text in bold and align text right + */ + public void makeBold(){ + XSSFCellStyle style = wb.createCellStyle(); + XSSFFont font = wb.createFont(); + font.setBold(true); + style.setFont(font); + myCell.setCellStyle(style); + } + + /** + * format text as italic and align right + */ + public void makeItalicRight(){ + XSSFCellStyle style = wb.createCellStyle(); + XSSFFont font = wb.createFont(); + font.setItalic(true); + style.setFont(font); + style.setAlignment(CellStyle.ALIGN_RIGHT); + myCell.setCellStyle(style); + } + + /** + * adapt column width depending on the content, except for some cases where the column width would become too big + */ + public void setColumnWidth(){ + for (int col = 0; col < maxColnr; col++){ + sheet.autoSizeColumn(col); + } + //for some columns, fixed size is desired above autosize + sheet.setColumnWidth(1, 256*15); + sheet.setColumnWidth(3, 256*27); + } + + /** + * Helper method for setColumnWitdt() + * Method determines the number of columns which is filled with content + * This number is needed to loop through the columns and autofit their width with method setColumnWidth() + * @return + */ + public void getMaxColumns(int colnr){ + if (colnr > maxColnr){ + maxColnr = colnr; + } + } + + /** + * if text does not fit on one line, this method wraps the text + */ + public void wrapText(){ + XSSFCellStyle wrap=wb.createCellStyle(); + wrap.setWrapText(true); + myCell.setCellStyle(wrap); + } +}