diff export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java @ 0:a9762cd6e2e3 draft default tip

Uploaded
author basfplant
date Tue, 05 Mar 2013 04:00:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/XMLToExcel.java	Tue Mar 05 04:00:19 2013 -0500
@@ -0,0 +1,488 @@
+package be.cropdesign.iprscan;
+/**
+ * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx)
+ * @author: Katrien Bernaerts and Domantas Motiejunas
+ * @date: 21/06/2012
+ * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
+ */
+import java.awt.Color;
+import java.io.File;
+import java.io.IOException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.FactoryConfigurationError;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFCellStyle;
+import org.apache.poi.xssf.usermodel.XSSFColor;
+import org.apache.poi.xssf.usermodel.XSSFFont;
+import org.apache.poi.xssf.usermodel.XSSFRow;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.w3c.dom.Attr;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+public class XMLToExcel {
+	/**
+	 * fields needed to generate Excel
+	 */
+	private XSSFWorkbook wb;
+	private XSSFSheet sheet;
+	private XSSFRow myRow;
+	private XSSFCell myCell;
+	private String XMLFile;
+	
+	/**
+	 * Fields/counters
+	 */
+	private int rownr;
+	private int colnr;
+	private int maxColnr;
+	
+	/**
+	 * constructor
+	 */
+	public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){
+		this.wb = wb;
+		this.sheet = sheet;
+		this.myRow = myRow;
+		this.myCell = myCell;
+		this.XMLFile = XMLFile;
+		rownr = 0;
+		colnr = 0;
+		makeDOMParser();
+		setColumnWidth();
+	}
+	
+	/**
+	 * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes.
+	 */	
+	public void makeDOMParser(){
+		try {
+			File file = new File(XMLFile);
+			if (file.exists()) {
+				DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+				DocumentBuilder db = dbf.newDocumentBuilder();
+				Document doc = db.parse(file);
+				doc.getDocumentElement().normalize();
+				
+				// parse protein nodes, the first level subnodes of root node interpro_matches
+				parseProteins(doc);
+			} else {
+				System.err.println("The XML file you try to convert to Excel does not exist.");
+			}
+		} catch (IOException ioe) {
+			System.err.println("Input/output exception: " + ioe);
+		} catch (SAXException saxe) {
+			System.err.println("SAX parsing exception: " + saxe);
+		} catch (FactoryConfigurationError fce) {
+			System.err.println("Factory configuration error: " + fce);
+		} catch (ParserConfigurationException pce) {
+			System.err.println("Parser configuration exception: " + pce);
+		}
+	}
+	 
+	/**************************
+	 * parse nodes and subnodes
+	 *************************/
+	/**
+	 * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches
+	 */
+	public void parseProteins(Document doc){
+		// parse through the protein nodes
+		NodeList proteinLst = doc.getElementsByTagName("protein");
+		if (proteinLst != null && proteinLst.getLength() > 0) {
+			for (int i = 0; i < proteinLst.getLength(); i++) {
+				Node proteinNode = proteinLst.item(i);
+				if (proteinNode.getNodeType() == Node.ELEMENT_NODE ) {
+					Element proteinElement = (Element) proteinNode;
+					Attr proteinId = proteinElement.getAttributeNode("id");
+					
+					myRow = sheet.createRow((short)rownr);
+					makeCellText(colnr, "PROTEIN");
+					formatTitle();
+					makeCellText(colnr+1, proteinId.getValue());
+					formatTitle();
+					makeRow();
+							
+					// parse the child nodes of the protein nodes, namely interpro
+					parseInterpro(proteinNode);
+				}// end if proteinNode
+			}// end if proteinLst
+		} else {
+			System.out.print("XML file empty.");
+		}
+	}
+	
+	/**
+	 * parse interpro nodes, which are child nodes of the protein nodes
+	 */
+	public void parseInterpro(Node proteinNode){
+		// parse through the interpro nodes, which are subnodes of the protein node
+		NodeList interproLst = proteinNode.getChildNodes();
+		if (interproLst != null	&& interproLst.getLength() > 0) {
+			for (int j = 0; j < interproLst.getLength(); j++) {
+				Node interproNode = interproLst.item(j);
+				if (interproNode.getNodeType() == Node.ELEMENT_NODE && interproNode.getNodeName().equals("interpro")) {
+					Element interproElement = (Element) interproNode;
+					Attr interproId = interproElement.getAttributeNode("id");
+					makeCellText(colnr, interproId.getValue());
+					makeBold();
+					Attr interproName = interproElement.getAttributeNode("name");
+					makeCellText(colnr+1, interproName.getValue());
+					makeBold();
+					Attr interproParentId = interproElement.getAttributeNode("parent_id");
+					makeRow();
+											
+					/*
+					 *  parse match node, a child node of the interpro node
+					 */
+					parseMatchInterproNode(interproNode);
+
+					/*
+					 * parent information from interpro attribute
+					 */
+					makeCellText(colnr, "parent ");
+					makeItalicRight();
+					if (interproParentId != null) {
+						makeCellText(colnr+1, interproParentId.getValue());
+						setUpperBorder();
+					}
+					else {
+						makeCellText(colnr+1, "No parent");
+						setUpperBorder();
+					}
+					makeRow();
+					
+					/*
+					 * child_list subnode from interpro
+					 */
+					parseInterproChild("child_list ", "No children", interproNode);
+					
+					/*
+					 *  found_in subnode of interpro
+					 */
+					parseInterproChild("found_in ", "No entries", interproNode);
+					
+					/*
+					 *  contains subnode of interpro
+					 */
+					parseInterproChild("contains ", "No entries", interproNode);
+					
+					/*
+					 *  GO-terms subnode of interpro
+					 */
+					parseGOterms(interproNode);
+						
+				}// end if interproNode
+			}// end for interproLst, counter j
+			makeRow();
+		}// end if interproLst
+	}
+
+	/**
+	 * method parseMatch to parse through the match nodes.
+	 * The match nodes are child nodes of the interpro nodes
+	 */
+	public void parseMatchInterproNode(Node interproNode){
+		makeTableHeaders();
+		// parse through the child nodes of a node
+		NodeList interproChildLst = interproNode.getChildNodes();
+		if (interproChildLst != null && interproChildLst.getLength() > 0) {
+			for (int k = 0; k < interproChildLst.getLength(); k++) {
+				Node interproChildNode = interproChildLst.item(k);
+				if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("match")) {
+					Element matchElement = (Element) interproChildNode;
+					getMatchAttributes(matchElement);
+					// parse the location information of the matches. Location nodes are subnodes of match nodes.
+					parseLocationMatch(interproChildNode);
+				}// end if interproChildNode match
+			}// end for interproChildLst, counter k
+		}// end if interproChildLst
+		/*
+		 * border line under table
+		 */
+		for (int i = 1; i < 8; i++){
+			makeCellText(i, " ");
+			setUpperBorder();
+		}
+	}
+
+	/**
+	 * method parseLocation to parse through the location nodes, which are child nodes of the match node. 
+	 * The match nodes can be one of the childnode types of the interpro node or childnodes of the 
+	 * protein node.
+	 */
+	public void parseLocationMatch(Node node){
+		NodeList locationLst = node.getChildNodes();
+		if (locationLst != null	&& locationLst.getLength() > 0) {
+			for (int l = 0; l < locationLst.getLength(); l++) {
+				Node locationNode = locationLst.item(l);
+				if (locationNode.getNodeType() == Node.ELEMENT_NODE) {
+					Element locationElement = (Element) locationNode;
+					getLocationAttributes(locationElement);
+				}// end if locationNode
+			}// end for locationLst, counter l
+		} // end if locationLst
+	}
+
+	/**
+	 * method parseInterProChild to parse the nodes child_list, found_in, contains, GO_terms and their contents. These nodes are childnodes of the interpro node.
+	 */
+	public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){
+		makeCellText(0, nodeName);
+		makeItalicRight();
+		makeCellText(1, emptyNode);
+		
+		// parse through the child nodes of an interpro node
+		NodeList interproChildLst = interproNode.getChildNodes();
+		if (interproChildLst != null && interproChildLst.getLength() > 0) {
+			for (int k = 0; k < interproChildLst.getLength(); k++) {
+				Node interproChildNode = interproChildLst.item(k);
+				// parse the child node of the interpro node with as name nodeName (parameter of the method) 
+				if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals(nodeName.trim())) {
+					// parse through the rel_ref nodes which are childnodes of contains, found_in and child_list
+					NodeList rel_refLst = interproChildNode.getChildNodes();
+					if (rel_refLst != null	&& rel_refLst.getLength() > 0) {
+						for (int s = 0; s < rel_refLst.getLength(); s++) {
+							Node ref_relNode = rel_refLst.item(s);
+							if (ref_relNode.getNodeType() == Node.ELEMENT_NODE) {
+								Element ref_relElement = (Element) ref_relNode;
+								Attr ipr_ref = ref_relElement.getAttributeNode("ipr_ref");
+								colnr++;
+								makeCellText(colnr, ipr_ref.getValue());
+							}// end if ref_relNode	
+						}//end for ref_relLst
+						getMaxColumns(colnr + rel_refLst.getLength() + 1); 
+						colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0
+					}//end if relf_refLst
+				} // end if interproChildNode childnode
+			}// end for interproChildLst childnode
+		}// end if interproChildLst childnode
+		makeRow();
+	}
+
+	/**
+	 * parse classification node, which is a subnode of the interpro node
+	 */
+	public void parseGOterms(Node interproNode){
+		makeCellText(0, "GO terms ");
+		makeItalicRight();
+		makeCellText(1, "no GO terms");
+		int rowGO = rownr;
+	
+		// parse through the child nodes of an interpro node
+		NodeList interproChildLst = interproNode.getChildNodes();
+		if (interproChildLst != null && interproChildLst.getLength() > 0) {
+			for (int k = 0; k < interproChildLst.getLength(); k++) {
+				Node interproChildNode = interproChildLst.item(k);
+				if (interproChildNode.getNodeType() == Node.ELEMENT_NODE && interproChildNode.getNodeName().equals("classification")) {
+					Element classificationElement = (Element) interproChildNode;
+					Attr classificationId = classificationElement.getAttributeNode("id");
+					makeCellText(colnr+2, classificationId.getValue());
+					// parse through the child nodes of a classification node
+					NodeList classificationLst = classificationElement.getChildNodes();
+					if (classificationLst != null && classificationLst.getLength() > 0) {
+						for (int l=0; l < classificationLst.getLength(); l++) {
+							Node classificationChildNode = classificationLst.item(l);
+							if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("category")) {
+								Element categoryElement = (Element) classificationChildNode; 
+								makeCellText(colnr+1, categoryElement.getTextContent());
+							}
+							if (classificationChildNode.getNodeType() == Node.ELEMENT_NODE && classificationChildNode.getNodeName().equals("description")) {
+								Element descriptionElement = (Element) classificationChildNode;
+								makeCellText(colnr+3, descriptionElement.getTextContent());
+							}
+						}
+					}// end if classificationLst
+					makeRow();
+				}// end if interproChildNode classification
+			}// end for interproChildLst, counter k
+			getMaxColumns(colnr + interproChildLst.getLength() + 1); 
+		}// end if interproChildLst
+		/*
+		 * insert empty row after the row GO term
+		 */
+		if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){
+			makeRow();
+		}
+		makeRow();
+	}
+	
+	/**
+	 * Get the attribute values of the match elements and fill them in in Excel
+	 * @param matchElement
+	 */
+	public void getMatchAttributes(Element matchElement){
+		Attr matchDbname = matchElement.getAttributeNode("dbname");
+		makeCellText(colnr+1, matchDbname.getValue());
+		Attr matchId = matchElement.getAttributeNode("id");
+		makeCellText(colnr+2, matchId.getValue());
+		Attr matchName = matchElement.getAttributeNode("name");
+		makeCellText(colnr+3, matchName.getValue());
+		wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped
+	}
+	
+	/**
+	 * Get the attribute values of the location elements and fill them in in Excel
+	 * @param locationElement
+	 */
+	public void getLocationAttributes(Element locationElement){
+		// get the desired attribute values of the location node in the desired order
+		Attr locationScore = locationElement.getAttributeNode("score");
+		Attr locationStart = locationElement.getAttributeNode("start");
+		Attr locationEnd = locationElement.getAttributeNode("end");
+		Attr locationStatus = locationElement.getAttributeNode("status");
+		makeCellText(colnr+4, locationScore.getValue());
+		makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue()));
+		makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue()));
+		makeCellText(colnr+7, locationStatus.getValue());
+		makeRow();
+	}
+	
+	/******************************
+	 * make rows and cells in Excel
+	 ******************************/
+	/**
+	 * method to fill cells of a row in an Excel sheet with text content
+	 */
+	public void makeCellText(int colnr, String content){
+		myCell = myRow.createCell((short)colnr);
+		myCell.setCellValue(content);
+	}
+	
+	/**
+	 * method to fill cells of a row in an Excel sheet with number formatted content
+	 */
+	public void makeCellNumber(int colnr, int content){
+		myCell = myRow.createCell((short)colnr);
+		myCell.setCellValue(content);
+	}
+	
+	/**
+	 * method to make a new row in an Excel sheet
+	 */
+	public void makeRow(){
+		rownr++;
+		myRow = sheet.createRow((short)rownr);
+	}
+	
+	/***********************************
+	 * table headers and titles in Excel
+	 ***********************************/
+	/**
+	 * method to make table headers for the matches table
+	 */
+	public void makeTableHeaders(){
+		// headers
+		makeCellText(0, "matches ");
+		makeItalicRight();
+		//headers of table
+		String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"};
+		for (int i =0; i < headers.length; i++){
+			makeCellText(colnr+i+1, headers[i]);
+			setTitleBorder();
+		}
+		getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth()
+		makeRow();
+		// end headers
+	}
+
+	/**
+	 * format the protein titles
+	 */
+	public void formatTitle(){
+		XSSFCellStyle style = wb.createCellStyle();
+	    XSSFFont font = wb.createFont();
+	    font.setColor(new XSSFColor(Color.BLUE));
+	    font.setBold(true);
+	    font.setFontHeightInPoints((short)12);
+	    style.setFont(font);
+	    myCell.setCellStyle(style);
+	}
+	
+	/**
+	 * provide the table headers of the matches part with border lines
+	 */
+	public void setTitleBorder(){
+		XSSFCellStyle border = wb.createCellStyle();
+		border.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE);
+		border.setBorderTop(XSSFCellStyle.BORDER_THIN);
+	    myCell.setCellStyle(border);
+	}
+	
+	/**
+	 * make a border
+	 */
+	public void setUpperBorder(){
+		XSSFCellStyle border = wb.createCellStyle();
+		border.setBorderTop(XSSFCellStyle.BORDER_THIN);
+	    myCell.setCellStyle(border);
+	}
+	
+	/************
+	 * formatting
+	 ***********/
+	/**
+	 * format text in bold and align text right
+	 */
+	public void makeBold(){
+		XSSFCellStyle style = wb.createCellStyle();
+	    XSSFFont font = wb.createFont();
+	    font.setBold(true);
+	    style.setFont(font);
+	    myCell.setCellStyle(style);
+	}
+	
+	/**
+	 * format text as italic and align right
+	 */
+	public void makeItalicRight(){
+		XSSFCellStyle style = wb.createCellStyle();
+	    XSSFFont font = wb.createFont();
+	    font.setItalic(true);
+	    style.setFont(font);
+	    style.setAlignment(CellStyle.ALIGN_RIGHT);
+	    myCell.setCellStyle(style);
+	}
+	
+	/**
+	 * adapt column width depending on the content, except for some cases where the column width would become too big
+	 */
+	public void setColumnWidth(){
+		for (int col = 0; col < maxColnr; col++){
+			sheet.autoSizeColumn(col);
+		}
+		//for some columns, fixed size is desired above autosize
+		sheet.setColumnWidth(1, 256*15);
+		sheet.setColumnWidth(3, 256*27);
+	}
+	
+	/**
+	 * Helper method for setColumnWitdt()
+	 * Method determines the number of columns which is filled with content
+	 * This number is needed to loop through the columns and autofit their width with method setColumnWidth()
+	 * @return
+	 */
+	public void getMaxColumns(int colnr){
+		if (colnr > maxColnr){
+			maxColnr = colnr;
+		}
+	}
+	
+	/**
+	 * if text does not fit on one line, this method wraps the text
+	 */
+	public void wrapText(){
+		XSSFCellStyle wrap=wb.createCellStyle();
+		wrap.setWrapText(true);
+		myCell.setCellStyle(wrap);
+	}	
+}