// Mercurial > repos > basfplant > interproscan_to_excel

package be.cropdesign.iprscan;
/**
 * Converts the *.xml output file of the InterProScan program to an Excel file (*.xlsx)
 * @author: Katrien Bernaerts and Domantas Motiejunas
 * @date: 21/06/2012
 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
 */
import java.awt.Color;
import java.io.File;
import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFColor;
import org.apache.poi.xssf.usermodel.XSSFFont;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

public class XMLToExcel {
	/**
	 * fields needed to generate Excel
	 */
	/** Workbook used to create every cell style and font. */
	private XSSFWorkbook wb;
	/** Sheet that receives the converted content. */
	private XSSFSheet sheet;
	/** Row that is currently being filled. */
	private XSSFRow myRow;
	/** Cell written most recently; the formatting methods style this cell. */
	private XSSFCell myCell;
	/** Path of the XML file to convert. */
	private String XMLFile;

	/**
	 * Fields/counters
	 */
	/** Index of the row that is currently being filled. */
	private int rownr;
	/** Base column index; the cells of a row are placed relative to it. */
	private int colnr;
	/** Highest number of filled columns seen so far, used by setColumnWidth(). */
	private int maxColnr;

	/*
	 * Cell styles, created on first use and then shared by all cells that need them.
	 * Creating a fresh style (and font) for every single cell makes the style table of the
	 * workbook grow with the size of the input, and the workbook only accepts a limited
	 * number of cell styles.
	 */
	private XSSFCellStyle titleStyle;
	private XSSFCellStyle titleBorderStyle;
	private XSSFCellStyle upperBorderStyle;
	private XSSFCellStyle boldStyle;
	private XSSFCellStyle italicRightStyle;
	private XSSFCellStyle wrapStyle;

	/**
	 * Constructor. Stores the Excel objects and the path of the XML file, converts the XML file
	 * immediately (makeDOMParser) and finally adapts the column widths (setColumnWidth).
	 * @param wb workbook used to create styles and fonts
	 * @param sheet sheet that is filled with the content of the XML file
	 * @param myRow initial row reference
	 * @param myCell initial cell reference
	 * @param XMLFile path of the XML file to convert
	 */
	public XMLToExcel(XSSFWorkbook wb, XSSFSheet sheet,XSSFRow myRow, XSSFCell myCell, String XMLFile){
		this.wb = wb;
		this.sheet = sheet;
		this.myRow = myRow;
		this.myCell = myCell;
		this.XMLFile = XMLFile;
		rownr = 0;
		colnr = 0;
		makeDOMParser();
		setColumnWidth();
	}

	/**
	 * method to make a DOM parser. This method calls the method parseProteins, which parses the protein nodes, which further calls methods to parse deeper nodes.
	 * Problems (missing file, I/O, parsing or parser configuration errors) are reported on System.err and are not rethrown.
	 */
	public void makeDOMParser(){
		try {
			File file = new File(XMLFile);
			if (file.exists()) {
				// NOTE(review): the factory is used with its default settings, so DOCTYPE declarations and
				// external entities are processed. If the XML can come from an untrusted source, consider
				// disabling them (XXE hardening) after checking that the input files do not rely on a DTD.
				DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
				DocumentBuilder db = dbf.newDocumentBuilder();
				Document doc = db.parse(file);
				doc.getDocumentElement().normalize();

				// parse protein nodes, the first level subnodes of root node interpro_matches
				parseProteins(doc);
			} else {
				System.err.println("The XML file you try to convert to Excel does not exist.");
			}
		} catch (IOException ioe) {
			System.err.println("Input/output exception: " + ioe);
		} catch (SAXException saxe) {
			System.err.println("SAX parsing exception: " + saxe);
		} catch (FactoryConfigurationError fce) {
			System.err.println("Factory configuration error: " + fce);
		} catch (ParserConfigurationException pce) {
			System.err.println("Parser configuration exception: " + pce);
		}
	}

	/**************************
	 * parse nodes and subnodes
	 *************************/
	/**
	 * method parseProteins to parse the protein nodes, the first level subnodes of root node interpro_matches.
	 * Every protein gets a title row ("PROTEIN" + id), followed by the rows written by parseInterpro.
	 * @param doc the parsed XML document
	 */
	public void parseProteins(Document doc){
		NodeList proteinLst = doc.getElementsByTagName("protein");
		if (proteinLst == null || proteinLst.getLength() == 0) {
			System.out.print("XML file empty.");
			return;
		}
		for (int i = 0; i < proteinLst.getLength(); i++) {
			Node proteinNode = proteinLst.item(i);
			if (proteinNode.getNodeType() != Node.ELEMENT_NODE) {
				continue;
			}
			Element proteinElement = (Element) proteinNode;
			Attr proteinId = proteinElement.getAttributeNode("id");

			// the row index is passed as int: a cast to short would wrap around after row 32767
			myRow = sheet.createRow(rownr);
			makeCellText(colnr, "PROTEIN");
			formatTitle();
			makeCellText(colnr+1, proteinId.getValue());
			formatTitle();
			makeRow();

			// parse the child nodes of the protein nodes, namely interpro
			parseInterpro(proteinNode);
		}
	}

	/**
	 * parse interpro nodes, which are child nodes of the protein nodes.
	 * For every interpro node the following is written: a bold row with id and name, the matches table,
	 * the parent row, the rows child_list, found_in and contains, and the GO terms.
	 * @param proteinNode protein node whose interpro child nodes are parsed
	 */
	public void parseInterpro(Node proteinNode){
		NodeList interproLst = proteinNode.getChildNodes();
		if (interproLst == null || interproLst.getLength() == 0) {
			return;
		}
		for (int j = 0; j < interproLst.getLength(); j++) {
			Node interproNode = interproLst.item(j);
			if (!isElementNamed(interproNode, "interpro")) {
				continue;
			}
			Element interproElement = (Element) interproNode;
			Attr interproId = interproElement.getAttributeNode("id");
			makeCellText(colnr, interproId.getValue());
			makeBold();
			Attr interproName = interproElement.getAttributeNode("name");
			makeCellText(colnr+1, interproName.getValue());
			makeBold();
			Attr interproParentId = interproElement.getAttributeNode("parent_id");
			makeRow();

			// match nodes, child nodes of the interpro node
			parseMatchInterproNode(interproNode);

			// parent information from the interpro attribute parent_id (optional attribute)
			makeCellText(colnr, "parent ");
			makeItalicRight();
			makeCellText(colnr+1, interproParentId != null ? interproParentId.getValue() : "No parent");
			setUpperBorder();
			makeRow();

			// child_list, found_in and contains subnodes of interpro
			parseInterproChild("child_list ", "No children", interproNode);
			parseInterproChild("found_in ", "No entries", interproNode);
			parseInterproChild("contains ", "No entries", interproNode);

			// GO-terms subnode of interpro
			parseGOterms(interproNode);
		}
		// empty row after the last interpro entry of the protein
		makeRow();
	}

	/**
	 * method parseMatch to parse through the match nodes.
	 * The match nodes are child nodes of the interpro nodes.
	 * Writes the table headers, one block of rows per match node and finally a border line under the table.
	 * @param interproNode interpro node whose match child nodes are parsed
	 */
	public void parseMatchInterproNode(Node interproNode){
		makeTableHeaders();
		NodeList interproChildLst = interproNode.getChildNodes();
		for (int k = 0; k < interproChildLst.getLength(); k++) {
			Node interproChildNode = interproChildLst.item(k);
			if (isElementNamed(interproChildNode, "match")) {
				getMatchAttributes((Element) interproChildNode);
				// parse the location information of the matches. Location nodes are subnodes of match nodes.
				parseLocationMatch(interproChildNode);
			}
		}
		/*
		 * border line under table
		 */
		for (int i = 1; i < 8; i++){
			makeCellText(i, " ");
			setUpperBorder();
		}
	}

	/**
	 * method parseLocation to parse through the location nodes, which are child nodes of the match node.
	 * The match nodes can be one of the childnode types of the interpro node or childnodes of the
	 * protein node.
	 * Every element child of the given node is handled as a location node.
	 * @param node match node whose child nodes are parsed
	 */
	public void parseLocationMatch(Node node){
		NodeList locationLst = node.getChildNodes();
		for (int l = 0; l < locationLst.getLength(); l++) {
			Node locationNode = locationLst.item(l);
			if (locationNode.getNodeType() == Node.ELEMENT_NODE) {
				getLocationAttributes((Element) locationNode);
			}
		}
	}

	/**
	 * method parseInterProChild to parse the nodes child_list, found_in and contains and their contents. These nodes are childnodes of the interpro node.
	 * The label is written in column 0 and the placeholder text in column 1; the ipr_ref values that are
	 * found are written from column 1 onwards, so the first one replaces the placeholder.
	 * @param nodeName label for column 0; without leading/trailing whitespace it is the name of the node to look for
	 * @param emptyNode text shown when the node has no references
	 * @param interproNode interpro node whose child nodes are searched
	 */
	public void parseInterproChild(String nodeName, String emptyNode, Node interproNode){
		makeCellText(0, nodeName);
		makeItalicRight();
		makeCellText(1, emptyNode);

		String tagName = nodeName.trim();
		NodeList interproChildLst = interproNode.getChildNodes();
		for (int k = 0; k < interproChildLst.getLength(); k++) {
			Node interproChildNode = interproChildLst.item(k);
			if (!isElementNamed(interproChildNode, tagName)) {
				continue;
			}
			// parse through the rel_ref nodes which are childnodes of contains, found_in and child_list
			NodeList relRefLst = interproChildNode.getChildNodes();
			for (int s = 0; s < relRefLst.getLength(); s++) {
				Node relRefNode = relRefLst.item(s);
				if (relRefNode.getNodeType() == Node.ELEMENT_NODE) {
					Attr iprRef = ((Element) relRefNode).getAttributeNode("ipr_ref");
					colnr++;
					makeCellText(colnr, iprRef.getValue());
				}
			}
			// colnr is now the index of the last column that was filled
			getMaxColumns(colnr + 1);
			colnr = 0;//reset colnr to 0 in order to let the other items start in the column with id 0
		}
		makeRow();
	}

	/**
	 * parse classification node, which is a subnode of the interpro node.
	 * Writes one row per classification node: category in column 1, id in column 2 and description in column 3.
	 * Column 1 of the first row keeps the text "no GO terms" when nothing replaces it.
	 * @param interproNode interpro node whose classification child nodes are parsed
	 */
	public void parseGOterms(Node interproNode){
		makeCellText(0, "GO terms ");
		makeItalicRight();
		makeCellText(1, "no GO terms");
		int rowGO = rownr;

		NodeList interproChildLst = interproNode.getChildNodes();
		for (int k = 0; k < interproChildLst.getLength(); k++) {
			Node interproChildNode = interproChildLst.item(k);
			if (!isElementNamed(interproChildNode, "classification")) {
				continue;
			}
			Element classificationElement = (Element) interproChildNode;
			Attr classificationId = classificationElement.getAttributeNode("id");
			makeCellText(colnr+2, classificationId.getValue());
			// parse through the child nodes of a classification node
			NodeList classificationLst = classificationElement.getChildNodes();
			for (int l = 0; l < classificationLst.getLength(); l++) {
				Node classificationChildNode = classificationLst.item(l);
				if (isElementNamed(classificationChildNode, "category")) {
					makeCellText(colnr+1, classificationChildNode.getTextContent());
				}
				if (isElementNamed(classificationChildNode, "description")) {
					makeCellText(colnr+3, classificationChildNode.getTextContent());
				}
			}
			makeRow();
		}
		// the GO term rows use the columns up to and including colnr+3
		getMaxColumns(colnr + 4);
		/*
		 * insert empty row after the row GO term
		 */
		if (sheet.getRow(rowGO).getCell(1).toString().equals("no GO terms")){
			makeRow();
		}
		makeRow();
	}

	/**
	 * Get the attribute values of the match elements and fill them in in Excel
	 * (dbname, id and name in the three columns after the base column).
	 * @param matchElement match element to read the attributes from
	 */
	public void getMatchAttributes(Element matchElement){
		Attr matchDbname = matchElement.getAttributeNode("dbname");
		makeCellText(colnr+1, matchDbname.getValue());
		Attr matchId = matchElement.getAttributeNode("id");
		makeCellText(colnr+2, matchId.getValue());
		Attr matchName = matchElement.getAttributeNode("name");
		makeCellText(colnr+3, matchName.getValue());
		wrapText();//if the cell content does not fit the width of the column, the text is automatically wrapped
	}

	/**
	 * Get the attribute values of the location elements and fill them in in Excel.
	 * Score and status are written as text, start and end as numbers; afterwards a new row is started.
	 * @param locationElement location element to read the attributes from
	 */
	public void getLocationAttributes(Element locationElement){
		// get the desired attribute values of the location node in the desired order
		Attr locationScore = locationElement.getAttributeNode("score");
		Attr locationStart = locationElement.getAttributeNode("start");
		Attr locationEnd = locationElement.getAttributeNode("end");
		Attr locationStatus = locationElement.getAttributeNode("status");
		makeCellText(colnr+4, locationScore.getValue());
		makeCellNumber(colnr+5, Integer.parseInt(locationStart.getValue()));
		makeCellNumber(colnr+6, Integer.parseInt(locationEnd.getValue()));
		makeCellText(colnr+7, locationStatus.getValue());
		makeRow();
	}

	/**
	 * Helper to recognise an element node with a given name.
	 * @param node node to test
	 * @param name expected node name
	 * @return true if node is an element node and its node name equals name
	 */
	private static boolean isElementNamed(Node node, String name){
		return node.getNodeType() == Node.ELEMENT_NODE && node.getNodeName().equals(name);
	}

	/******************************
	 * make rows and cells in Excel
	 ******************************/
	/**
	 * method to fill cells of a row in an Excel sheet with text content
	 * @param colnr index of the column in the current row
	 * @param content text to put in the cell
	 */
	public void makeCellText(int colnr, String content){
		myCell = myRow.createCell(colnr);
		myCell.setCellValue(content);
	}

	/**
	 * method to fill cells of a row in an Excel sheet with number formatted content
	 * @param colnr index of the column in the current row
	 * @param content number to put in the cell
	 */
	public void makeCellNumber(int colnr, int content){
		myCell = myRow.createCell(colnr);
		myCell.setCellValue(content);
	}

	/**
	 * method to make a new row in an Excel sheet; the new row becomes the current row
	 */
	public void makeRow(){
		rownr++;
		// the row index is passed as int: a cast to short would wrap around after row 32767
		myRow = sheet.createRow(rownr);
	}

	/***********************************
	 * table headers and titles in Excel
	 ***********************************/
	/**
	 * method to make table headers for the matches table
	 */
	public void makeTableHeaders(){
		// headers
		makeCellText(0, "matches ");
		makeItalicRight();
		//headers of table
		String[] headers = {"Method", "Identifier", "Description", "Score", "Start", "End", "Status"};
		for (int i =0; i < headers.length; i++){
			makeCellText(colnr+i+1, headers[i]);
			setTitleBorder();
		}
		getMaxColumns(colnr + headers.length + 1); // needed for method setColumnWidth()
		makeRow();
		// end headers
	}

	/**
	 * format the protein titles (blue, bold, 12 points)
	 */
	public void formatTitle(){
		if (titleStyle == null) {
			XSSFFont font = wb.createFont();
			font.setColor(new XSSFColor(Color.BLUE));
			font.setBold(true);
			font.setFontHeightInPoints((short)12);
			titleStyle = wb.createCellStyle();
			titleStyle.setFont(font);
		}
		myCell.setCellStyle(titleStyle);
	}

	/**
	 * provide the table headers of the matches part with border lines
	 * (thin line above, double line below)
	 */
	public void setTitleBorder(){
		if (titleBorderStyle == null) {
			titleBorderStyle = wb.createCellStyle();
			titleBorderStyle.setBorderBottom(XSSFCellStyle.BORDER_DOUBLE);
			titleBorderStyle.setBorderTop(XSSFCellStyle.BORDER_THIN);
		}
		myCell.setCellStyle(titleBorderStyle);
	}

	/**
	 * make a border (thin line above the current cell)
	 */
	public void setUpperBorder(){
		if (upperBorderStyle == null) {
			upperBorderStyle = wb.createCellStyle();
			upperBorderStyle.setBorderTop(XSSFCellStyle.BORDER_THIN);
		}
		myCell.setCellStyle(upperBorderStyle);
	}

	/************
	 * formatting
	 ***********/
	/**
	 * format text in bold
	 */
	public void makeBold(){
		if (boldStyle == null) {
			XSSFFont font = wb.createFont();
			font.setBold(true);
			boldStyle = wb.createCellStyle();
			boldStyle.setFont(font);
		}
		myCell.setCellStyle(boldStyle);
	}

	/**
	 * format text as italic and align right
	 */
	public void makeItalicRight(){
		if (italicRightStyle == null) {
			XSSFFont font = wb.createFont();
			font.setItalic(true);
			italicRightStyle = wb.createCellStyle();
			italicRightStyle.setFont(font);
			italicRightStyle.setAlignment(CellStyle.ALIGN_RIGHT);
		}
		myCell.setCellStyle(italicRightStyle);
	}

	/**
	 * adapt column width depending on the content, except for some cases where the column width would become too big
	 */
	public void setColumnWidth(){
		for (int col = 0; col < maxColnr; col++){
			sheet.autoSizeColumn(col);
		}
		//for some columns, fixed size is desired above autosize
		sheet.setColumnWidth(1, 256*15);
		sheet.setColumnWidth(3, 256*27);
	}

	/**
	 * Helper method for setColumnWidth()
	 * Method keeps track of the number of columns which is filled with content
	 * This number is needed to loop through the columns and autofit their width with method setColumnWidth()
	 * @param colnr number of columns filled in a row; stored when it is bigger than the maximum seen so far
	 */
	public void getMaxColumns(int colnr){
		if (colnr > maxColnr){
			maxColnr = colnr;
		}
	}

	/**
	 * if text does not fit on one line, this method wraps the text
	 */
	public void wrapText(){
		if (wrapStyle == null) {
			wrapStyle = wb.createCellStyle();
			wrapStyle.setWrapText(true);
		}
		myCell.setCellStyle(wrapStyle);
	}
}
// author	basfplant
// date	Tue, 05 Mar 2013 04:00:19 -0500
// parents
// children