// Mercurial > repos > basfplant > interproscan_to_excel

package be.cropdesign.iprscan;
/**
 * Converts the *.raw output file of the InterProScan program to an Excel file (*.xlsx)
 * @author: Katrien Bernaerts and Domantas Motiejunas
 * @date: 21/06/2012
 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
 */
import java.awt.Color;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFColor;
import org.apache.poi.xssf.usermodel.XSSFDataFormat;
import org.apache.poi.xssf.usermodel.XSSFFont;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Writes the content of a tab-delimited InterProScan {@code .raw} file into one sheet of an
 * Excel workbook using Apache POI.
 * <p>
 * Constructing an instance performs the whole conversion: the data rows are parsed and written
 * (starting at row index 1), the header row (row index 0) is generated afterwards because the
 * number of columns is only known once all lines have been read, and finally the sheet gets
 * auto filters, auto-sized columns and a frozen header row.
 */
public class RawToExcel {

	/** Matches a (trimmed) field that consists of decimal digits only. */
	private static final Pattern INTEGER_PATTERN = Pattern.compile("^[0-9]+$");

	/** Matches a (trimmed) field holding exactly one number in scientific notation, e.g. 1.2e-15. */
	private static final Pattern EXPONENTIAL_PATTERN =
			Pattern.compile("^[-+]?[0-9]*\\.?[0-9]+[eE][-+]?[0-9]+$");

	/**
	 * Matches one GO entry of the form "Title: description (GO:1234567)", optionally preceded by
	 * the comma that separates it from the previous entry.
	 * Group 1 = title, group 2 = description, group 3 = GO number.
	 * The description is matched lazily up to the "(GO:...)" suffix, so commas and parentheses
	 * inside the description (e.g. "NAD(P)H") do not break the entry apart.
	 */
	private static final Pattern GO_TERM_PATTERN =
			Pattern.compile("(?:^|,)\\s*([^:,]+):\\s*(.*?)\\s*\\((GO:[0-9]+)\\)");

	/** index of the next column to be written in the current row */
	private int colnr;
	/** number of columns used by the widest data row */
	private int maxNr;
	/** index of the row that is currently written */
	private int rownr;
	/** largest number of GO terms found in a single field */
	private int maxNrOfGOTerms;

	/**
	 * fields needed to generate Excel
	 */
	private XSSFWorkbook wb;
	private XSSFSheet sheet;
	private XSSFRow myRow;
	private XSSFCell myCell;
	private String rawFile;
	private boolean interproInfo; // this info is not standard in every .raw file, but can occur
	private boolean GOInfo; // this info is not standard in every .raw file, but can occur

	private ArrayList<String> titles;

	/*
	 * Cell styles are created once and shared by all cells that need them: a workbook can only
	 * hold a limited number of distinct cell styles, so one style per cell fails on large files.
	 */
	private XSSFCellStyle headerStyle;
	private XSSFCellStyle exponentialStyle;

	/**
	 * Parses the .raw file into the given sheet, adds the header row and applies the formatting.
	 * @param wb workbook that owns the sheet (used to create fonts, styles and data formats)
	 * @param sheet3 sheet that receives the data
	 * @param myRow initial "current row" (replaced as soon as rows are created)
	 * @param myCell initial "current cell" (replaced as soon as cells are created)
	 * @param rawFile path of the tab-delimited .raw file to read
	 */
	public RawToExcel(XSSFWorkbook wb, XSSFSheet sheet3,XSSFRow myRow, XSSFCell myCell, String rawFile){
		this.wb = wb;
		this.sheet = sheet3;
		this.myRow = myRow;
		this.myCell = myCell;
		this.rawFile = rawFile;
		maxNr = 0;
		maxNrOfGOTerms = 0;
		interproInfo = false;
		GOInfo = false;
		titles = new ArrayList<String>();
		parseRaw();
		addHeaderTitles();
		formatStyle();
	}

	/**********************************
	 * parsing of the .raw file content
	 **********************************/
	/**
	 * Reads the .raw file line by line and writes every tab-separated field to the sheet.
	 * <p>
	 * Writing starts at row index 1 because row 0 is reserved for the titles. The title row
	 * cannot be filled yet: lines of the same file may have a different number of columns
	 * (String.split drops trailing empty fields, and GO information expands into a variable
	 * number of cells), so the widest row is tracked via getMax(int) while parsing.
	 * <p>
	 * I/O problems are reported on System.err; whatever was read up to that point stays in
	 * the sheet.
	 */
	public void parseRaw() {
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(rawFile));
			rownr = 1;
			String line;
			while ((line = reader.readLine()) != null) {
				myRow = sheet.createRow(rownr);
				colnr = 0;
				for (String field : line.split("\t")){ // tab delimited file
					writeField(field);
				}
				// after the loop colnr equals the number of columns used by this row
				getMax(colnr);
				rownr++;
			}
		} catch (FileNotFoundException e) {
			System.err.println("The .raw file cannot be found.");
		} catch (IOException e) {
			System.err.println("An input/output exception occurred while reading the .raw file.");
		}
		finally {
			if (reader != null) {
				try {
					reader.close();
				}
				catch (IOException ignored){
					// nothing sensible can be done when closing a reader fails
				}
			}
		}
	}

	/**
	 * Helper method for parseRaw()
	 * Writes one field of the current line at column colnr of the current row and advances
	 * colnr past the cell(s) that were written.
	 * @param field raw text of the field
	 */
	private void writeField(String field){
		myCell = myRow.createCell(colnr);
		myCell.setCellValue(field);

		/*
		 * InterPro info is not present in all raw files. For the creation of the right
		 * header titles it is remembered whether any field contained "IPR".
		 */
		if (field.contains("IPR")){
			interproInfo = true;
		}

		/*
		 * Numeric content is stored as a number, otherwise sorting via the header filters
		 * does not work correctly in Excel.
		 */
		String trimmed = field.trim();
		if (INTEGER_PATTERN.matcher(trimmed).matches()){
			try {
				myCell.setCellValue(Integer.parseInt(trimmed));
			} catch (NumberFormatException tooLarge){
				// digit-only value that does not fit an int (e.g. an all-numeric crc64):
				// the text written above is kept unchanged
			}
		} else if (EXPONENTIAL_PATTERN.matcher(trimmed).matches()){
			formatExponential(trimmed);
		}

		/*
		 * GO classification information is spread over separate cells (title, GO number,
		 * description per term) instead of one concatenated cell. The cells overwrite the
		 * raw text starting at the current column.
		 */
		if (field.contains("GO:")){
			GOInfo = true;
			int firstGoColumn = colnr;
			splitGOTerms(field);
			if (colnr > firstGoColumn){
				return; // colnr already points behind the last GO cell
			}
		}
		colnr++;
	}

	/**
	 * Helper method for parseRaw()
	 * Split a string containing GO information. A typical string looks like:
	 * "Molecular Function: sequence-specific DNA binding transcription factor activity (GO:0003700), Cellular Component: nucleus (GO:0005634), Biological Process: regulation of transcription, DNA-dependent (GO:0006355)"
	 * or in more general terms:
	 * "Title1: description1 (GO number1), Title2: description2 (GO number2)"
	 * For every term three cells are written in the current row, starting at column colnr:
	 * title, GO number and description (colnr is advanced by three per term).
	 * Terms are recognised by their "(GO:digits)" suffix, so commas or parentheses inside a
	 * description (e.g. "transferase activity, transferring phosphorus-containing groups" or
	 * "NAD(P)H") stay part of that description. Text that does not have this shape is skipped.
	 * @param string the field holding the GO information; null or blank input is ignored
	 */
	public void splitGOTerms(String string){
		if (string == null || string.trim().isEmpty()){
			return;
		}
		Matcher term = GO_TERM_PATTERN.matcher(string);
		int numberOfGoTerms = 0;
		while (term.find()){
			myCell = myRow.createCell(colnr++);
			myCell.setCellValue(term.group(1).trim()); // title
			myCell = myRow.createCell(colnr++);
			myCell.setCellValue(term.group(3)); // GO term
			myCell = myRow.createCell(colnr++);
			myCell.setCellValue(term.group(2).trim()); // description
			numberOfGoTerms++;
		}
		getMaxNrOfGOTerms(numberOfGoTerms);
	}

	/**
	 * Check whether a certain input string (stringToMatch) matches a certain regular expression.
	 * @param regex the regular expression the whole string has to match
	 * @param stringToMatch the string to test
	 * @return true if stringToMatch is neither null nor blank and matches regex entirely
	 */
	public boolean checkRegex(String regex, String stringToMatch){
		// test for null/blank first: creating a matcher for null would throw
		if (stringToMatch == null || stringToMatch.trim().isEmpty()){
			return false;
		}
		return Pattern.matches(regex, stringToMatch);
	}

	/**********************************
	 * header titles of the Excel sheet
	 *********************************/
	/**
	 * Generates the header titles in the first row (index 0) of the spreadsheet.
	 * The .raw file always contains some fixed part (standard titles), but can also contain
	 * more information (InterPro and GO columns). The titles of this last category are only
	 * added in case the information occurred in the .raw file.
	 * The title list is rebuilt on every call, so calling the method again does not
	 * duplicate titles.
	 */
	public void addHeaderTitles(){
		titles.clear();
		/*
		 *  standard titles
		 */
		titles.add("protein ID");
		titles.add("protein crc64");
		titles.add("protein length");
		titles.add("match dbname");
		titles.add("classification id");
		titles.add("classification description");
		titles.add("start");
		titles.add("end");
		titles.add("score");
		titles.add("status");
		titles.add("date");
		/*
		 * titles which are not required for every .raw file
		 */
		if (interproInfo){ // only if the .raw file contains "IPR" boolean interproInfo becomes true
			titles.add("interpro ID");
			titles.add("interpro name");
		}

		if (GOInfo){
			// same order as the cells written by splitGOTerms
			for (int i = 0; i < maxNrOfGOTerms; i++){
				titles.add("title");
				titles.add("GO number");
				titles.add("description");
			}
		}

		myRow = sheet.createRow(0);
		// show the headers in the table
		for (int i = 0; i < titles.size() ; i ++){
			myCell = myRow.createCell(i);
			myCell.setCellValue(titles.get(i)); // content of the headercell
			formatHeader(); // color of the headercell
		}
	}

	/**
	 * Remembers the largest number of GO terms seen so far. Some rows have no GO information
	 * and others have several terms; the header needs one title triple per term of the
	 * richest field.
	 * @param nr number of GO terms found in one field
	 */
	public void getMaxNrOfGOTerms(int nr){
		if (nr > maxNrOfGOTerms){
			maxNrOfGOTerms = nr;
		}
	}


	/*******************************
	 * formatting of the Excel sheet
	 ******************************/
	/**
	 * method to apply all formatting to the Excel tabsheet containing the .raw data
	 */
	public void formatStyle(){
		setAutoFilters();
		autoSizeColumns();
		freezeRow();
	}

	/**
	 * Puts auto filters on the header row, from the first up to the last used column.
	 * The range is built from column indices rather than from a column letter, so sheets
	 * wider than 26 columns (beyond column Z) are handled as well.
	 */
	public void setAutoFilters(){
		int columns = columnCount();
		if (columns == 0){
			return; // nothing to filter
		}
		sheet.setAutoFilter(new org.apache.poi.ss.util.CellRangeAddress(0, 0, 0, columns - 1));
	}

	/**
	 * set the column width automatically to the width of the content
	 */
	public void autoSizeColumns(){
		int columns = columnCount();
		for(int column = 0; column < columns; column++){
			sheet.autoSizeColumn(column);
		}
	}

	/**
	 * helper method for setAutoFilters() and autoSizeColumns()
	 * @return the number of columns in use: the width of the widest data row or the number
	 * of header titles, whichever is larger
	 */
	private int columnCount(){
		return Math.max(maxNr, titles.size());
	}

	/**
	 * Remembers the largest column count seen so far. Sometimes there are columns which are
	 * empty in a certain row, but filled in another row, therefore the maximum is tracked.
	 * parseRaw() calls this once per line with the number of columns that line occupies.
	 * @param nr number of columns used by a row
	 */
	public void getMax(int nr){
		if (nr > maxNr){
			maxNr = nr;
		}
	}

	/**
	 * give the current cell (a header cell) a blue color and bold formatting
	 */
	public void formatHeader(){
		if (headerStyle == null){
			XSSFFont font = wb.createFont();
			font.setColor(new XSSFColor(Color.BLUE));
			font.setBold(true);
			headerStyle = wb.createCellStyle();
			headerStyle.setFont(font);
		}
		myCell.setCellStyle(headerStyle);
	}

	/**
	 * freeze the header row
	 * method: public void createFreezePane(int colSplit, int rowSplit, int leftmostColumn, int topRow)
	 */
	public void freezeRow(){
		sheet.createFreezePane(0, 1, 0, 1);
	}

	/**
	 * Stores a value in the current cell as a number shown in scientific (exponential)
	 * notation, as used for the score column.
	 * The text "NA" and any other text that is not a parsable number are written as plain
	 * text without the number format.
	 * @param s text of the value to store
	 */
	public void formatExponential(String s){
		if (s.equals("NA")){
			myCell.setCellValue("NA");
			return;
		}
		double value;
		try {
			value = Double.parseDouble(s);
		} catch (NumberFormatException notANumber){
			myCell.setCellValue(s);
			return;
		}
		if (exponentialStyle == null){
			XSSFDataFormat df = wb.createDataFormat();
			exponentialStyle = wb.createCellStyle();
			exponentialStyle.setDataFormat(df.getFormat("0.0E+0"));
		}
		myCell.setCellValue(value);
		myCell.setCellStyle(exponentialStyle);
	}

	/*********************
	 * getters and setters
	 *********************/
	/**
	 * @param colnr index of the next column to write
	 */
	public void setColnr(int colnr) {
		this.colnr = colnr;
	}

	/**
	 * @return index of the next column to write
	 */
	public int getColnr() {
		return colnr;
	}

	/**
	 * @param nrOfGOTerms the maximal number of GO terms to set
	 */
	public void setNrOfGOTerms(int nrOfGOTerms) {
		this.maxNrOfGOTerms = nrOfGOTerms;
	}

	/**
	 * @return the maximal number of GO terms found in one field
	 */
	public int getNrOfGOTerms() {
		return maxNrOfGOTerms;
	}
}
// author	basfplant
// date	Tue, 05 Mar 2013 04:00:19 -0500
// parents
// children