Mercurial > repos > basfplant > interproscan_to_excel

diff export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/RawToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author: basfplant
date: Tue, 05 Mar 2013 04:00:19 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/RawToExcel.java	Tue Mar 05 04:00:19 2013 -0500
@@ -0,0 +1,403 @@
+package be.cropdesign.iprscan;
+/**
+ * Converts the *.raw output file of the InterProScan program to an Excel file (*.xlsx)
+ * @author: Katrien Bernaerts and Domantas Motiejunas
+ * @date: 21/06/2012
+ * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
+ */
+import java.awt.Color;
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFCellStyle;
+import org.apache.poi.xssf.usermodel.XSSFColor;
+import org.apache.poi.xssf.usermodel.XSSFDataFormat;
+import org.apache.poi.xssf.usermodel.XSSFFont;
+import org.apache.poi.xssf.usermodel.XSSFRow;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+public class RawToExcel {
+	
+	private int colnr;
+	private int maxNr;
+	private int rownr;
+	private int maxNrOfGOTerms;
+
+	/**
+	 * fields needed to generate Excel
+	 */
+	private XSSFWorkbook wb;
+	private XSSFSheet sheet;
+	private XSSFRow myRow;
+	private XSSFCell myCell;
+	private String rawFile;
+	private boolean interproInfo; // this info is not standard in every .raw file, but can occur
+	private boolean GOInfo; // this info is not standard in every .raw file, but can occur
+	
+	private ArrayList<String> titles;
+		
+	/**
+	 * constructor
+	 * @param wb
+	 * @param sheet3
+	 * @param myRow
+	 * @param myCell
+	 * @param rawFile
+	 */
+	public RawToExcel(XSSFWorkbook wb, XSSFSheet sheet3,XSSFRow myRow, XSSFCell myCell, String rawFile){
+		this.wb = wb;
+		this.sheet = sheet3;
+		this.myRow = myRow;
+		this.myCell = myCell;
+		this.rawFile = rawFile;
+		maxNr = 0;
+		maxNrOfGOTerms = 0;
+		interproInfo = false;
+		GOInfo = false;
+		titles = new ArrayList<String>();
+		parseRaw();
+		addHeaderTitles();
+		formatStyle();
+	}
+	
+	/**********************************
+	 * parsing of the .raw file content
+	 **********************************/
+	/**
+	 * method to parse the data in the .raw file with a BufferedReader/FileReader. The parsed data
+	 * are written to Excel using the Apache POI library.
+	 */
+	public void parseRaw() {
+		BufferedReader reader = null;
+		try {
+			reader = new BufferedReader(new FileReader(rawFile));
+
+			String line = reader.readLine();
+
+			/*
+			 * start with rownr = 1 to write content because an empty row has to be reserved for the titles
+			 * The title row cannot be filled yet at this point because we first have to determine how many
+			 * columns are needed, and by consequence, how many titles. The problem is that different .raw files 
+			 * can have a different amount of columns.
+			 * For each line read with the buffered reader, the number of columns is compared to the maxColnr,
+			 * because sometimes not all lines of the same file have the same number of columns, e.g. because the
+			 * last column is empty. However, if for some rows the last column is empty, a column title is still needed
+			 * for the other rows. Therefore, the method getMaxColumns(colnr) is called for each line.
+			 */
+			rownr = 1;
+			while(line != null) {
+				String[] splits = line.split("\t"); // tab delimited file
+				myRow = sheet.createRow(rownr);
+				colnr = 0;
+				for (String string : splits){
+					myCell = myRow.createCell(colnr);
+					myCell.setCellValue(string);
+					
+					/*
+					 * interProScan info is not present in all raw files. For the creation of the right
+					 * header titles, it is important to know whether the parsed raw file contained 
+					 * interProScan info
+					 */
+					if (string.contains("IPR")){
+						interproInfo = true;
+					}
+					
+					/*
+					 * format the cell content as Integer for the columns protein length (colnr=2),
+					 * start (colnr=6) or end (colnr=7). To know which input only contains integers,
+					 * a regex is used. If only numbers or spaces are found in the input string, 
+					 * and if the input string is not empty, the corresponding Excel cell is 
+					 * formatted as Integer.
+					 * If the cell content is not formatted as number, sorting etc. via the filters
+					 * in the headers does not work correct.
+					 */
+					if (checkRegex("^([0-9]+\\s*)*$", string)){
+						myCell.setCellValue(Integer.parseInt(string));
+					}
+					
+					/*
+					 * create a cell style that formats numbers in scientific notation (exponential) 
+					 * for the score column (index 8)
+					 */
+					if (checkRegex("^[-+]?([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+))*$", string)){
+						formatExponential(string);
+					}
+					/*
+					 * split up the line with GO classification information further such 
+					 * that the splitted line of GO information can be stored in different 
+					 * Excel cells instead of all information concatenated into one cell (like it is
+					 * in the original .raw file generated by iprscan). First the information
+					 * in the splitted line is stored in a double array. In a second step (at the end of the
+					 * current method, the double array content is written to Excel.
+					 */
+					if (string.contains("GO:")){
+						GOInfo = true;
+						splitGOTerms(string);
+					}
+					getMax(colnr);
+					colnr++;
+				} 
+				line = reader.readLine();
+				rownr++;
+				myRow = sheet.createRow(rownr);
+			}
+		} catch (FileNotFoundException e) {
+				System.err.println("The .raw file cannot be found.");
+		} catch (IOException e) {
+			System.err.println("An input/output exception occurred while reading the .raw file.");
+		}
+		finally {
+			if (reader != null) {
+				try {
+					reader.close();
+				}
+				catch (IOException e){}
+			}
+		}
+	}
+	
+	/**
+	 * Helper method for parseRaw()
+	 * Split a string containing GO information. A typical string looks like:
+	 * "Molecular Function: sequence-specific DNA binding transcription factor activity (GO:0003700), Cellular Component: nucleus (GO:0005634), Biological Process: regulation of transcription, DNA-dependent (GO:0006355), Molecular Function: sequence-specific DNA binding (GO:0043565)"
+	 * or in more general terms:
+	 * "Title1: description1 (GO number1), Title2: description2 (GO number2), Title3: description3 (GO number3)"
+	 * The string should be splitted in three parts: title, description and GO number.
+	 * In fact we are dealing with comma delimited strings, but split may not happen at every comma, 
+	 * only when comma is preceded by (
+	 * Split may for example not happen at the comma in case of "Molecular Function: transferase activity, 
+	 * transferring phosphorus-containing groups (GO:0016772)"
+	 * In order to assure that the splitting occurs at the right place, the comma's at the places where splitting has to 
+	 * occur are replaced by the unique splitting character ;
+	 * @return
+	 */
+	public void splitGOTerms(String string){
+		if (string != null &&!string.isEmpty() && !string.trim().isEmpty()){
+
+			String modifiedString = string.replace("),", ");");
+			/*
+			 * the modifiedString is splitted at the ;
+			 */
+			String[] splitsClassification = modifiedString.split("; ");
+			int numberOfGoTerms = splitsClassification.length;
+			getMaxNrOfGOTerms(numberOfGoTerms);
+			for (int i = 0; i < splitsClassification.length; i++){
+				myCell = myRow.createCell(colnr);
+				myCell.setCellValue(splitsClassification[i].substring(0, splitsClassification[i].indexOf(':'))); //title
+				colnr++;
+				myCell = myRow.createCell(colnr);
+				myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf('(')+1, splitsClassification[i].indexOf(')'))); // GO term
+				colnr++;
+				myCell = myRow.createCell(colnr);
+				myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf(':')+2, splitsClassification[i].indexOf('('))); // description)
+				colnr++;
+			}
+		}
+	}
+
+	/**
+	 * Helper method for parseRaw()
+	 * Check whether a certain input string (stringToMatch) matches a certain regular expression.
+	 * @param regex
+	 * @param stringToMatch
+	 * @return
+	 */
+	public boolean checkRegex(String regex, String stringToMatch){
+		Pattern p = Pattern.compile(regex);
+		Matcher m = p.matcher(stringToMatch);
+		if(m.matches() && stringToMatch != null &&!stringToMatch.isEmpty() && !stringToMatch.trim().isEmpty()){
+			return true;
+		} else {
+			return false;
+		}
+	}
+	
+	/**********************************
+	 * header titles of the Excel sheet
+	 *********************************/
+	/**
+	 * the header titles are generated in the first row (index 0) of the spreadsheet
+	 * All the potential column titles are added to the titles ArrayList. The .raw file always contains
+	 * some fixed part (standard titles), but can also contain more information (titles which are not 
+	 * required for every .raw file).
+	 * This last category of headers is only added in case the information occurs in the .raw file.
+	 */
+	public void addHeaderTitles(){
+		/*
+		 *  standard titles
+		 */
+		titles.add("protein ID");
+		titles.add("protein crc64");
+		titles.add("protein length");
+		titles.add("match dbname");
+		titles.add("classification id");
+		titles.add("classification description");
+		titles.add("start");
+		titles.add("end");
+		titles.add("score");
+		titles.add("status");
+		titles.add("date");
+		/*
+		 * titles which are not required for every .raw file
+		 */
+		if (interproInfo){ // only if the .raw file contains "IPR" boolean interproInfo becomes true
+			titles.add("interpro ID");
+			titles.add("interpro name");
+		}
+
+		if (GOInfo){
+			for (int i = 0; i < maxNrOfGOTerms; i++){
+				titles.add("title");
+				titles.add("GO number");
+				titles.add("description");
+			}
+		}
+		
+		myRow = sheet.createRow(0);
+		// show the headers in the table
+		for (int i = 0; i < titles.size() ; i ++){
+			myCell = myRow.createCell(i);
+			myCell.setCellValue(titles.get(i)); // content of the headercell
+			formatHeader(); // color of the headercell
+		}
+	}
+	
+	/**
+	 * method to find the number of Excel columns needed for the GO terms information (title, description and GO number)n
+	 * The method looks for the maximal number of columns needed, because sometimes there are
+	 * rows with no GO information and other rows which have GO information.
+	 * As long as in one Excel sheet there is one row with GO information, the titles for the GO information
+	 * have to be showed correctly. The method getNrOfGOTerms helps in this task.
+	 */
+	public void getMaxNrOfGOTerms(int nr){
+		if (nr > maxNrOfGOTerms){
+			maxNrOfGOTerms = nr;
+		}
+	}
+
+	
+	/*******************************
+	 * formatting of the Excel sheet
+	 ******************************/
+	/**
+	 * method to apply all formatting to the Excel tabsheet containing the .raw data
+	 */
+	public void formatStyle(){
+		setAutoFilters();
+		autoSizeColumns();
+		freezeRow();
+	}
+	
+	/**
+	 * make autofilters of the column headers in Excel
+	 * ref: http://stackoverflow.com/questions/3114220/poi-auto-filter
+	 */ 
+	public void setAutoFilters(){
+		if (GOInfo){
+			/*
+			 * for some reason, the first empty column contains an autofilter in the case there is
+			 * GO information. However, only the columns which are not empty should have and autofilter.
+			 * In order to avoid this small bug, the autofilter method was changed
+			 * slightly: maxNr -1 instead of maxNr
+			 * method: public XSSFAutoFilter setAutoFilter(CellRangeAddress, range);
+			 */
+			sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr-1)))+"1"));
+		} else {
+			sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr)))+"1"));
+		}
+	}
+
+	/**
+	 * set the column width automatically to the width of the content
+	 */
+	public void autoSizeColumns(){
+		for(int column = 0; column < maxNr; column++){
+			sheet.autoSizeColumn(column);
+		}
+	}
+	
+	/**
+	 * helper method for setAutoFilters() and autoSizeColumns() to find the number 
+	 * of columns present in the tab sheet of the Excel file.
+	 * Sometimes, there are columns which are empty in a certain row, but filled in 
+	 * another row. We always have to cope with the most extreme situation. Therefore,
+	 * the maximum number of columns is determined.
+	 * @param nr
+	 */
+	public void getMax(int nr){
+		if (nr > maxNr){
+			maxNr = nr;
+		}
+	}
+	
+	/**
+	 * give the header cells a blue color and bold formatting
+	 */
+	public void formatHeader(){
+		XSSFCellStyle style = wb.createCellStyle();
+	    XSSFFont font = wb.createFont();
+	    font.setColor(new XSSFColor(Color.BLUE));
+	    font.setBold(true);
+	    style.setFont(font);
+	    myCell.setCellStyle(style);
+	}
+	
+	/**
+	 * freeze the header row
+	 * method: public void createFreezePane(int colSplit, int rowSplit, intleftmostColumn, int topRow)
+	 */
+	public void freezeRow(){
+		sheet.createFreezePane(0, 1, 0, 1);
+	}
+	
+	/**
+	 * create a cell style that formats numbers in scientific notation (exponential) 
+	 * for the score column (index 8)
+	 * Differentiate the text content (NA) from the exponential values via an if ... else
+	 * because otherwise the formatting as exponential value is not OK.
+	 */
+	public void formatExponential(String s){
+		if (s.equals("NA")){
+			myCell.setCellValue("NA");
+		}
+		else {
+			XSSFCellStyle cs = wb.createCellStyle();
+			XSSFDataFormat df = wb.createDataFormat();
+			cs.setDataFormat(df.getFormat("0.0E+0"));
+			myCell.setCellValue(Double.parseDouble(s));
+			myCell.setCellStyle(cs);
+		}
+	}
+	
+	/*********************
+	 * getters and setters
+	 *********************/
+	public void setColnr(int colnr) {
+		this.colnr = colnr;
+	}
+
+	public int getColnr() {
+		return colnr;
+	}
+
+	/**
+	 * @param nrOfGOTerms the nrOfGOTerms to set
+	 */
+	public void setNrOfGOTerms(int nrOfGOTerms) {
+		this.maxNrOfGOTerms = nrOfGOTerms;
+	}
+
+	/**
+	 * @return the nrOfGOTerms
+	 */
+	public int getNrOfGOTerms() {
+		return maxNrOfGOTerms;
+	}
+}
author	basfplant
date	Tue, 05 Mar 2013 04:00:19 -0500
parents
children