Mercurial > repos > basfplant > interproscan_to_excel
diff export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/RawToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author | basfplant |
---|---|
date | Tue, 05 Mar 2013 04:00:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/RawToExcel.java Tue Mar 05 04:00:19 2013 -0500 @@ -0,0 +1,403 @@ +package be.cropdesign.iprscan; +/** + * Converts the *.raw output file of the InterProScan program to an Excel file (*.xlsx) + * @author: Katrien Bernaerts and Domantas Motiejunas + * @date: 21/06/2012 + * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium + */ +import java.awt.Color; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.poi.xssf.usermodel.XSSFCell; +import org.apache.poi.xssf.usermodel.XSSFCellStyle; +import org.apache.poi.xssf.usermodel.XSSFColor; +import org.apache.poi.xssf.usermodel.XSSFDataFormat; +import org.apache.poi.xssf.usermodel.XSSFFont; +import org.apache.poi.xssf.usermodel.XSSFRow; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +public class RawToExcel { + + private int colnr; + private int maxNr; + private int rownr; + private int maxNrOfGOTerms; + + /** + * fields needed to generate Excel + */ + private XSSFWorkbook wb; + private XSSFSheet sheet; + private XSSFRow myRow; + private XSSFCell myCell; + private String rawFile; + private boolean interproInfo; // this info is not standard in every .raw file, but can occur + private boolean GOInfo; // this info is not standard in every .raw file, but can occur + + private ArrayList<String> titles; + + /** + * constructor + * @param wb + * @param sheet3 + * @param myRow + * @param myCell + * @param rawFile + */ + public RawToExcel(XSSFWorkbook wb, XSSFSheet sheet3,XSSFRow myRow, XSSFCell myCell, String rawFile){ + this.wb = wb; + this.sheet = sheet3; + this.myRow = myRow; + this.myCell = myCell; + this.rawFile = rawFile; + maxNr = 0; + maxNrOfGOTerms = 0; + interproInfo = false; + GOInfo = false; + titles = new ArrayList<String>(); + parseRaw(); + addHeaderTitles(); + formatStyle(); + } + + /********************************** + * parsing of the .raw file content + **********************************/ + /** + * method to parse the data in the .raw file with a BufferedReader/FileReader. The parsed data + * are written to Excel using the Apache POI library. + */ + public void parseRaw() { + BufferedReader reader = null; + try { + reader = new BufferedReader(new FileReader(rawFile)); + + String line = reader.readLine(); + + /* + * start with rownr = 1 to write content because an empty row has to be reserved for the titles + * The title row cannot be filled yet at this point because we first have to determine how many + * columns are needed, and by consequence, how many titles. The problem is that different .raw files + * can have a different amount of columns. + * For each line read with the buffered reader, the number of columns is compared to the maxColnr, + * because sometimes not all lines of the same file have the same number of columns, e.g. because the + * last column is empty. However, if for some rows the last column is empty, a column title is still needed + * for the other rows. Therefore, the method getMaxColumns(colnr) is called for each line. + */ + rownr = 1; + while(line != null) { + String[] splits = line.split("\t"); // tab delimited file + myRow = sheet.createRow(rownr); + colnr = 0; + for (String string : splits){ + myCell = myRow.createCell(colnr); + myCell.setCellValue(string); + + /* + * interProScan info is not present in all raw files. For the creation of the right + * header titles, it is important to know whether the parsed raw file contained + * interProScan info + */ + if (string.contains("IPR")){ + interproInfo = true; + } + + /* + * format the cell content as Integer for the columns protein length (colnr=2), + * start (colnr=6) or end (colnr=7). To know which input only contains integers, + * a regex is used. If only numbers or spaces are found in the input string, + * and if the input string is not empty, the corresponding Excel cell is + * formatted as Integer. + * If the cell content is not formatted as number, sorting etc. via the filters + * in the headers does not work correct. + */ + if (checkRegex("^([0-9]+\\s*)*$", string)){ + myCell.setCellValue(Integer.parseInt(string)); + } + + /* + * create a cell style that formats numbers in scientific notation (exponential) + * for the score column (index 8) + */ + if (checkRegex("^[-+]?([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+))*$", string)){ + formatExponential(string); + } + /* + * split up the line with GO classification information further such + * that the splitted line of GO information can be stored in different + * Excel cells instead of all information concatenated into one cell (like it is + * in the original .raw file generated by iprscan). First the information + * in the splitted line is stored in a double array. In a second step (at the end of the + * current method, the double array content is written to Excel. + */ + if (string.contains("GO:")){ + GOInfo = true; + splitGOTerms(string); + } + getMax(colnr); + colnr++; + } + line = reader.readLine(); + rownr++; + myRow = sheet.createRow(rownr); + } + } catch (FileNotFoundException e) { + System.err.println("The .raw file cannot be found."); + } catch (IOException e) { + System.err.println("An input/output exception occurred while reading the .raw file."); + } + finally { + if (reader != null) { + try { + reader.close(); + } + catch (IOException e){} + } + } + } + + /** + * Helper method for parseRaw() + * Split a string containing GO information. A typical string looks like: + * "Molecular Function: sequence-specific DNA binding transcription factor activity (GO:0003700), Cellular Component: nucleus (GO:0005634), Biological Process: regulation of transcription, DNA-dependent (GO:0006355), Molecular Function: sequence-specific DNA binding (GO:0043565)" + * or in more general terms: + * "Title1: description1 (GO number1), Title2: description2 (GO number2), Title3: description3 (GO number3)" + * The string should be splitted in three parts: title, description and GO number. + * In fact we are dealing with comma delimited strings, but split may not happen at every comma, + * only when comma is preceded by ( + * Split may for example not happen at the comma in case of "Molecular Function: transferase activity, + * transferring phosphorus-containing groups (GO:0016772)" + * In order to assure that the splitting occurs at the right place, the comma's at the places where splitting has to + * occur are replaced by the unique splitting character ; + * @return + */ + public void splitGOTerms(String string){ + if (string != null &&!string.isEmpty() && !string.trim().isEmpty()){ + + String modifiedString = string.replace("),", ");"); + /* + * the modifiedString is splitted at the ; + */ + String[] splitsClassification = modifiedString.split("; "); + int numberOfGoTerms = splitsClassification.length; + getMaxNrOfGOTerms(numberOfGoTerms); + for (int i = 0; i < splitsClassification.length; i++){ + myCell = myRow.createCell(colnr); + myCell.setCellValue(splitsClassification[i].substring(0, splitsClassification[i].indexOf(':'))); //title + colnr++; + myCell = myRow.createCell(colnr); + myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf('(')+1, splitsClassification[i].indexOf(')'))); // GO term + colnr++; + myCell = myRow.createCell(colnr); + myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf(':')+2, splitsClassification[i].indexOf('('))); // description) + colnr++; + } + } + } + + /** + * Helper method for parseRaw() + * Check whether a certain input string (stringToMatch) matches a certain regular expression. + * @param regex + * @param stringToMatch + * @return + */ + public boolean checkRegex(String regex, String stringToMatch){ + Pattern p = Pattern.compile(regex); + Matcher m = p.matcher(stringToMatch); + if(m.matches() && stringToMatch != null &&!stringToMatch.isEmpty() && !stringToMatch.trim().isEmpty()){ + return true; + } else { + return false; + } + } + + /********************************** + * header titles of the Excel sheet + *********************************/ + /** + * the header titles are generated in the first row (index 0) of the spreadsheet + * All the potential column titles are added to the titles ArrayList. The .raw file always contains + * some fixed part (standard titles), but can also contain more information (titles which are not + * required for every .raw file). + * This last category of headers is only added in case the information occurs in the .raw file. + */ + public void addHeaderTitles(){ + /* + * standard titles + */ + titles.add("protein ID"); + titles.add("protein crc64"); + titles.add("protein length"); + titles.add("match dbname"); + titles.add("classification id"); + titles.add("classification description"); + titles.add("start"); + titles.add("end"); + titles.add("score"); + titles.add("status"); + titles.add("date"); + /* + * titles which are not required for every .raw file + */ + if (interproInfo){ // only if the .raw file contains "IPR" boolean interproInfo becomes true + titles.add("interpro ID"); + titles.add("interpro name"); + } + + if (GOInfo){ + for (int i = 0; i < maxNrOfGOTerms; i++){ + titles.add("title"); + titles.add("GO number"); + titles.add("description"); + } + } + + myRow = sheet.createRow(0); + // show the headers in the table + for (int i = 0; i < titles.size() ; i ++){ + myCell = myRow.createCell(i); + myCell.setCellValue(titles.get(i)); // content of the headercell + formatHeader(); // color of the headercell + } + } + + /** + * method to find the number of Excel columns needed for the GO terms information (title, description and GO number)n + * The method looks for the maximal number of columns needed, because sometimes there are + * rows with no GO information and other rows which have GO information. + * As long as in one Excel sheet there is one row with GO information, the titles for the GO information + * have to be showed correctly. The method getNrOfGOTerms helps in this task. + */ + public void getMaxNrOfGOTerms(int nr){ + if (nr > maxNrOfGOTerms){ + maxNrOfGOTerms = nr; + } + } + + + /******************************* + * formatting of the Excel sheet + ******************************/ + /** + * method to apply all formatting to the Excel tabsheet containing the .raw data + */ + public void formatStyle(){ + setAutoFilters(); + autoSizeColumns(); + freezeRow(); + } + + /** + * make autofilters of the column headers in Excel + * ref: http://stackoverflow.com/questions/3114220/poi-auto-filter + */ + public void setAutoFilters(){ + if (GOInfo){ + /* + * for some reason, the first empty column contains an autofilter in the case there is + * GO information. However, only the columns which are not empty should have and autofilter. + * In order to avoid this small bug, the autofilter method was changed + * slightly: maxNr -1 instead of maxNr + * method: public XSSFAutoFilter setAutoFilter(CellRangeAddress, range); + */ + sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr-1)))+"1")); + } else { + sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr)))+"1")); + } + } + + /** + * set the column width automatically to the width of the content + */ + public void autoSizeColumns(){ + for(int column = 0; column < maxNr; column++){ + sheet.autoSizeColumn(column); + } + } + + /** + * helper method for setAutoFilters() and autoSizeColumns() to find the number + * of columns present in the tab sheet of the Excel file. + * Sometimes, there are columns which are empty in a certain row, but filled in + * another row. We always have to cope with the most extreme situation. Therefore, + * the maximum number of columns is determined. + * @param nr + */ + public void getMax(int nr){ + if (nr > maxNr){ + maxNr = nr; + } + } + + /** + * give the header cells a blue color and bold formatting + */ + public void formatHeader(){ + XSSFCellStyle style = wb.createCellStyle(); + XSSFFont font = wb.createFont(); + font.setColor(new XSSFColor(Color.BLUE)); + font.setBold(true); + style.setFont(font); + myCell.setCellStyle(style); + } + + /** + * freeze the header row + * method: public void createFreezePane(int colSplit, int rowSplit, intleftmostColumn, int topRow) + */ + public void freezeRow(){ + sheet.createFreezePane(0, 1, 0, 1); + } + + /** + * create a cell style that formats numbers in scientific notation (exponential) + * for the score column (index 8) + * Differentiate the text content (NA) from the exponential values via an if ... else + * because otherwise the formatting as exponential value is not OK. + */ + public void formatExponential(String s){ + if (s.equals("NA")){ + myCell.setCellValue("NA"); + } + else { + XSSFCellStyle cs = wb.createCellStyle(); + XSSFDataFormat df = wb.createDataFormat(); + cs.setDataFormat(df.getFormat("0.0E+0")); + myCell.setCellValue(Double.parseDouble(s)); + myCell.setCellStyle(cs); + } + } + + /********************* + * getters and setters + *********************/ + public void setColnr(int colnr) { + this.colnr = colnr; + } + + public int getColnr() { + return colnr; + } + + /** + * @param nrOfGOTerms the nrOfGOTerms to set + */ + public void setNrOfGOTerms(int nrOfGOTerms) { + this.maxNrOfGOTerms = nrOfGOTerms; + } + + /** + * @return the nrOfGOTerms + */ + public int getNrOfGOTerms() { + return maxNrOfGOTerms; + } +}