Mercurial > repos > basfplant > interproscan_to_excel
view export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/RawToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author | basfplant |
---|---|
date | Tue, 05 Mar 2013 04:00:19 -0500 |
parents | |
children |
line wrap: on
line source
package be.cropdesign.iprscan; /** * Converts the *.raw output file of the InterProScan program to an Excel file (*.xlsx) * @author: Katrien Bernaerts and Domantas Motiejunas * @date: 21/06/2012 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium */ import java.awt.Color; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFColor; import org.apache.poi.xssf.usermodel.XSSFDataFormat; import org.apache.poi.xssf.usermodel.XSSFFont; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; public class RawToExcel { private int colnr; private int maxNr; private int rownr; private int maxNrOfGOTerms; /** * fields needed to generate Excel */ private XSSFWorkbook wb; private XSSFSheet sheet; private XSSFRow myRow; private XSSFCell myCell; private String rawFile; private boolean interproInfo; // this info is not standard in every .raw file, but can occur private boolean GOInfo; // this info is not standard in every .raw file, but can occur private ArrayList<String> titles; /** * constructor * @param wb * @param sheet3 * @param myRow * @param myCell * @param rawFile */ public RawToExcel(XSSFWorkbook wb, XSSFSheet sheet3,XSSFRow myRow, XSSFCell myCell, String rawFile){ this.wb = wb; this.sheet = sheet3; this.myRow = myRow; this.myCell = myCell; this.rawFile = rawFile; maxNr = 0; maxNrOfGOTerms = 0; interproInfo = false; GOInfo = false; titles = new ArrayList<String>(); parseRaw(); addHeaderTitles(); formatStyle(); } /********************************** * parsing of the .raw file content **********************************/ /** * method to parse the data in the .raw file with a BufferedReader/FileReader. The parsed data * are written to Excel using the Apache POI library. */ public void parseRaw() { BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(rawFile)); String line = reader.readLine(); /* * start with rownr = 1 to write content because an empty row has to be reserved for the titles * The title row cannot be filled yet at this point because we first have to determine how many * columns are needed, and by consequence, how many titles. The problem is that different .raw files * can have a different amount of columns. * For each line read with the buffered reader, the number of columns is compared to the maxColnr, * because sometimes not all lines of the same file have the same number of columns, e.g. because the * last column is empty. However, if for some rows the last column is empty, a column title is still needed * for the other rows. Therefore, the method getMaxColumns(colnr) is called for each line. */ rownr = 1; while(line != null) { String[] splits = line.split("\t"); // tab delimited file myRow = sheet.createRow(rownr); colnr = 0; for (String string : splits){ myCell = myRow.createCell(colnr); myCell.setCellValue(string); /* * interProScan info is not present in all raw files. For the creation of the right * header titles, it is important to know whether the parsed raw file contained * interProScan info */ if (string.contains("IPR")){ interproInfo = true; } /* * format the cell content as Integer for the columns protein length (colnr=2), * start (colnr=6) or end (colnr=7). To know which input only contains integers, * a regex is used. If only numbers or spaces are found in the input string, * and if the input string is not empty, the corresponding Excel cell is * formatted as Integer. * If the cell content is not formatted as number, sorting etc. via the filters * in the headers does not work correct. */ if (checkRegex("^([0-9]+\\s*)*$", string)){ myCell.setCellValue(Integer.parseInt(string)); } /* * create a cell style that formats numbers in scientific notation (exponential) * for the score column (index 8) */ if (checkRegex("^[-+]?([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+))*$", string)){ formatExponential(string); } /* * split up the line with GO classification information further such * that the splitted line of GO information can be stored in different * Excel cells instead of all information concatenated into one cell (like it is * in the original .raw file generated by iprscan). First the information * in the splitted line is stored in a double array. In a second step (at the end of the * current method, the double array content is written to Excel. */ if (string.contains("GO:")){ GOInfo = true; splitGOTerms(string); } getMax(colnr); colnr++; } line = reader.readLine(); rownr++; myRow = sheet.createRow(rownr); } } catch (FileNotFoundException e) { System.err.println("The .raw file cannot be found."); } catch (IOException e) { System.err.println("An input/output exception occurred while reading the .raw file."); } finally { if (reader != null) { try { reader.close(); } catch (IOException e){} } } } /** * Helper method for parseRaw() * Split a string containing GO information. A typical string looks like: * "Molecular Function: sequence-specific DNA binding transcription factor activity (GO:0003700), Cellular Component: nucleus (GO:0005634), Biological Process: regulation of transcription, DNA-dependent (GO:0006355), Molecular Function: sequence-specific DNA binding (GO:0043565)" * or in more general terms: * "Title1: description1 (GO number1), Title2: description2 (GO number2), Title3: description3 (GO number3)" * The string should be splitted in three parts: title, description and GO number. * In fact we are dealing with comma delimited strings, but split may not happen at every comma, * only when comma is preceded by ( * Split may for example not happen at the comma in case of "Molecular Function: transferase activity, * transferring phosphorus-containing groups (GO:0016772)" * In order to assure that the splitting occurs at the right place, the comma's at the places where splitting has to * occur are replaced by the unique splitting character ; * @return */ public void splitGOTerms(String string){ if (string != null &&!string.isEmpty() && !string.trim().isEmpty()){ String modifiedString = string.replace("),", ");"); /* * the modifiedString is splitted at the ; */ String[] splitsClassification = modifiedString.split("; "); int numberOfGoTerms = splitsClassification.length; getMaxNrOfGOTerms(numberOfGoTerms); for (int i = 0; i < splitsClassification.length; i++){ myCell = myRow.createCell(colnr); myCell.setCellValue(splitsClassification[i].substring(0, splitsClassification[i].indexOf(':'))); //title colnr++; myCell = myRow.createCell(colnr); myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf('(')+1, splitsClassification[i].indexOf(')'))); // GO term colnr++; myCell = myRow.createCell(colnr); myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf(':')+2, splitsClassification[i].indexOf('('))); // description) colnr++; } } } /** * Helper method for parseRaw() * Check whether a certain input string (stringToMatch) matches a certain regular expression. * @param regex * @param stringToMatch * @return */ public boolean checkRegex(String regex, String stringToMatch){ Pattern p = Pattern.compile(regex); Matcher m = p.matcher(stringToMatch); if(m.matches() && stringToMatch != null &&!stringToMatch.isEmpty() && !stringToMatch.trim().isEmpty()){ return true; } else { return false; } } /********************************** * header titles of the Excel sheet *********************************/ /** * the header titles are generated in the first row (index 0) of the spreadsheet * All the potential column titles are added to the titles ArrayList. The .raw file always contains * some fixed part (standard titles), but can also contain more information (titles which are not * required for every .raw file). * This last category of headers is only added in case the information occurs in the .raw file. */ public void addHeaderTitles(){ /* * standard titles */ titles.add("protein ID"); titles.add("protein crc64"); titles.add("protein length"); titles.add("match dbname"); titles.add("classification id"); titles.add("classification description"); titles.add("start"); titles.add("end"); titles.add("score"); titles.add("status"); titles.add("date"); /* * titles which are not required for every .raw file */ if (interproInfo){ // only if the .raw file contains "IPR" boolean interproInfo becomes true titles.add("interpro ID"); titles.add("interpro name"); } if (GOInfo){ for (int i = 0; i < maxNrOfGOTerms; i++){ titles.add("title"); titles.add("GO number"); titles.add("description"); } } myRow = sheet.createRow(0); // show the headers in the table for (int i = 0; i < titles.size() ; i ++){ myCell = myRow.createCell(i); myCell.setCellValue(titles.get(i)); // content of the headercell formatHeader(); // color of the headercell } } /** * method to find the number of Excel columns needed for the GO terms information (title, description and GO number)n * The method looks for the maximal number of columns needed, because sometimes there are * rows with no GO information and other rows which have GO information. * As long as in one Excel sheet there is one row with GO information, the titles for the GO information * have to be showed correctly. The method getNrOfGOTerms helps in this task. */ public void getMaxNrOfGOTerms(int nr){ if (nr > maxNrOfGOTerms){ maxNrOfGOTerms = nr; } } /******************************* * formatting of the Excel sheet ******************************/ /** * method to apply all formatting to the Excel tabsheet containing the .raw data */ public void formatStyle(){ setAutoFilters(); autoSizeColumns(); freezeRow(); } /** * make autofilters of the column headers in Excel * ref: http://stackoverflow.com/questions/3114220/poi-auto-filter */ public void setAutoFilters(){ if (GOInfo){ /* * for some reason, the first empty column contains an autofilter in the case there is * GO information. However, only the columns which are not empty should have and autofilter. * In order to avoid this small bug, the autofilter method was changed * slightly: maxNr -1 instead of maxNr * method: public XSSFAutoFilter setAutoFilter(CellRangeAddress, range); */ sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr-1)))+"1")); } else { sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr)))+"1")); } } /** * set the column width automatically to the width of the content */ public void autoSizeColumns(){ for(int column = 0; column < maxNr; column++){ sheet.autoSizeColumn(column); } } /** * helper method for setAutoFilters() and autoSizeColumns() to find the number * of columns present in the tab sheet of the Excel file. * Sometimes, there are columns which are empty in a certain row, but filled in * another row. We always have to cope with the most extreme situation. Therefore, * the maximum number of columns is determined. * @param nr */ public void getMax(int nr){ if (nr > maxNr){ maxNr = nr; } } /** * give the header cells a blue color and bold formatting */ public void formatHeader(){ XSSFCellStyle style = wb.createCellStyle(); XSSFFont font = wb.createFont(); font.setColor(new XSSFColor(Color.BLUE)); font.setBold(true); style.setFont(font); myCell.setCellStyle(style); } /** * freeze the header row * method: public void createFreezePane(int colSplit, int rowSplit, intleftmostColumn, int topRow) */ public void freezeRow(){ sheet.createFreezePane(0, 1, 0, 1); } /** * create a cell style that formats numbers in scientific notation (exponential) * for the score column (index 8) * Differentiate the text content (NA) from the exponential values via an if ... else * because otherwise the formatting as exponential value is not OK. */ public void formatExponential(String s){ if (s.equals("NA")){ myCell.setCellValue("NA"); } else { XSSFCellStyle cs = wb.createCellStyle(); XSSFDataFormat df = wb.createDataFormat(); cs.setDataFormat(df.getFormat("0.0E+0")); myCell.setCellValue(Double.parseDouble(s)); myCell.setCellStyle(cs); } } /********************* * getters and setters *********************/ public void setColnr(int colnr) { this.colnr = colnr; } public int getColnr() { return colnr; } /** * @param nrOfGOTerms the nrOfGOTerms to set */ public void setNrOfGOTerms(int nrOfGOTerms) { this.maxNrOfGOTerms = nrOfGOTerms; } /** * @return the nrOfGOTerms */ public int getNrOfGOTerms() { return maxNrOfGOTerms; } }