0
|
1 package be.cropdesign.iprscan;
|
|
/**
 * Converts the *.raw output file of the InterProScan program to an Excel file (*.xlsx).
 * @author: Katrien Bernaerts and Domantas Motiejunas
 * @date: 21/06/2012
 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium
 */
|
|
import java.awt.Color;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFColor;
import org.apache.poi.xssf.usermodel.XSSFDataFormat;
import org.apache.poi.xssf.usermodel.XSSFFont;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
25
|
|
26 public class RawToExcel {
|
|
27
|
|
28 private int colnr;
|
|
29 private int maxNr;
|
|
30 private int rownr;
|
|
31 private int maxNrOfGOTerms;
|
|
32
|
|
33 /**
|
|
34 * fields needed to generate Excel
|
|
35 */
|
|
36 private XSSFWorkbook wb;
|
|
37 private XSSFSheet sheet;
|
|
38 private XSSFRow myRow;
|
|
39 private XSSFCell myCell;
|
|
40 private String rawFile;
|
|
41 private boolean interproInfo; // this info is not standard in every .raw file, but can occur
|
|
42 private boolean GOInfo; // this info is not standard in every .raw file, but can occur
|
|
43
|
|
44 private ArrayList<String> titles;
|
|
45
|
|
46 /**
|
|
47 * constructor
|
|
48 * @param wb
|
|
49 * @param sheet3
|
|
50 * @param myRow
|
|
51 * @param myCell
|
|
52 * @param rawFile
|
|
53 */
|
|
54 public RawToExcel(XSSFWorkbook wb, XSSFSheet sheet3,XSSFRow myRow, XSSFCell myCell, String rawFile){
|
|
55 this.wb = wb;
|
|
56 this.sheet = sheet3;
|
|
57 this.myRow = myRow;
|
|
58 this.myCell = myCell;
|
|
59 this.rawFile = rawFile;
|
|
60 maxNr = 0;
|
|
61 maxNrOfGOTerms = 0;
|
|
62 interproInfo = false;
|
|
63 GOInfo = false;
|
|
64 titles = new ArrayList<String>();
|
|
65 parseRaw();
|
|
66 addHeaderTitles();
|
|
67 formatStyle();
|
|
68 }
|
|
69
|
|
70 /**********************************
|
|
71 * parsing of the .raw file content
|
|
72 **********************************/
|
|
73 /**
|
|
74 * method to parse the data in the .raw file with a BufferedReader/FileReader. The parsed data
|
|
75 * are written to Excel using the Apache POI library.
|
|
76 */
|
|
77 public void parseRaw() {
|
|
78 BufferedReader reader = null;
|
|
79 try {
|
|
80 reader = new BufferedReader(new FileReader(rawFile));
|
|
81
|
|
82 String line = reader.readLine();
|
|
83
|
|
84 /*
|
|
85 * start with rownr = 1 to write content because an empty row has to be reserved for the titles
|
|
86 * The title row cannot be filled yet at this point because we first have to determine how many
|
|
87 * columns are needed, and by consequence, how many titles. The problem is that different .raw files
|
|
88 * can have a different amount of columns.
|
|
89 * For each line read with the buffered reader, the number of columns is compared to the maxColnr,
|
|
90 * because sometimes not all lines of the same file have the same number of columns, e.g. because the
|
|
91 * last column is empty. However, if for some rows the last column is empty, a column title is still needed
|
|
92 * for the other rows. Therefore, the method getMaxColumns(colnr) is called for each line.
|
|
93 */
|
|
94 rownr = 1;
|
|
95 while(line != null) {
|
|
96 String[] splits = line.split("\t"); // tab delimited file
|
|
97 myRow = sheet.createRow(rownr);
|
|
98 colnr = 0;
|
|
99 for (String string : splits){
|
|
100 myCell = myRow.createCell(colnr);
|
|
101 myCell.setCellValue(string);
|
|
102
|
|
103 /*
|
|
104 * interProScan info is not present in all raw files. For the creation of the right
|
|
105 * header titles, it is important to know whether the parsed raw file contained
|
|
106 * interProScan info
|
|
107 */
|
|
108 if (string.contains("IPR")){
|
|
109 interproInfo = true;
|
|
110 }
|
|
111
|
|
112 /*
|
|
113 * format the cell content as Integer for the columns protein length (colnr=2),
|
|
114 * start (colnr=6) or end (colnr=7). To know which input only contains integers,
|
|
115 * a regex is used. If only numbers or spaces are found in the input string,
|
|
116 * and if the input string is not empty, the corresponding Excel cell is
|
|
117 * formatted as Integer.
|
|
118 * If the cell content is not formatted as number, sorting etc. via the filters
|
|
119 * in the headers does not work correct.
|
|
120 */
|
|
121 if (checkRegex("^([0-9]+\\s*)*$", string)){
|
|
122 myCell.setCellValue(Integer.parseInt(string));
|
|
123 }
|
|
124
|
|
125 /*
|
|
126 * create a cell style that formats numbers in scientific notation (exponential)
|
|
127 * for the score column (index 8)
|
|
128 */
|
|
129 if (checkRegex("^[-+]?([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+))*$", string)){
|
|
130 formatExponential(string);
|
|
131 }
|
|
132 /*
|
|
133 * split up the line with GO classification information further such
|
|
134 * that the splitted line of GO information can be stored in different
|
|
135 * Excel cells instead of all information concatenated into one cell (like it is
|
|
136 * in the original .raw file generated by iprscan). First the information
|
|
137 * in the splitted line is stored in a double array. In a second step (at the end of the
|
|
138 * current method, the double array content is written to Excel.
|
|
139 */
|
|
140 if (string.contains("GO:")){
|
|
141 GOInfo = true;
|
|
142 splitGOTerms(string);
|
|
143 }
|
|
144 getMax(colnr);
|
|
145 colnr++;
|
|
146 }
|
|
147 line = reader.readLine();
|
|
148 rownr++;
|
|
149 myRow = sheet.createRow(rownr);
|
|
150 }
|
|
151 } catch (FileNotFoundException e) {
|
|
152 System.err.println("The .raw file cannot be found.");
|
|
153 } catch (IOException e) {
|
|
154 System.err.println("An input/output exception occurred while reading the .raw file.");
|
|
155 }
|
|
156 finally {
|
|
157 if (reader != null) {
|
|
158 try {
|
|
159 reader.close();
|
|
160 }
|
|
161 catch (IOException e){}
|
|
162 }
|
|
163 }
|
|
164 }
|
|
165
|
|
166 /**
|
|
167 * Helper method for parseRaw()
|
|
168 * Split a string containing GO information. A typical string looks like:
|
|
169 * "Molecular Function: sequence-specific DNA binding transcription factor activity (GO:0003700), Cellular Component: nucleus (GO:0005634), Biological Process: regulation of transcription, DNA-dependent (GO:0006355), Molecular Function: sequence-specific DNA binding (GO:0043565)"
|
|
170 * or in more general terms:
|
|
171 * "Title1: description1 (GO number1), Title2: description2 (GO number2), Title3: description3 (GO number3)"
|
|
172 * The string should be splitted in three parts: title, description and GO number.
|
|
173 * In fact we are dealing with comma delimited strings, but split may not happen at every comma,
|
|
174 * only when comma is preceded by (
|
|
175 * Split may for example not happen at the comma in case of "Molecular Function: transferase activity,
|
|
176 * transferring phosphorus-containing groups (GO:0016772)"
|
|
177 * In order to assure that the splitting occurs at the right place, the comma's at the places where splitting has to
|
|
178 * occur are replaced by the unique splitting character ;
|
|
179 * @return
|
|
180 */
|
|
181 public void splitGOTerms(String string){
|
|
182 if (string != null &&!string.isEmpty() && !string.trim().isEmpty()){
|
|
183
|
|
184 String modifiedString = string.replace("),", ");");
|
|
185 /*
|
|
186 * the modifiedString is splitted at the ;
|
|
187 */
|
|
188 String[] splitsClassification = modifiedString.split("; ");
|
|
189 int numberOfGoTerms = splitsClassification.length;
|
|
190 getMaxNrOfGOTerms(numberOfGoTerms);
|
|
191 for (int i = 0; i < splitsClassification.length; i++){
|
|
192 myCell = myRow.createCell(colnr);
|
|
193 myCell.setCellValue(splitsClassification[i].substring(0, splitsClassification[i].indexOf(':'))); //title
|
|
194 colnr++;
|
|
195 myCell = myRow.createCell(colnr);
|
|
196 myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf('(')+1, splitsClassification[i].indexOf(')'))); // GO term
|
|
197 colnr++;
|
|
198 myCell = myRow.createCell(colnr);
|
|
199 myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf(':')+2, splitsClassification[i].indexOf('('))); // description)
|
|
200 colnr++;
|
|
201 }
|
|
202 }
|
|
203 }
|
|
204
|
|
205 /**
|
|
206 * Helper method for parseRaw()
|
|
207 * Check whether a certain input string (stringToMatch) matches a certain regular expression.
|
|
208 * @param regex
|
|
209 * @param stringToMatch
|
|
210 * @return
|
|
211 */
|
|
212 public boolean checkRegex(String regex, String stringToMatch){
|
|
213 Pattern p = Pattern.compile(regex);
|
|
214 Matcher m = p.matcher(stringToMatch);
|
|
215 if(m.matches() && stringToMatch != null &&!stringToMatch.isEmpty() && !stringToMatch.trim().isEmpty()){
|
|
216 return true;
|
|
217 } else {
|
|
218 return false;
|
|
219 }
|
|
220 }
|
|
221
|
|
222 /**********************************
|
|
223 * header titles of the Excel sheet
|
|
224 *********************************/
|
|
225 /**
|
|
226 * the header titles are generated in the first row (index 0) of the spreadsheet
|
|
227 * All the potential column titles are added to the titles ArrayList. The .raw file always contains
|
|
228 * some fixed part (standard titles), but can also contain more information (titles which are not
|
|
229 * required for every .raw file).
|
|
230 * This last category of headers is only added in case the information occurs in the .raw file.
|
|
231 */
|
|
232 public void addHeaderTitles(){
|
|
233 /*
|
|
234 * standard titles
|
|
235 */
|
|
236 titles.add("protein ID");
|
|
237 titles.add("protein crc64");
|
|
238 titles.add("protein length");
|
|
239 titles.add("match dbname");
|
|
240 titles.add("classification id");
|
|
241 titles.add("classification description");
|
|
242 titles.add("start");
|
|
243 titles.add("end");
|
|
244 titles.add("score");
|
|
245 titles.add("status");
|
|
246 titles.add("date");
|
|
247 /*
|
|
248 * titles which are not required for every .raw file
|
|
249 */
|
|
250 if (interproInfo){ // only if the .raw file contains "IPR" boolean interproInfo becomes true
|
|
251 titles.add("interpro ID");
|
|
252 titles.add("interpro name");
|
|
253 }
|
|
254
|
|
255 if (GOInfo){
|
|
256 for (int i = 0; i < maxNrOfGOTerms; i++){
|
|
257 titles.add("title");
|
|
258 titles.add("GO number");
|
|
259 titles.add("description");
|
|
260 }
|
|
261 }
|
|
262
|
|
263 myRow = sheet.createRow(0);
|
|
264 // show the headers in the table
|
|
265 for (int i = 0; i < titles.size() ; i ++){
|
|
266 myCell = myRow.createCell(i);
|
|
267 myCell.setCellValue(titles.get(i)); // content of the headercell
|
|
268 formatHeader(); // color of the headercell
|
|
269 }
|
|
270 }
|
|
271
|
|
272 /**
|
|
273 * method to find the number of Excel columns needed for the GO terms information (title, description and GO number)n
|
|
274 * The method looks for the maximal number of columns needed, because sometimes there are
|
|
275 * rows with no GO information and other rows which have GO information.
|
|
276 * As long as in one Excel sheet there is one row with GO information, the titles for the GO information
|
|
277 * have to be showed correctly. The method getNrOfGOTerms helps in this task.
|
|
278 */
|
|
279 public void getMaxNrOfGOTerms(int nr){
|
|
280 if (nr > maxNrOfGOTerms){
|
|
281 maxNrOfGOTerms = nr;
|
|
282 }
|
|
283 }
|
|
284
|
|
285
|
|
286 /*******************************
|
|
287 * formatting of the Excel sheet
|
|
288 ******************************/
|
|
289 /**
|
|
290 * method to apply all formatting to the Excel tabsheet containing the .raw data
|
|
291 */
|
|
292 public void formatStyle(){
|
|
293 setAutoFilters();
|
|
294 autoSizeColumns();
|
|
295 freezeRow();
|
|
296 }
|
|
297
|
|
298 /**
|
|
299 * make autofilters of the column headers in Excel
|
|
300 * ref: http://stackoverflow.com/questions/3114220/poi-auto-filter
|
|
301 */
|
|
302 public void setAutoFilters(){
|
|
303 if (GOInfo){
|
|
304 /*
|
|
305 * for some reason, the first empty column contains an autofilter in the case there is
|
|
306 * GO information. However, only the columns which are not empty should have and autofilter.
|
|
307 * In order to avoid this small bug, the autofilter method was changed
|
|
308 * slightly: maxNr -1 instead of maxNr
|
|
309 * method: public XSSFAutoFilter setAutoFilter(CellRangeAddress, range);
|
|
310 */
|
|
311 sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr-1)))+"1"));
|
|
312 } else {
|
|
313 sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr)))+"1"));
|
|
314 }
|
|
315 }
|
|
316
|
|
317 /**
|
|
318 * set the column width automatically to the width of the content
|
|
319 */
|
|
320 public void autoSizeColumns(){
|
|
321 for(int column = 0; column < maxNr; column++){
|
|
322 sheet.autoSizeColumn(column);
|
|
323 }
|
|
324 }
|
|
325
|
|
326 /**
|
|
327 * helper method for setAutoFilters() and autoSizeColumns() to find the number
|
|
328 * of columns present in the tab sheet of the Excel file.
|
|
329 * Sometimes, there are columns which are empty in a certain row, but filled in
|
|
330 * another row. We always have to cope with the most extreme situation. Therefore,
|
|
331 * the maximum number of columns is determined.
|
|
332 * @param nr
|
|
333 */
|
|
334 public void getMax(int nr){
|
|
335 if (nr > maxNr){
|
|
336 maxNr = nr;
|
|
337 }
|
|
338 }
|
|
339
|
|
340 /**
|
|
341 * give the header cells a blue color and bold formatting
|
|
342 */
|
|
343 public void formatHeader(){
|
|
344 XSSFCellStyle style = wb.createCellStyle();
|
|
345 XSSFFont font = wb.createFont();
|
|
346 font.setColor(new XSSFColor(Color.BLUE));
|
|
347 font.setBold(true);
|
|
348 style.setFont(font);
|
|
349 myCell.setCellStyle(style);
|
|
350 }
|
|
351
|
|
352 /**
|
|
353 * freeze the header row
|
|
354 * method: public void createFreezePane(int colSplit, int rowSplit, intleftmostColumn, int topRow)
|
|
355 */
|
|
356 public void freezeRow(){
|
|
357 sheet.createFreezePane(0, 1, 0, 1);
|
|
358 }
|
|
359
|
|
360 /**
|
|
361 * create a cell style that formats numbers in scientific notation (exponential)
|
|
362 * for the score column (index 8)
|
|
363 * Differentiate the text content (NA) from the exponential values via an if ... else
|
|
364 * because otherwise the formatting as exponential value is not OK.
|
|
365 */
|
|
366 public void formatExponential(String s){
|
|
367 if (s.equals("NA")){
|
|
368 myCell.setCellValue("NA");
|
|
369 }
|
|
370 else {
|
|
371 XSSFCellStyle cs = wb.createCellStyle();
|
|
372 XSSFDataFormat df = wb.createDataFormat();
|
|
373 cs.setDataFormat(df.getFormat("0.0E+0"));
|
|
374 myCell.setCellValue(Double.parseDouble(s));
|
|
375 myCell.setCellStyle(cs);
|
|
376 }
|
|
377 }
|
|
378
|
|
379 /*********************
|
|
380 * getters and setters
|
|
381 *********************/
|
|
382 public void setColnr(int colnr) {
|
|
383 this.colnr = colnr;
|
|
384 }
|
|
385
|
|
386 public int getColnr() {
|
|
387 return colnr;
|
|
388 }
|
|
389
|
|
390 /**
|
|
391 * @param nrOfGOTerms the nrOfGOTerms to set
|
|
392 */
|
|
393 public void setNrOfGOTerms(int nrOfGOTerms) {
|
|
394 this.maxNrOfGOTerms = nrOfGOTerms;
|
|
395 }
|
|
396
|
|
397 /**
|
|
398 * @return the nrOfGOTerms
|
|
399 */
|
|
400 public int getNrOfGOTerms() {
|
|
401 return maxNrOfGOTerms;
|
|
402 }
|
|
403 }
|