Mercurial > repos > basfplant > interproscan_to_excel
comparison export_iprscan_to_Excel/source_files/iprscanToExcel_v20/src/be/cropdesign/iprscan/RawToExcel.java @ 0:a9762cd6e2e3 draft default tip
Uploaded
author | basfplant |
---|---|
date | Tue, 05 Mar 2013 04:00:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a9762cd6e2e3 |
---|---|
1 package be.cropdesign.iprscan; | |
2 /** | |
3 * Converts the *.raw output file of the InterProScan program to an Excel file (*.xlsx) | |
4 * @author: Katrien Bernaerts and Domantas Motiejunas | |
5 * @date: 21/06/2012 | |
6 * @affiliation: CropDesign N.V., a BASF Plant Science Company - Technologiepark 3, 9052 Zwijnaarde - Belgium | |
7 */ | |
8 import java.awt.Color; | |
9 import java.io.BufferedReader; | |
10 import java.io.FileNotFoundException; | |
11 import java.io.FileReader; | |
12 import java.io.IOException; | |
13 import java.util.ArrayList; | |
14 import java.util.regex.Matcher; | |
15 import java.util.regex.Pattern; | |
16 | |
17 import org.apache.poi.xssf.usermodel.XSSFCell; | |
18 import org.apache.poi.xssf.usermodel.XSSFCellStyle; | |
19 import org.apache.poi.xssf.usermodel.XSSFColor; | |
20 import org.apache.poi.xssf.usermodel.XSSFDataFormat; | |
21 import org.apache.poi.xssf.usermodel.XSSFFont; | |
22 import org.apache.poi.xssf.usermodel.XSSFRow; | |
23 import org.apache.poi.xssf.usermodel.XSSFSheet; | |
24 import org.apache.poi.xssf.usermodel.XSSFWorkbook; | |
25 | |
26 public class RawToExcel { | |
27 | |
28 private int colnr; | |
29 private int maxNr; | |
30 private int rownr; | |
31 private int maxNrOfGOTerms; | |
32 | |
33 /** | |
34 * fields needed to generate Excel | |
35 */ | |
36 private XSSFWorkbook wb; | |
37 private XSSFSheet sheet; | |
38 private XSSFRow myRow; | |
39 private XSSFCell myCell; | |
40 private String rawFile; | |
41 private boolean interproInfo; // this info is not standard in every .raw file, but can occur | |
42 private boolean GOInfo; // this info is not standard in every .raw file, but can occur | |
43 | |
44 private ArrayList<String> titles; | |
45 | |
46 /** | |
47 * constructor | |
48 * @param wb | |
49 * @param sheet3 | |
50 * @param myRow | |
51 * @param myCell | |
52 * @param rawFile | |
53 */ | |
54 public RawToExcel(XSSFWorkbook wb, XSSFSheet sheet3,XSSFRow myRow, XSSFCell myCell, String rawFile){ | |
55 this.wb = wb; | |
56 this.sheet = sheet3; | |
57 this.myRow = myRow; | |
58 this.myCell = myCell; | |
59 this.rawFile = rawFile; | |
60 maxNr = 0; | |
61 maxNrOfGOTerms = 0; | |
62 interproInfo = false; | |
63 GOInfo = false; | |
64 titles = new ArrayList<String>(); | |
65 parseRaw(); | |
66 addHeaderTitles(); | |
67 formatStyle(); | |
68 } | |
69 | |
70 /********************************** | |
71 * parsing of the .raw file content | |
72 **********************************/ | |
73 /** | |
74 * method to parse the data in the .raw file with a BufferedReader/FileReader. The parsed data | |
75 * are written to Excel using the Apache POI library. | |
76 */ | |
77 public void parseRaw() { | |
78 BufferedReader reader = null; | |
79 try { | |
80 reader = new BufferedReader(new FileReader(rawFile)); | |
81 | |
82 String line = reader.readLine(); | |
83 | |
84 /* | |
85 * start with rownr = 1 to write content because an empty row has to be reserved for the titles | |
86 * The title row cannot be filled yet at this point because we first have to determine how many | |
87 * columns are needed, and by consequence, how many titles. The problem is that different .raw files | |
88 * can have a different amount of columns. | |
89 * For each line read with the buffered reader, the number of columns is compared to the maxColnr, | |
90 * because sometimes not all lines of the same file have the same number of columns, e.g. because the | |
91 * last column is empty. However, if for some rows the last column is empty, a column title is still needed | |
92 * for the other rows. Therefore, the method getMaxColumns(colnr) is called for each line. | |
93 */ | |
94 rownr = 1; | |
95 while(line != null) { | |
96 String[] splits = line.split("\t"); // tab delimited file | |
97 myRow = sheet.createRow(rownr); | |
98 colnr = 0; | |
99 for (String string : splits){ | |
100 myCell = myRow.createCell(colnr); | |
101 myCell.setCellValue(string); | |
102 | |
103 /* | |
104 * interProScan info is not present in all raw files. For the creation of the right | |
105 * header titles, it is important to know whether the parsed raw file contained | |
106 * interProScan info | |
107 */ | |
108 if (string.contains("IPR")){ | |
109 interproInfo = true; | |
110 } | |
111 | |
112 /* | |
113 * format the cell content as Integer for the columns protein length (colnr=2), | |
114 * start (colnr=6) or end (colnr=7). To know which input only contains integers, | |
115 * a regex is used. If only numbers or spaces are found in the input string, | |
116 * and if the input string is not empty, the corresponding Excel cell is | |
117 * formatted as Integer. | |
118 * If the cell content is not formatted as number, sorting etc. via the filters | |
119 * in the headers does not work correct. | |
120 */ | |
121 if (checkRegex("^([0-9]+\\s*)*$", string)){ | |
122 myCell.setCellValue(Integer.parseInt(string)); | |
123 } | |
124 | |
125 /* | |
126 * create a cell style that formats numbers in scientific notation (exponential) | |
127 * for the score column (index 8) | |
128 */ | |
129 if (checkRegex("^[-+]?([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+))*$", string)){ | |
130 formatExponential(string); | |
131 } | |
132 /* | |
133 * split up the line with GO classification information further such | |
134 * that the splitted line of GO information can be stored in different | |
135 * Excel cells instead of all information concatenated into one cell (like it is | |
136 * in the original .raw file generated by iprscan). First the information | |
137 * in the splitted line is stored in a double array. In a second step (at the end of the | |
138 * current method, the double array content is written to Excel. | |
139 */ | |
140 if (string.contains("GO:")){ | |
141 GOInfo = true; | |
142 splitGOTerms(string); | |
143 } | |
144 getMax(colnr); | |
145 colnr++; | |
146 } | |
147 line = reader.readLine(); | |
148 rownr++; | |
149 myRow = sheet.createRow(rownr); | |
150 } | |
151 } catch (FileNotFoundException e) { | |
152 System.err.println("The .raw file cannot be found."); | |
153 } catch (IOException e) { | |
154 System.err.println("An input/output exception occurred while reading the .raw file."); | |
155 } | |
156 finally { | |
157 if (reader != null) { | |
158 try { | |
159 reader.close(); | |
160 } | |
161 catch (IOException e){} | |
162 } | |
163 } | |
164 } | |
165 | |
166 /** | |
167 * Helper method for parseRaw() | |
168 * Split a string containing GO information. A typical string looks like: | |
169 * "Molecular Function: sequence-specific DNA binding transcription factor activity (GO:0003700), Cellular Component: nucleus (GO:0005634), Biological Process: regulation of transcription, DNA-dependent (GO:0006355), Molecular Function: sequence-specific DNA binding (GO:0043565)" | |
170 * or in more general terms: | |
171 * "Title1: description1 (GO number1), Title2: description2 (GO number2), Title3: description3 (GO number3)" | |
172 * The string should be splitted in three parts: title, description and GO number. | |
173 * In fact we are dealing with comma delimited strings, but split may not happen at every comma, | |
174 * only when comma is preceded by ( | |
175 * Split may for example not happen at the comma in case of "Molecular Function: transferase activity, | |
176 * transferring phosphorus-containing groups (GO:0016772)" | |
177 * In order to assure that the splitting occurs at the right place, the comma's at the places where splitting has to | |
178 * occur are replaced by the unique splitting character ; | |
179 * @return | |
180 */ | |
181 public void splitGOTerms(String string){ | |
182 if (string != null &&!string.isEmpty() && !string.trim().isEmpty()){ | |
183 | |
184 String modifiedString = string.replace("),", ");"); | |
185 /* | |
186 * the modifiedString is splitted at the ; | |
187 */ | |
188 String[] splitsClassification = modifiedString.split("; "); | |
189 int numberOfGoTerms = splitsClassification.length; | |
190 getMaxNrOfGOTerms(numberOfGoTerms); | |
191 for (int i = 0; i < splitsClassification.length; i++){ | |
192 myCell = myRow.createCell(colnr); | |
193 myCell.setCellValue(splitsClassification[i].substring(0, splitsClassification[i].indexOf(':'))); //title | |
194 colnr++; | |
195 myCell = myRow.createCell(colnr); | |
196 myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf('(')+1, splitsClassification[i].indexOf(')'))); // GO term | |
197 colnr++; | |
198 myCell = myRow.createCell(colnr); | |
199 myCell.setCellValue(splitsClassification[i].substring(splitsClassification[i].indexOf(':')+2, splitsClassification[i].indexOf('('))); // description) | |
200 colnr++; | |
201 } | |
202 } | |
203 } | |
204 | |
205 /** | |
206 * Helper method for parseRaw() | |
207 * Check whether a certain input string (stringToMatch) matches a certain regular expression. | |
208 * @param regex | |
209 * @param stringToMatch | |
210 * @return | |
211 */ | |
212 public boolean checkRegex(String regex, String stringToMatch){ | |
213 Pattern p = Pattern.compile(regex); | |
214 Matcher m = p.matcher(stringToMatch); | |
215 if(m.matches() && stringToMatch != null &&!stringToMatch.isEmpty() && !stringToMatch.trim().isEmpty()){ | |
216 return true; | |
217 } else { | |
218 return false; | |
219 } | |
220 } | |
221 | |
222 /********************************** | |
223 * header titles of the Excel sheet | |
224 *********************************/ | |
225 /** | |
226 * the header titles are generated in the first row (index 0) of the spreadsheet | |
227 * All the potential column titles are added to the titles ArrayList. The .raw file always contains | |
228 * some fixed part (standard titles), but can also contain more information (titles which are not | |
229 * required for every .raw file). | |
230 * This last category of headers is only added in case the information occurs in the .raw file. | |
231 */ | |
232 public void addHeaderTitles(){ | |
233 /* | |
234 * standard titles | |
235 */ | |
236 titles.add("protein ID"); | |
237 titles.add("protein crc64"); | |
238 titles.add("protein length"); | |
239 titles.add("match dbname"); | |
240 titles.add("classification id"); | |
241 titles.add("classification description"); | |
242 titles.add("start"); | |
243 titles.add("end"); | |
244 titles.add("score"); | |
245 titles.add("status"); | |
246 titles.add("date"); | |
247 /* | |
248 * titles which are not required for every .raw file | |
249 */ | |
250 if (interproInfo){ // only if the .raw file contains "IPR" boolean interproInfo becomes true | |
251 titles.add("interpro ID"); | |
252 titles.add("interpro name"); | |
253 } | |
254 | |
255 if (GOInfo){ | |
256 for (int i = 0; i < maxNrOfGOTerms; i++){ | |
257 titles.add("title"); | |
258 titles.add("GO number"); | |
259 titles.add("description"); | |
260 } | |
261 } | |
262 | |
263 myRow = sheet.createRow(0); | |
264 // show the headers in the table | |
265 for (int i = 0; i < titles.size() ; i ++){ | |
266 myCell = myRow.createCell(i); | |
267 myCell.setCellValue(titles.get(i)); // content of the headercell | |
268 formatHeader(); // color of the headercell | |
269 } | |
270 } | |
271 | |
272 /** | |
273 * method to find the number of Excel columns needed for the GO terms information (title, description and GO number)n | |
274 * The method looks for the maximal number of columns needed, because sometimes there are | |
275 * rows with no GO information and other rows which have GO information. | |
276 * As long as in one Excel sheet there is one row with GO information, the titles for the GO information | |
277 * have to be showed correctly. The method getNrOfGOTerms helps in this task. | |
278 */ | |
279 public void getMaxNrOfGOTerms(int nr){ | |
280 if (nr > maxNrOfGOTerms){ | |
281 maxNrOfGOTerms = nr; | |
282 } | |
283 } | |
284 | |
285 | |
286 /******************************* | |
287 * formatting of the Excel sheet | |
288 ******************************/ | |
289 /** | |
290 * method to apply all formatting to the Excel tabsheet containing the .raw data | |
291 */ | |
292 public void formatStyle(){ | |
293 setAutoFilters(); | |
294 autoSizeColumns(); | |
295 freezeRow(); | |
296 } | |
297 | |
298 /** | |
299 * make autofilters of the column headers in Excel | |
300 * ref: http://stackoverflow.com/questions/3114220/poi-auto-filter | |
301 */ | |
302 public void setAutoFilters(){ | |
303 if (GOInfo){ | |
304 /* | |
305 * for some reason, the first empty column contains an autofilter in the case there is | |
306 * GO information. However, only the columns which are not empty should have and autofilter. | |
307 * In order to avoid this small bug, the autofilter method was changed | |
308 * slightly: maxNr -1 instead of maxNr | |
309 * method: public XSSFAutoFilter setAutoFilter(CellRangeAddress, range); | |
310 */ | |
311 sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr-1)))+"1")); | |
312 } else { | |
313 sheet.setAutoFilter(org.apache.poi.ss.util.CellRangeAddress.valueOf("A1:"+ (Character.toString((char)( 65+maxNr)))+"1")); | |
314 } | |
315 } | |
316 | |
317 /** | |
318 * set the column width automatically to the width of the content | |
319 */ | |
320 public void autoSizeColumns(){ | |
321 for(int column = 0; column < maxNr; column++){ | |
322 sheet.autoSizeColumn(column); | |
323 } | |
324 } | |
325 | |
326 /** | |
327 * helper method for setAutoFilters() and autoSizeColumns() to find the number | |
328 * of columns present in the tab sheet of the Excel file. | |
329 * Sometimes, there are columns which are empty in a certain row, but filled in | |
330 * another row. We always have to cope with the most extreme situation. Therefore, | |
331 * the maximum number of columns is determined. | |
332 * @param nr | |
333 */ | |
334 public void getMax(int nr){ | |
335 if (nr > maxNr){ | |
336 maxNr = nr; | |
337 } | |
338 } | |
339 | |
340 /** | |
341 * give the header cells a blue color and bold formatting | |
342 */ | |
343 public void formatHeader(){ | |
344 XSSFCellStyle style = wb.createCellStyle(); | |
345 XSSFFont font = wb.createFont(); | |
346 font.setColor(new XSSFColor(Color.BLUE)); | |
347 font.setBold(true); | |
348 style.setFont(font); | |
349 myCell.setCellStyle(style); | |
350 } | |
351 | |
352 /** | |
353 * freeze the header row | |
354 * method: public void createFreezePane(int colSplit, int rowSplit, intleftmostColumn, int topRow) | |
355 */ | |
356 public void freezeRow(){ | |
357 sheet.createFreezePane(0, 1, 0, 1); | |
358 } | |
359 | |
360 /** | |
361 * create a cell style that formats numbers in scientific notation (exponential) | |
362 * for the score column (index 8) | |
363 * Differentiate the text content (NA) from the exponential values via an if ... else | |
364 * because otherwise the formatting as exponential value is not OK. | |
365 */ | |
366 public void formatExponential(String s){ | |
367 if (s.equals("NA")){ | |
368 myCell.setCellValue("NA"); | |
369 } | |
370 else { | |
371 XSSFCellStyle cs = wb.createCellStyle(); | |
372 XSSFDataFormat df = wb.createDataFormat(); | |
373 cs.setDataFormat(df.getFormat("0.0E+0")); | |
374 myCell.setCellValue(Double.parseDouble(s)); | |
375 myCell.setCellStyle(cs); | |
376 } | |
377 } | |
378 | |
379 /********************* | |
380 * getters and setters | |
381 *********************/ | |
382 public void setColnr(int colnr) { | |
383 this.colnr = colnr; | |
384 } | |
385 | |
386 public int getColnr() { | |
387 return colnr; | |
388 } | |
389 | |
390 /** | |
391 * @param nrOfGOTerms the nrOfGOTerms to set | |
392 */ | |
393 public void setNrOfGOTerms(int nrOfGOTerms) { | |
394 this.maxNrOfGOTerms = nrOfGOTerms; | |
395 } | |
396 | |
397 /** | |
398 * @return the nrOfGOTerms | |
399 */ | |
400 public int getNrOfGOTerms() { | |
401 return maxNrOfGOTerms; | |
402 } | |
403 } |