Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Split_ExcelTabs_IntoFiles.py @ 1:f1bcd79cd923 draft default tip
Uploaded
| author | insilico-bob | 
|---|---|
| date | Tue, 27 Nov 2018 14:20:40 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:7f12c81e2083 | 1:f1bcd79cd923 | 
|---|---|
| 1 import sys | |
| 2 import os | |
| 3 #import MySQLdb | |
| 4 #import config | |
| 5 import subprocess | |
| 6 import re | |
| 7 import shutil | |
| 8 import traceback | |
| 9 #import xlsxwriter | |
| 10 import xlrd | |
| 11 | |
| 12 #http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/ | |
| 13 | |
| 14 def File_From_Tab(infileName, outfileName, tabName,tabNumber): | |
| 15 """ | |
| 16 Open and read an Excel file | |
| 17 """ | |
| 18 book = xlrd.open_workbook(infileName) | |
| 19 # print number of sheets | |
| 20 #print book.nsheets | |
| 21 | |
| 22 # print sheet names | |
| 23 tabList= book.sheet_names() | |
| 24 #print tabList | |
| 25 #print book.sheet_names() | |
| 26 if tabName == "" and (tabNumber <1 or tabNumber > len(tabList)): | |
| 27 sys.stderr.write("\n>>>ERROR illegal tab number "+str(tabNumber)+" input when no tab name was specified\n") | |
| 28 sys.stderr.write("\n>>>Allowed tab numbers, or tab names, for this file with "+str(len(tabList))+" total tabs are:") | |
| 29 | |
| 30 for i in range(len(tabList)): | |
| 31 sys.stderr.write("\n>>> tab number "+str(i+1)+" is named "+str(tabList[i])) | |
| 32 sys.exit(-1) | |
| 33 | |
| 34 if tabName != "": # use name instead of tab number | |
| 35 found = False | |
| 36 i = 0 | |
| 37 while (i < len(tabList)) and not found: | |
| 38 i += 1 | |
| 39 if tabName == str(tabList[i-1]): | |
| 40 tabNumber = i | |
| 41 found = True | |
| 42 if not found: | |
| 43 sys.stderr("\n>>> ERROR -- Input Tab name "+tabName+" was not found\n") | |
| 44 sys.exit(-1) | |
| 45 # get the first worksheet | |
| 46 #first_sheet = book.sheet_by_index(0) | |
| 47 worksheet = book.sheet_by_index(tabNumber-1) | |
| 48 | |
| 49 outFile = open(outfileName+str(tabList[tabNumber-1]+".tsv"), 'w') | |
| 50 | |
| 51 #https://stackoverflow.com/questions/14944623/python-xrld-read-rows-and-columns | |
| 52 #workbook = xlrd.open_workbook('my_workbook.xls') | |
| 53 #worksheet = workbook.sheet_by_name('Sheet1') | |
| 54 num_rows = worksheet.nrows - 1 | |
| 55 num_cells = worksheet.ncols - 1 | |
| 56 curr_row = -1 | |
| 57 while curr_row < num_rows: | |
| 58 curr_row += 1 | |
| 59 row = worksheet.row(curr_row) | |
| 60 | |
| 61 if curr_row == 0: | |
| 62 endOfLine= False | |
| 63 allRowNumCols= len(row) | |
| 64 i= len(row)-1 | |
| 65 # find length of matrix and covariates using first row | |
| 66 # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank | |
| 67 while i <= len(row)-1 and not endOfLine: | |
| 68 cell_type = worksheet.cell_type(curr_row, i) | |
| 69 #temp = str(worksheet.cell_value(curr_row, i)) | |
| 70 #print( " pos and cell type row one ",cell_type, i) | |
| 71 | |
| 72 if cell_type == 0 or cell_type == 6: | |
| 73 allRowNumCols -= 1 | |
| 74 i -= 1 | |
| 75 else: | |
| 76 endOfLine= True | |
| 77 | |
| 78 if allRowNumCols < 5: | |
| 79 sys.stderr.write("\nERROR First row number of columns= "+str(allRowNumCols)+" is too short, so all rows will be ignored\n") | |
| 80 sys.exit(-1) | |
| 81 elif curr_row == 0: | |
| 82 sys.stdout.write("\nALL Rows must all have the same number of columns as the First row's number columns = "+ str(allRowNumCols) +"\n") | |
| 83 | |
| 84 temp= '' | |
| 85 rowLen= 0 | |
| 86 endOfLine= False | |
| 87 | |
| 88 while rowLen < allRowNumCols and not endOfLine: | |
| 89 temp += str(worksheet.cell_value(curr_row, rowLen))+"\t" | |
| 90 #temp += str(row[rowLen])+"\t" | |
| 91 rowLen += 1 | |
| 92 | |
| 93 temp = temp[:-1]+"\n" | |
| 94 #print 'Row:', curr_row, len(row), rowLen | |
| 95 outFile.write(temp) #TODO check if rows are all same length | |
| 96 | |
| 97 sys.stdout.write("File created with "+str(curr_row)+" rows and "+str(allRowNumCols)+" columns\n") | |
| 98 # curr_cell = -1 | |
| 99 # while curr_cell < num_cells: | |
| 100 # curr_cell += 1 | |
| 101 # # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank | |
| 102 # cell_type = worksheet.cell_type(curr_row, curr_cell) | |
| 103 # cell_value = worksheet.cell_value(curr_row, curr_cell) | |
| 104 # print ' ', cell_type, ':', cell_value | |
| 105 # # read a row | |
| 106 # print first_sheet.row_values(0) | |
| 107 # | |
| 108 # # read a cell | |
| 109 # cell = first_sheet.cell(0,0) | |
| 110 # print cell | |
| 111 # print cell.value | |
| 112 # | |
| 113 # # read a row slice | |
| 114 # print first_sheet.row_slice(rowx=0, | |
| 115 # start_colx=0, | |
| 116 # end_colx=2) | |
| 117 | |
| 118 return tabList | |
| 119 | |
| 120 | |
| 121 #====================== | |
| 122 # from RPPA callInSilicoReportWriter.py | |
| 123 # def write_xlsx_for_report(directory_for_reports, report_name, report_id, dict_cf2_values): | |
| 124 # | |
| 125 # | |
| 126 # error_write_xlsx = "" | |
| 127 # error_occurred = 0 | |
| 128 # | |
| 129 # try: | |
| 130 # path_to_dir_when_writing = os.path.join(directory_for_reports, report_name) | |
| 131 # header_path = os.path.join(directory_for_reports, report_name, "header.csv") | |
| 132 # raw_log_2_path = os.path.join(directory_for_reports, report_name, "RawLog2.csv") | |
| 133 # norm_linear_path = os.path.join(directory_for_reports, report_name, "NormLinear.csv") | |
| 134 # norm_log_2_path = os.path.join(directory_for_reports, report_name, "NormLog2.csv") | |
| 135 # norm_log_2_median_centered_path = os.path.join(directory_for_reports, report_name, "NormLog2_MedianCentered.csv") | |
| 136 # | |
| 137 # # put the cf2 values in the NormLinear file | |
| 138 # error_put_cf2_in_normLinear = write_new_normLinear_csv_file_with_cf2_values(path_to_dir_when_writing, norm_linear_path, dict_cf2_values) | |
| 139 # | |
| 140 # | |
| 141 # excel_workBook = xlsxwriter.Workbook(os.path.join(directory_for_reports, report_name,report_name + ".xlsx"), {'strings_to_numbers': True}) | |
| 142 # | |
| 143 # rawLog2_worksheet = excel_workBook.add_worksheet("RawLog2") | |
| 144 # error_rawLog2 = construct_worksheet_for_xlsx(rawLog2_worksheet, header_path, "RawLog2", raw_log_2_path) | |
| 145 # | |
| 146 # norm_linear_worksheet = excel_workBook.add_worksheet("NormLinear") | |
| 147 # error_norm_linear = construct_worksheet_for_xlsx(norm_linear_worksheet, header_path, "NormLinear", norm_linear_path) | |
| 148 # | |
| 149 # norm_log_2_worksheet = excel_workBook.add_worksheet("NormLog2") | |
| 150 # error_norm_log_2 = construct_worksheet_for_xlsx(norm_log_2_worksheet, header_path, "NormLog2", norm_log_2_path) | |
| 151 # | |
| 152 # norm_log_2_median_centered_worksheet = excel_workBook.add_worksheet("NormLog2_MedianCentered") | |
| 153 # error_norm_log_2_median_centered = construct_worksheet_for_xlsx(norm_log_2_median_centered_worksheet, header_path, "Median-Centered", norm_log_2_median_centered_path) | |
| 154 # | |
| 155 # errors_array = [error_put_cf2_in_normLinear, error_rawLog2, error_norm_linear, error_norm_log_2, error_norm_log_2_median_centered] | |
| 156 # for error in errors_array: | |
| 157 # if error != "": | |
| 158 # error_write_xlsx = error_write_xlsx + error | |
| 159 # error_occurred = 1 | |
| 160 # if error_occurred == 1: | |
| 161 # error_write_xlsx + "\nThe excel workbook for the report "+report_name+" was not written successfully.\n\n" | |
| 162 # | |
| 163 # excel_workBook.close() | |
| 164 # except Exception, e: | |
| 165 # error_occurred = 1 | |
| 166 # error_write_xlsx += str(repr(e)) + "\n\n" | |
| 167 # error_write_xlsx + "\nThe excel workbook for the report "+report_name+" was not written successfully.\n\n" | |
| 168 # try: | |
| 169 # excel_workBook.close() | |
| 170 # except Exception, f: | |
| 171 # sys.stderr.write("An unforeseen problem has occurred in write_xlsx_for_report()\n") | |
| 172 # sys.stderr.write(str(repr(f)) + "\n\n") | |
| 173 # | |
| 174 # | |
| 175 # return error_occurred, error_write_xlsx | |
| 176 # | |
| 177 # | |
| 178 # def write_new_normLinear_csv_file_with_cf2_values(path_to_dir, norm_linear_path, dict_cf2_values): | |
| 179 # errors = "" | |
| 180 # try: | |
| 181 # titles = {} | |
| 182 # new_lines_normLinear_with_cf2 = [] | |
| 183 # # read old norm linear file | |
| 184 # rf_normLinear = open(norm_linear_path, 'rU') | |
| 185 # line_num = 0 | |
| 186 # for line in rf_normLinear: | |
| 187 # line = strip_new_line_from_right_side(line) | |
| 188 # toks = line.split(",") | |
| 189 # line_num += 1 | |
| 190 # if line_num == 1: | |
| 191 # line += "1,CF2" | |
| 192 # new_lines_normLinear_with_cf2.append(line) | |
| 193 # titles = toks | |
| 194 # continue | |
| 195 # pos_rf = int(toks[titles.index('Order')]) | |
| 196 # line += "," + str(dict_cf2_values[pos_rf]) | |
| 197 # new_lines_normLinear_with_cf2.append(line) | |
| 198 # rf_normLinear.close() | |
| 199 # # rename the old normLinear file | |
| 200 # os.rename(norm_linear_path, os.path.join(path_to_dir, 'before_cf2_NormLinear.csv')) | |
| 201 # | |
| 202 # # write new normLinear with cf2 | |
| 203 # wf_new_normLinear = open(norm_linear_path, 'w') | |
| 204 # for line_writing in new_lines_normLinear_with_cf2: | |
| 205 # wf_new_normLinear.write(line_writing + "\n") | |
| 206 # wf_new_normLinear.close() | |
| 207 # except Exception, err_write_normLinear_with_cf2_values: | |
| 208 # errors = str(repr(err_write_normLinear_with_cf2_values)) | |
| 209 # | |
| 210 # return errors | |
| 211 # | |
| 212 # | |
| 213 # # This function constructs the worksheet for each tab in the excel file for a report | |
| 214 # # It puts these things in this order: | |
| 215 # # 1. Title of the tab | |
| 216 # # 2. Header for the tab | |
| 217 # # 3. Content of the tab | |
| 218 # def construct_worksheet_for_xlsx(worksheet, header_path, title_top_of_tab, tab_input_path): | |
| 219 # | |
| 220 # reload(sys) | |
| 221 # sys.setdefaultencoding('utf8') | |
| 222 # errors = "" | |
| 223 # | |
| 224 # try: | |
| 225 # # Write the title at the top of the tab | |
| 226 # worksheet.write(0,0,title_top_of_tab) | |
| 227 # | |
| 228 # # Variable to keep track of the rows | |
| 229 # row_num = 1 | |
| 230 # | |
| 231 # # Write the header stuff | |
| 232 # header_file = open(header_path, 'rU') | |
| 233 # for head_line in header_file: | |
| 234 # head_line = strip_new_line_from_right_side(head_line) | |
| 235 # head_toks = head_line.split(",") | |
| 236 # col_num = 0 | |
| 237 # for tok in head_toks: | |
| 238 # worksheet.write(row_num, col_num, tok) | |
| 239 # col_num += 1 | |
| 240 # row_num += 1 | |
| 241 # | |
| 242 # # Write the content stuff | |
| 243 # tab_input_file = open(tab_input_path, 'rU') | |
| 244 # for tab_line in tab_input_file: | |
| 245 # tab_line = strip_new_line_from_right_side(tab_line) | |
| 246 # tab_toks = tab_line.split(",") | |
| 247 # col_num = 0 | |
| 248 # for tok in tab_toks: | |
| 249 # tok = tok.decode('iso-8859-1').encode('utf-8') | |
| 250 # worksheet.write(row_num, col_num, tok) | |
| 251 # col_num += 1 | |
| 252 # row_num += 1 | |
| 253 # | |
| 254 # header_file.close() | |
| 255 # tab_input_file.close() | |
| 256 # except Exception, e: | |
| 257 # errors = errors + "\n\nAn error occurred while constructing the "+title_top_of_tab+" tab for the excel file.\n" | |
| 258 # errors = errors + "The error was :\n\t" + str(e) + "\n\n" | |
| 259 # try: | |
| 260 # header_file.close() | |
| 261 # tab_input_file.close() | |
| 262 # except NameError: | |
| 263 # x = 5 | |
| 264 # | |
| 265 return errors | |
| 266 | |
| 267 #---------------------------------------------------------------------- | |
| 268 if __name__ == "__main__": | |
| 269 | |
| 270 #try: | |
| 271 if len(sys.argv) > 4: | |
| 272 infileName = '"'+sys.argv[1]+'"' | |
| 273 tabName = '"'+sys.argv[2]+'"' | |
| 274 tabNumber = 0 | |
| 275 if tabName == '': tabNumber = int(sys.argv[3]) | |
| 276 outfileName = '"'+sys.argv[4]+'"' #TODO Later multiple outputs one per tab | |
| 277 | |
| 278 sys.stdout.write( "\nInput parameters ",str(sys.argv[1:4]),"\n" ) | |
| 279 | |
| 280 #infileName = "/Users/bobbrown/Desktop/01_Gordon_Mills__Zhiyong_Ding.xlsx" | |
| 281 #outfileName= "/Users/bobbrown/Desktop/01_Gordon_Mills__Zhiyong_Ding-Tab-Out-" | |
| 282 #tabName ="NormLog2" | |
| 283 #tabName ="" | |
| 284 #tabNumber= 10 | |
| 285 | |
| 286 status= File_From_Tab(infileName, outfileName, tabName, tabNumber ) | |
| 287 #except | |
| 288 #sys.exit(-1) | |
| 289 | |
| 290 sys.exit(0) | 
