Mercurial > repos > bornea > saint_preprocessing
diff Protein_report_processing.py @ 75:792056ff8ed5 draft
Uploaded
author | bornea |
---|---|
date | Fri, 02 Sep 2016 16:32:26 -0400 |
parents | 4f843e0c6c40 |
children | 43b9bad147df |
line wrap: on
line diff
--- a/Protein_report_processing.py Sat Aug 27 23:57:34 2016 -0400 +++ b/Protein_report_processing.py Fri Sep 02 16:32:26 2016 -0400 @@ -1,221 +0,0 @@ -import sys -import os -from time import sleep - -files = sys.argv[1] # read in a string of file names seperated by ", " -# e.g. "Default_Protein_Report.txt, Default_Protein_Report_2.txt" -#bait = sys.argv[2] # SAINT formatted bait file -# still need a way to match files to bait identifiers -# or they can just be required to be put in the order of the bait file -quant_type = sys.argv[3] # what metric to use for quantification -# "#Validated Peptides", "#Peptides", "#Unique", "#Validated PSMs", "#PSMs" -db = sys.argv[4] # fasta database used in SearchGUI and PeptideShaker -prey = sys.argv[5] -tool_path = sys.argv[7] -if db == "None": - db = str(tool_path) + "/SwissProt_HUMAN_2015_12.fasta" -make_bait = sys.argv[6] -bait_bool = sys.argv[8] - -def bait_create(baits, infile): - # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. - baits = make_bait.split() - i = 0 - bait_file_tmp = open("bait.txt", "w") - order = [] - bait_cache = [] - while i < len(baits): - if baits[i+2] == "true": - T_C = "C" - else: - T_C = "T" - bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" - bait_cache.append(str(bait_line)) - i = i + 3 - - for cache_line in bait_cache: - bait_file_tmp.write(cache_line) - - bait_file_tmp.close() - -if bait_bool == 'false': - bait_create(make_bait, infile) - bait = "bait.txt" -else: - bait_temp_file = open(sys.argv[9], 'r') - bait_cache = bait_temp_file.readlines() - bait_file_tmp = open("bait.txt", "wr") - for cache_line in bait_cache: - bait_file_tmp.write(cache_line) - bait_file_tmp.close() - bait = "bait.txt" - -class ReturnValue1(object): - def __init__(self, sequence, gene): - self.seqlength = sequence - self.genename = gene - -def read_tab(infile): - with open(infile,'r') as x: - output = [] - for line in x: - line = line.strip() - temp = line.split('\t') - output.append(temp) - return output -def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): - """ - Call in a loop to create terminal progress bar - @params: - iteration - Required : current iteration (Int) - total - Required : total iterations (Int) - prefix - Optional : prefix string (Str) - suffix - Optional : suffix string (Str) - decimals - Optional : positive number of decimals in percent complete (Int) - barLength - Optional : character length of bar (Int) - """ - formatStr = "{0:." + str(decimals) + "f}" - percents = formatStr.format(100 * (iteration / float(total))) - filledLength = int(round(barLength * iteration / float(total))) - bar = '=' * filledLength + '-' * (barLength - filledLength) - sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), - sys.stdout.flush() - if iteration == total: - sys.stdout.write('\n') - sys.stdout.flush() -def get_info(uniprot_accession_in,fasta_db): - # Get aminoacid lengths and gene name. - error = open('error proteins.txt', 'a+') - data = open(fasta_db, 'r') - data_lines = data.readlines() - db_len = len(data_lines) - seqlength = 0 - count = 0 - last_line = data_lines[-1] - for data_line in data_lines: - if ">sp" in data_line: - namer = data_line.split("|")[2] - if uniprot_accession_in == data_line.split("|")[1]: - match = count + 1 - if 'GN=' in data_line: - lst = data_line.split('GN=') - lst2 = lst[1].split(' ') - genename = lst2[0] - if 'GN=' not in data_line: - genename = 'NA' - while ">sp" not in data_lines[match]: - if match <= db_len: - seqlength = seqlength + len(data_lines[match].strip()) - if data_lines[match] == last_line: - break - match = match + 1 - else: - break - return ReturnValue1(seqlength, genename) - if uniprot_accession_in == namer.split(" ")[0]: - match = count + 1 - # Ensures consistent spacing throughout. - if 'GN=' in data_line: - lst = data_line.split('GN=') - lst2 = lst[1].split(' ') - genename = lst2[0] - if 'GN=' not in data_line: - genename = 'NA' - while ">sp" not in data_lines[match]: - if match <= db_len: - seqlength = seqlength + len(data_lines[match].strip()) - if data_lines[match] == last_line: - break - match = match + 1 - else: - break - return ReturnValue1(seqlength, genename) - count = count + 1 - if seqlength == 0: - error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') - error.close - seqlength = 'NA' - genename = 'NA' - return ReturnValue1(seqlength, genename) -def concatenate_files(file_list_string, bait_file): - file_list = file_list_string.split(",") - bait = read_tab(bait_file) - master_table = [] - header_check = 0 - file_cnt = 0 - table_cnt = 0 - for i in file_list: - table = read_tab(i) - for j in table: - if table_cnt == 0: - if header_check == 0: - header_check +=1 - j.append("Replicate") - j.append("Bait_Grouping") - master_table.append(j) - if table_cnt > 0: - j.append(bait[file_cnt][0]) - j.append(bait[file_cnt][1]) - master_table.append(j) - table_cnt +=1 - file_cnt+=1 - table_cnt = 0 - if len(master_table[0]) < len(master_table[1]): - master_table[0] = ["#"] + master_table[0] - with open("merged_PeptideShaker.txt","w") as x: - for i in master_table: - x.write("\t".join(i)) - x.write("\n") - return master_table -def make_inter(master_table,quant_type): - if len(master_table[0]) < len(master_table[1]): - master_table[0] = ["#"] + master_table[0] - replicate_index = master_table[0].index("Replicate") - grouping_index = master_table[0].index("Bait_Grouping") - accession_index = master_table[0].index("Main Accession") - quant_type = quant_type.replace("_", " ") - quant_type = r"#" + quant_type - Quant_index = master_table[0].index(quant_type) - inter_file = "" - for i in master_table[1:]: - line = [] - line.append(i[replicate_index]) - line.append(i[grouping_index]) - line.append(i[accession_index]) - line.append(i[Quant_index]) - inter_file = inter_file + "\t".join(line) + "\n" - with open("inter.txt","w") as x: - x.write(inter_file) - -def make_prey(concat_table,fasta_db): - input_data = concat_table - if len(input_data[0]) < len(input_data[1]): - input_data[0] = ["#"] + input_data[0] - accession_index = input_data[0].index("Main Accession") - proteins = [] - for i in input_data[1:]: - proteins.append(i[accession_index]) - output_file = open("prey.txt", 'w') - start = 0 - end = len(proteins) - - # Initial call to print 0% progress - printProgress(start, end, prefix = 'Progress:', suffix = 'Complete', barLength = 50) - - for protein in proteins: - seq = get_info(protein,fasta_db).seqlength - GN = get_info(protein,fasta_db).genename - if seq != 'NA': - output_file.write(protein + "\t" + str(seq) + "\t" + str(GN) + "\n") - start+=1 - printProgress(start, end, prefix = 'Progress:', suffix = 'Complete', barLength = 50) - output_file.close() -data = concatenate_files(files,bait) -make_inter(data, quant_type) -if prey == "true": - make_prey(data,db) - -os.rename("bait.txt", sys.argv[2]) -os.rename("inter.txt", sys.argv[10]) -if str(prey) != "None": - os.rename("prey.txt", sys.argv[11]) \ No newline at end of file