Mercurial > repos > bornea > merge_scaffold
diff MergeFiles.py @ 0:84564ae1cca1 draft
Uploaded
author | bornea |
---|---|
date | Tue, 26 Jul 2016 16:16:59 -0400 |
parents | |
children | c1199500c601 |
line wrap: on
line diff
# -*- coding: utf-8 -*-
"""
Python-code: Merge Scaffold Samples Report files
@author = Brent Kuenzi
@email = Brent.Kuenzi@moffitt.org
"""
#######################################################################################
import sys
try:
    import urllib2
except ImportError:  # Python 3 has no urllib2; urllib.request is its successor
    import urllib.request as urllib2
import os.path
import pandas as pd
#######################################################################################
## Description: ##
# This program will merge either 2 or 3 scaffold
# sample report files together
## Required input (command line): ##
# argv[1]  scaffold report #1 -- filename
# argv[2]  scaffold report #2 -- filename
# argv[3]  scaffold report #3 -- filename or "False"
# argv[4]  Bait file -- filename
# argv[5]  output filename

# Header cell that marks both the header row and the accession column.
_ACCESSION = "Accession Number"


class ScaffoldReturn(object):
    """Parsed content of one scaffold report.

    Attributes:
        data: list of data rows (each a list of cell strings).
        proteins: accession of every data row, in row order.
        header: the header row (list of column titles).
    """

    def __init__(self, getdata, getproteins, getheader):
        self.data = getdata
        self.proteins = getproteins
        self.header = getheader


class ProteinInfo(object):
    """Per-protein annotation frames produced by get_info().

    Attributes:
        mw: single-row DataFrame indexed "Molecular Weight".
        descr: single-row DataFrame indexed "Identified Proteins".
    """

    def __init__(self, getMW, getDescr):
        self.mw = getMW
        self.descr = getDescr


def readtab(infile):
    """Read a tab-delimited text file into a list of rows.

    Every line is stripped of surrounding whitespace and split on tabs, so
    a blank line yields ``['']``.
    """
    with open(infile, 'r') as handle:
        return [line.strip().split('\t') for line in handle]


def read_scaffold(scaffold_input):
    """Get data, proteins and header from a scaffold report.

    The header is the first row containing an "Accession Number" cell.  The
    data rows are everything after it except the last two rows of the file
    (blank line and END OF FILE).  Accessions are reduced to their first
    whitespace-separated token, which removes the "(+##)" suffix that is
    sometimes attached; the rows in ``data`` are updated in place.

    Raises:
        ValueError: if no row contains an "Accession Number" cell.
    """
    rows = readtab(scaffold_input)
    header_start = None
    for position, row in enumerate(rows):
        if _ACCESSION in row:  # finds the start of header
            header_start = position
            break
    if header_start is None:
        # Previously this surfaced as an unbound-variable error further down.
        raise ValueError(
            'no "Accession Number" header row found in {0}'.format(scaffold_input)
        )
    header = rows[header_start]
    prot_start = header.index(_ACCESSION)
    data = rows[header_start + 1:len(rows) - 2]  # cut off blank line and END OF FILE
    for row in data:
        row[prot_start] = row[prot_start].split()[0]
    proteins = [row[prot_start] for row in data]
    return ScaffoldReturn(data, proteins, header)


def MakeDF(scaffold_input, bait_input):
    """Build a bait-by-protein DataFrame for one scaffold report.

    Only baits whose name (first column of the bait file) appears in the
    report header are used.  The frame has one row per such bait and one
    column per accession.
    """
    bait = readtab(bait_input)
    scaffold = read_scaffold(scaffold_input)  # parse the report once
    header = scaffold.header
    prot_start = header.index(_ACCESSION)
    bait_index = []
    ind = []
    for entry in bait:
        if entry[0] in header:  # keep just the baits defined in the bait file
            bait_index.append(header.index(entry[0]))
            ind.append(entry[0])
    frames = {}
    for row in scaffold.data:
        # read_scaffold() has already reduced the accession to its first token.
        values = frames.setdefault(row[prot_start], [])
        values.extend(row[j] for j in bait_index)
    return pd.DataFrame(frames, index=ind)


def get_info(input1, input2, input3):
    """Collect protein description and molecular weight across the reports.

    ``input3`` is skipped when it is the string "False".  For each accession
    the values from the first report/row that mentions it are kept.  The
    description column is the last header containing "Identified" and the
    weight column the last header containing "Molecular Weight"; column 0 is
    used when no header matches.

    Returns:
        ProteinInfo holding the two single-row DataFrames.
    """
    files = [input1, input2]
    if input3 != "False":
        files.append(input3)
    molwt = {}
    protdesc = {}
    for path in files:
        scaffold = read_scaffold(path)  # parse the report once
        header = scaffold.header
        prot_start = header.index(_ACCESSION)
        prot = 0
        mw = 0
        for position, title in enumerate(header):  # find info columns
            if "Identified" in title:
                prot = position
            if "Molecular Weight" in title:
                mw = position
        for row in scaffold.data:  # append info
            protein = row[prot_start]
            if protein not in protdesc:
                protdesc[protein] = row[prot]
                molwt[protein] = row[mw]
    mw_frame = pd.DataFrame(molwt, index=["Molecular Weight"])
    descr_frame = pd.DataFrame(protdesc, index=["Identified Proteins"])
    return ProteinInfo(mw_frame, descr_frame)  # return info as dataframe


def MakeFile(input1, input2, input3, bait_input, outfile="merged.txt"):
    """Merge 2 or 3 scaffold reports into one tab-delimited table.

    Args:
        input1, input2: scaffold report filenames.
        input3: third scaffold report filename, or the string "False".
        bait_input: bait file filename.
        outfile: path of the table to write.  The script's own call passes
            this keyword, which the function previously did not accept.

    The output has one row per accession, numbered from 1 in a "#" column,
    with the description, molecular weight, one column per bait and a
    trailing "Accession Number" column.  Missing values are written as 0.
    """
    files = [input1, input2]
    if input3 != "False":
        files.append(input3)
    info = get_info(input1, input2, input3)  # computed once, used twice
    DFs = [info.descr, info.mw]
    for path in files:
        DFs.append(MakeDF(path, bait_input))
    final_df = pd.concat(DFs)
    temp = final_df.T
    temp.index.name = "Accession Number"
    # Cast to object so filling with the integer 0 stays valid even when
    # pandas infers a dedicated string dtype for the text cells.
    output = temp.astype(object).fillna(0)
    # Index.get_values() was removed from pandas; .values is the equivalent.
    output["Accession Number"] = output.index.values
    output.index = range(1, len(output.index) + 1)
    output.index.name = "#"
    output.to_csv(outfile, sep="\t")


#######################################################################################
if __name__ == "__main__":
    infile1 = sys.argv[1]  # scaffold report #1 -- filename
    infile2 = sys.argv[2]  # scaffold report #2 -- filename
    infile3 = sys.argv[3]  # scaffold report #3 -- filename or "False"
    baitfile = sys.argv[4]  # Bait file -- filename
    outfile = sys.argv[5]  # output filename
    # NOTE(review): as in the original call, the result is written to
    # "merged.txt" rather than to the fifth command-line argument -- confirm
    # against the tool wrapper which of the two is intended.
    MakeFile(input1=infile1, input2=infile2, input3=infile3,
             bait_input=baitfile, outfile="merged.txt")
#######################################################################################