view MergeFiles.py @ 6:008bc143e72e draft

Uploaded
author bornea
date Wed, 03 Aug 2016 15:06:39 -0400
parents c1199500c601
children 90045bffffaa
line wrap: on
line source

"""
Python-code: Merge Scaffold Samples Report files
@author = Brent Kuenzi
@email = Brent.Kuenzi@moffitt.org
"""
#######################################################################################
import sys
import urllib2
import os.path
import pandas
#######################################################################################
## Description: ##
#    This program merges either 2 or 3 Scaffold
#    sample report files into one tab-delimited table.
## Required input (positional command-line arguments): ##
#    All five positions are read unconditionally at import time, so
#    running with fewer arguments fails here with an IndexError.
infile1 = sys.argv[1] # scaffold report #1 -- filename
infile2 = sys.argv[2] # scaffold report #2 -- filename
infile3 = sys.argv[3] # scaffold report #3 -- filename, or the literal string "False" when only two reports are merged
baitfile = sys.argv[4] # Bait file -- filename; tab-delimited, first field of each line is looked up among the report header cells
outfile = sys.argv[5] # output filename
class ScaffoldReturn(object):
    """Container for the three pieces extracted from one Scaffold report.

    The constructor stores its arguments unchanged (no copying, no
    validation):
        data     -- value passed as ``getdata``
        proteins -- value passed as ``getproteins``
        header   -- value passed as ``getheader``
    """
    def __init__(self, getdata, getproteins, getheader):
        # Bind all three attributes in one step; the objects are kept by reference.
        self.data, self.proteins, self.header = getdata, getproteins, getheader
class ProteinInfo(object):
    """Container pairing per-protein molecular weights with descriptions.

    The constructor stores its arguments unchanged:
        mw    -- value passed as ``getMW``
        descr -- value passed as ``getDescr``
    """
    def __init__(self, getMW, getDescr):
        # Keep both objects by reference; nothing is copied or checked.
        self.mw, self.descr = getMW, getDescr
def readtab(infile):
    """Read a tab-delimited text file into a list of rows.

    Every line has its leading and trailing whitespace removed and is then
    split on tab characters, giving one list of strings per line. A blank
    line therefore becomes ``['']`` and an empty file gives ``[]``.

    NOTE(review): ``strip()`` also removes tabs, so empty cells at the very
    start or end of a line are dropped -- confirm this is acceptable for
    the files being read.
    """
    with open(infile, 'r') as handle: # read in tab-delim text
        rows = [raw.strip().split('\t') for raw in handle]
    return rows
def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
    """Parse a Scaffold samples report into data rows, accessions and header.

    The file is read with ``readtab``. The header is the first row that
    contains a cell exactly equal to "Accession Number"; the data rows are
    everything after it except the final two rows of the file.

    Args:
        scaffold_input: filename of the report.

    Returns:
        ScaffoldReturn with
            data     -- list of data rows; the accession cell of every row
                        is reduced to its first whitespace-separated token
            proteins -- list of those accession tokens, one per data row
            header   -- the header row (list of column titles)

    Raises:
        ValueError: if no row contains an "Accession Number" cell.
        IndexError: if a data row has no (or an empty) accession cell.
    """
    rows = readtab(scaffold_input)
    header_start = None
    for row_number, row in enumerate(rows):
        if "Accession Number" in row: # finds the start of header
            header_start = row_number
            break
    if header_start is None:
        # Previously this fell through to an unbound-variable error.
        raise ValueError(
            'No "Accession Number" header row found in %s' % (scaffold_input,))
    header = rows[header_start]
    prot_start = header.index("Accession Number")
    data = rows[header_start+1:len(rows)-2] # cut off blank line and END OF FILE
    proteins = []
    for row in data:
        # removes the (+##) that sometimes is attached
        row[prot_start] = row[prot_start].split()[0]
        proteins.append(row[prot_start])
    return ScaffoldReturn(data, proteins, header)
def MakeDF(scaffold_input,bait_input):
    """Build a bait-by-protein table from one Scaffold report.

    Args:
        scaffold_input: filename of the Scaffold report.
        bait_input: filename of the tab-delimited bait file. Only the first
            field of each line is used, and only when it is exactly equal
            to one of the report's header cells.

    Returns:
        pandas.DataFrame with one row per matched bait (in bait-file order)
        and one column per accession; the cells are the strings found in
        the report.

    NOTE(review): when an accession occurs on more than one data row, the
    values of every occurrence are appended to the same column list, so
    that list ends up longer than the row index -- confirm how repeated
    accessions should be handled.
    """
    bait = readtab(bait_input)
    # Parse the report once and reuse the result instead of re-reading the
    # file for each attribute.
    scaffold = read_scaffold(scaffold_input)
    data = scaffold.data
    header = scaffold.header
    prot_start = header.index("Accession Number")
    bait_index = []
    ind = []
    for bait_row in bait:
        if bait_row[0] in header:
            bait_index.append(header.index(bait_row[0])) # Find just the baits defined in bait file
            ind.append(bait_row[0])
    frames = {}
    for row in data:
        protein = row[prot_start].split()[0]
        values = frames.setdefault(protein, []) # one value list per accession
        for column in bait_index:
            values.append(row[column])
    # The file imports the package as `pandas`; the alias `pd` is never defined.
    return pandas.DataFrame(frames, index=ind)
def get_info(input1,input2,input3):
    """Collect description and molecular weight for every accession.

    Args:
        input1, input2: filenames of the first two Scaffold reports.
        input3: filename of a third report, or the string "False" to skip it.

    Returns:
        ProteinInfo whose ``mw`` and ``descr`` attributes are one-row
        pandas DataFrames (row labels "Molecular Weight" and
        "Identified Proteins") with one column per accession. When an
        accession appears more than once, the first occurrence in file
        order is kept.
    """
    files = [input1,input2]
    if input3 != "False":
        files.append(input3)
    molwt = {}
    protdesc = {}
    for report in files:
        # Parse each report once rather than once per attribute.
        scaffold = read_scaffold(report)
        data = scaffold.data
        header = scaffold.header
        prot_start = header.index("Accession Number") # find header
        # Columns are located by substring match; the last matching header
        # cell wins, and column 0 is used when nothing matches.
        prot = 0
        mw = 0
        for position, title in enumerate(header): # find info
            if "Identified" in title:
                prot = position
            if "Molecular Weight" in title:
                mw = position
        for row in data: # append info
            protein = row[prot_start].split()[0]
            if protein not in protdesc:
                protdesc[protein] = row[prot]
                molwt[protein] = row[mw]
    # The file imports the package as `pandas`; the alias `pd` is never defined.
    mw_frame = pandas.DataFrame(molwt,index=["Molecular Weight"])
    descr_frame = pandas.DataFrame(protdesc,index=["Identified Proteins"])
    return ProteinInfo(mw_frame,descr_frame) # return info as dataframe
def MakeFile(input1,input2,input3,bait_input,outfile=None):
    """Merge two or three Scaffold reports and write the result as TSV.

    The written table has one row per accession, numbered from 1 in an
    index column named "#". Its columns are the "Identified Proteins" and
    "Molecular Weight" values from ``get_info``, the bait columns of each
    report from ``MakeDF`` (in file order), and a final
    "Accession Number" column. Missing cells are filled with 0.

    Args:
        input1, input2: filenames of the first two Scaffold reports.
        input3: filename of a third report, or the string "False" to skip it.
        bait_input: filename of the bait file passed on to ``MakeDF``.
        outfile: path of the file to write. When omitted, the module-level
            ``outfile`` variable is used, which is what this function read
            before the parameter existed.

    Returns:
        None; the table is written to ``outfile``.
    """
    if outfile is None:
        # Backward-compatible fallback to the module-level setting.
        outfile = globals()["outfile"]
    files = [input1,input2]
    if input3 != "False":
        files.append(input3)
    # Gather the protein info once; each call re-parses every report.
    info = get_info(input1,input2,input3)
    DFs = [info.descr, info.mw]
    for report in files:
        DFs.append(MakeDF(report,bait_input))
    # The file imports the package as `pandas`; the alias `pd` is never defined.
    final_df = pandas.concat(DFs)
    temp = final_df.T # accessions become rows
    temp.index.name = "Accession Number"
    output = temp.fillna(0)
    # `.values` works across pandas versions; Index.get_values() was removed in 1.0.
    output["Accession Number"] = output.index.values
    output.index = range(1, len(output.index) + 1)
    output.index.name = "#"
    output.to_csv(outfile,sep="\t")

#######################################################################################
# Run the merge with the report and bait filenames taken from the command line.
# NOTE(review): an `outfile` keyword is passed here, so MakeFile has to accept
# one; the fixed name "merged.txt" is used instead of the `outfile` value read
# from sys.argv[5] -- confirm that this is intended.
MakeFile(
    input1=infile1,
    input2=infile2,
    input3=infile3,
    bait_input=baitfile,
    outfile="merged.txt",
)
#######################################################################################