diff MergeFiles.py @ 0:84564ae1cca1 draft

Uploaded
author bornea
date Tue, 26 Jul 2016 16:16:59 -0400
parents
children c1199500c601
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MergeFiles.py	Tue Jul 26 16:16:59 2016 -0400
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""
+Python-code: Merge Scaffold Samples Report files
+@author = Brent Kuenzi
+@email = Brent.Kuenzi@moffitt.org
+"""
+#######################################################################################
+import sys
+import urllib2
+import os.path
+import pandas as pd
+#######################################################################################
+## Description: ##
+#    This program will merge either 2 or 3 scaffold
+#    sample report files together
+## Required input: ##
+# Positional command-line arguments; a missing one raises IndexError here.
+(infile1,   # scaffold report #1 -- filename
+ infile2,   # scaffold report #2 -- filename
+ infile3,   # scaffold report #3 -- filename, or the literal string "False"
+ baitfile,  # bait file -- filename
+ outfile,   # output filename
+ ) = (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
+class ScaffoldReturn(object):
+    """Bundle the three results of parsing one Scaffold report.
+
+    read_scaffold() fills an instance with the data rows (data), the
+    accession taken from each data row (proteins) and the header row
+    of the table (header).
+    """
+    def __init__(self, getdata, getproteins, getheader):
+        # The arguments are stored unchanged under shorter attribute names.
+        self.data, self.proteins, self.header = getdata, getproteins, getheader
+class ProteinInfo(object):
+    """Pair of per-protein annotation tables produced by get_info().
+
+    mw holds the "Molecular Weight" table and descr the
+    "Identified Proteins" table.
+    """
+    def __init__(self, getMW, getDescr):
+        # The arguments are stored unchanged under shorter attribute names.
+        self.mw, self.descr = getMW, getDescr
+def readtab(infile):
+    """Read a tab-delimited text file into a list of rows.
+
+    Only the line terminator is removed from each line before it is split
+    on tabs, so empty leading or trailing columns are kept and every field
+    stays in the same position as its header.  Each field is then stripped
+    of surrounding whitespace.
+
+    infile -- path of the file to read
+    Returns a list holding one list of strings per line; a blank line
+    yields [''].
+    """
+    output = []
+    with open(infile, 'r') as handle: # read in tab-delim text
+        for line in handle:
+            # Stripping the whole line would also eat leading/trailing tabs,
+            # silently dropping empty columns and shifting the others.
+            fields = line.rstrip('\r\n').split('\t')
+            output.append([field.strip() for field in fields])
+    return output
+def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
+    """Parse a Scaffold samples report into data rows, accessions and header.
+
+    The header is the first row containing a cell equal to
+    "Accession Number".  Data rows are the rows after it, up to an
+    "END OF FILE" marker row; blank rows are skipped.  The accession cell
+    of every data row is reduced, in place, to its first
+    whitespace-separated token.
+
+    scaffold_input -- path of the Scaffold report
+    Returns a ScaffoldReturn(data, proteins, header).
+    Raises ValueError when no header row is found.
+    """
+    rows = readtab(scaffold_input)
+    header_start = None
+    for position, row in enumerate(rows):
+        if "Accession Number" in row: # finds the start of header
+            header_start = position
+            break
+    if header_start is None:
+        raise ValueError('No "Accession Number" header row found in %s'
+                         % scaffold_input)
+    header = rows[header_start]
+    prot_start = header.index("Accession Number")
+    data = []
+    for row in rows[header_start + 1:]:
+        # Trim the footer by content rather than by a fixed count of two
+        # lines, so a report without the trailer does not lose real rows.
+        if row and row[0].strip() == "END OF FILE":
+            break
+        if not any(field.strip() for field in row):
+            continue # blank line
+        data.append(row)
+    proteins = []
+    for row in data:
+        row[prot_start] = row[prot_start].split()[0] # removes the (+##) that sometimes is attached
+        proteins.append(row[prot_start])
+    return ScaffoldReturn(data, proteins, header)
+def MakeDF(scaffold_input,bait_input):
+    """Build the bait-by-protein table for one Scaffold report.
+
+    scaffold_input -- path of the Scaffold report
+    bait_input -- path of the bait file; the first column of each line is
+                  looked up among the report's header cells
+    Returns a pandas DataFrame with one row per bait name found in the
+    header and one column per protein accession.  When an accession occurs
+    on several rows, only the first row is used.
+    """
+    bait = readtab(bait_input)
+    scaffold = read_scaffold(scaffold_input) # parse the report only once
+    header = scaffold.header
+    prot_start = header.index("Accession Number")
+    bait_index = []
+    ind = []
+    for row in bait:
+        name = row[0]
+        # An empty name (blank bait line) must not match an empty header cell.
+        if name and name in header:
+            bait_index.append(header.index(name)) # Find just the baits defined in bait file
+            ind.append(name)
+    frames = {}
+    for row in scaffold.data:
+        protein = row[prot_start].split()[0]
+        if protein in frames:
+            # Appending a repeated accession would make this column longer
+            # than the index and break DataFrame construction.
+            continue
+        frames[protein] = [row[column] for column in bait_index]
+    return pd.DataFrame(frames,index=ind)
+def get_info(input1,input2,input3):
+    """Collect description and molecular weight for every protein.
+
+    input1, input2 -- paths of the first two Scaffold reports
+    input3 -- path of a third report, or the string "False" for none
+    Returns a ProteinInfo whose mw and descr attributes are one-row pandas
+    DataFrames (rows "Molecular Weight" and "Identified Proteins") with one
+    column per accession.  The first report that lists an accession
+    supplies its values.
+    """
+    files = [input1,input2]
+    if input3 != "False":
+        files.append(input3)
+    molwt = {}
+    protdesc = {}
+    for path in files:
+        scaffold = read_scaffold(path) # parse each report only once
+        header = scaffold.header
+        prot_start = header.index("Accession Number") # find header
+        # Both columns fall back to column 0 when no header cell matches;
+        # the last matching cell wins.
+        prot = 0
+        mw = 0
+        for position, title in enumerate(header): # find info
+            if "Identified" in title:
+                prot = position
+            if "Molecular Weight" in title:
+                mw = position
+        for row in scaffold.data: # append info
+            protein = row[prot_start].split()[0]
+            if protein not in protdesc:
+                protdesc[protein] = row[prot]
+                molwt[protein] = row[mw]
+    mw_frame = pd.DataFrame(molwt,index=["Molecular Weight"])
+    descr_frame = pd.DataFrame(protdesc,index=["Identified Proteins"])
+    return ProteinInfo(mw_frame,descr_frame) # return info as dataframe
+def MakeFile(input1,input2,input3,bait_input,outfile=None):
+    """Merge two or three Scaffold reports and write one tab-delimited table.
+
+    input1, input2 -- paths of the first two Scaffold reports
+    input3 -- path of a third report, or the string "False" for none
+    bait_input -- path of the bait file
+    outfile -- path of the file to write; when omitted, the module-level
+               outfile value is used
+    The table has one row per accession, numbered from 1 in a column named
+    "#", followed by the description, the molecular weight, the bait
+    columns of each report and finally "Accession Number".  Missing values
+    are written as 0.
+    """
+    if outfile is None:
+        # Keep the original behaviour for callers that pass four arguments.
+        outfile = globals()["outfile"]
+    files = [input1,input2]
+    if input3 != "False":
+        files.append(input3)
+    info = get_info(input1,input2,input3) # parse the reports only once
+    DFs = [info.descr, info.mw]
+    for path in files:
+        DFs.append(MakeDF(path,bait_input))
+    merged = pd.concat(DFs).T # proteins become rows
+    merged.index.name = "Accession Number"
+    output = merged.fillna(0)
+    # Index.get_values() no longer exists in current pandas; .values is
+    # available in old and new releases alike.
+    output["Accession Number"] = output.index.values
+    output.index = range(1, len(output.index) + 1)
+    output.index.name = "#"
+    output.to_csv(outfile,sep="\t")
+
+#######################################################################################
+# The merged table is written to the output path given on the command line.
+MakeFile(input1=infile1, input2=infile2, input3=infile3, bait_input=baitfile)
+#######################################################################################