0
|
1 # -*- coding: utf-8 -*-
|
|
2 """
|
|
3 Python-code: Merge Scaffold Samples Report files
|
|
4 @author = Brent Kuenzi
|
|
5 @email = Brent.Kuenzi@moffitt.org
|
|
6 """
|
|
7 #######################################################################################
|
|
8 import sys
|
|
9 import urllib2
|
|
10 import os.path
|
|
11 import pandas as pd
|
|
12 #######################################################################################
|
|
13 ## Description: ##
|
|
14 # This program will merge either 2 or 3 scaffold
|
|
15 # sample report files together
|
|
16 ## Required input: ##
|
|
# Command-line arguments, in positional order:
#   1. scaffold report #1 -- filename
#   2. scaffold report #2 -- filename
#   3. scaffold report #3 -- filename or "False"
#   4. Bait file -- filename
#   5. output filename
(infile1, infile2, infile3, baitfile, outfile) = (
    sys.argv[1],
    sys.argv[2],
    sys.argv[3],
    sys.argv[4],
    sys.argv[5],
)
|
|
class ScaffoldReturn(object):
    """Bundle of the three pieces extracted from one scaffold report.

    Attributes are stored exactly as passed in:
      data     -- value given as ``getdata``
      proteins -- value given as ``getproteins``
      header   -- value given as ``getheader``
    """

    def __init__(self, getdata, getproteins, getheader):
        self.data, self.proteins, self.header = getdata, getproteins, getheader
|
|
class ProteinInfo(object):
    """Pair of per-protein annotation tables.

    Attributes are stored exactly as passed in:
      mw    -- value given as ``getMW``
      descr -- value given as ``getDescr``
    """

    def __init__(self, getMW, getDescr):
        self.mw, self.descr = getMW, getDescr
|
|
def readtab(infile):
    """Read a tab-delimited text file into a list of rows.

    Each line is stripped of leading and trailing whitespace and then split
    on tab characters, so a blank line yields ``['']``.

    infile -- path of the file to read
    Returns a list with one list of string fields per line.
    """
    with open(infile, 'r') as handle:
        return [raw.strip().split('\t') for raw in handle]
|
|
def read_scaffold(scaffold_input):  # Get data, proteins and header from scaffold output
    """Parse a scaffold report into its header row, data rows and accessions.

    The header is the first row that has a field exactly equal to
    "Accession Number". Data rows are everything after the header except the
    last two rows of the file (expected to be a blank line and the
    END OF FILE marker). Each accession cell is cut down, in place, to its
    first whitespace-separated token.

    scaffold_input -- path of the scaffold report (tab-delimited text)
    Returns a ScaffoldReturn(data, proteins, header).
    Raises ValueError if no row contains "Accession Number".
    """
    rows = readtab(scaffold_input)
    header_start = None
    for row_number, row in enumerate(rows):
        if "Accession Number" in row:  # finds the start of header
            header_start = row_number
            break
    if header_start is None:
        # Previously this fell through to an unbound-variable error.
        raise ValueError(
            "No 'Accession Number' header row found in %s" % scaffold_input
        )
    header = rows[header_start]
    prot_start = header.index("Accession Number")
    data = rows[header_start + 1:len(rows) - 2]  # cut off blank line and END OF FILE
    proteins = []
    for row in data:
        # removes the (+##) that sometimes is attached
        row[prot_start] = row[prot_start].split()[0]
        proteins.append(row[prot_start])
    return ScaffoldReturn(data, proteins, header)
|
|
def MakeDF(scaffold_input, bait_input):
    """Build a bait-by-protein DataFrame from one scaffold report.

    Only bait names (first field of each bait-file row) that also appear in
    the report header are kept. The frame has one row per kept bait, in
    bait-file order, and one column per accession.

    scaffold_input -- path of the scaffold report
    bait_input     -- path of the tab-delimited bait file
    Returns a pandas DataFrame indexed by bait name.
    """
    bait = readtab(bait_input)
    # Parse the report once; the original re-read the file for each attribute.
    scaffold = read_scaffold(scaffold_input)
    data = scaffold.data
    header = scaffold.header
    prot_start = header.index("Accession Number")
    bait_index = []
    ind = []
    for bait_row in bait:
        bait_name = bait_row[0]
        if bait_name in header:
            # Find just the baits defined in bait file
            bait_index.append(header.index(bait_name))
            ind.append(bait_name)
    frames = {}  # accession -> values taken from the selected bait columns
    for row in data:
        protein = row[prot_start].split()[0]
        frames.setdefault(protein, []).extend(row[j] for j in bait_index)
    df = pd.DataFrame(frames, index=ind)
    return df
|
|
def get_info(input1, input2, input3):
    """Collect the description and molecular weight of every protein.

    Reports are scanned in the order input1, input2, input3; the first
    report that mentions an accession supplies its values.

    input1, input2 -- paths of scaffold reports
    input3         -- path of a third report, or the string "False" to skip
    Returns a ProteinInfo whose ``mw`` and ``descr`` are single-row pandas
    DataFrames (rows "Molecular Weight" and "Identified Proteins") with one
    column per accession.
    """
    files = [input1, input2]
    if input3 != "False":
        files.append(input3)
    molwt = {}
    protdesc = {}
    for path in files:
        # Parse each report once; the original read it twice per file.
        scaffold = read_scaffold(path)
        data = scaffold.data
        header = scaffold.header
        prot_start = header.index("Accession Number")  # find header
        # Column positions; the last matching header field wins.
        # NOTE(review): both stay 0 when no header field matches, so
        # column 0 is used silently -- confirm this is intended.
        descr_col = 0
        mw_col = 0
        for position, title in enumerate(header):  # find info
            if "Identified" in title:
                descr_col = position
            if "Molecular Weight" in title:
                mw_col = position
        for row in data:  # append info
            protein = row[prot_start].split()[0]
            if protein not in protdesc:
                protdesc[protein] = row[descr_col]
                molwt[protein] = row[mw_col]
    mw = pd.DataFrame(molwt, index=["Molecular Weight"])
    descr = pd.DataFrame(protdesc, index=["Identified Proteins"])
    return ProteinInfo(mw, descr)  # return info as dataframe
|
|
def MakeFile(input1, input2, input3, bait_input, outfile=None):
    """Merge two or three scaffold reports and write them as one table.

    The written table has one row per accession, numbered from 1 in a
    column named "#". Its columns are the protein description, the
    molecular weight, the bait columns of each report, and finally an
    "Accession Number" column. Missing values are written as 0.

    input1, input2 -- paths of scaffold reports
    input3         -- path of a third report, or the string "False" to skip
    bait_input     -- path of the tab-delimited bait file
    outfile        -- path of the tab-separated output file; when omitted,
                      the module-level ``outfile`` is used
    Raises ValueError if no output filename is available.
    """
    if outfile is None:
        # Keep old callers working: they relied on the module-level name.
        outfile = globals().get("outfile")
    if outfile is None:
        raise ValueError("MakeFile requires an output filename")
    files = [input1, input2]
    if input3 != "False":
        files.append(input3)
    # Gather the annotation tables once instead of re-parsing every report.
    info = get_info(input1, input2, input3)
    DFs = [info.descr, info.mw]
    for path in files:
        DFs.append(MakeDF(path, bait_input))
    final_df = pd.concat(DFs)
    temp = final_df.T
    temp.index.name = "Accession Number"
    output = temp.fillna(0)
    # Index.get_values() was removed in pandas 1.0; .values works everywhere.
    output["Accession Number"] = output.index.values
    output.index = range(1, len(output.index) + 1)
    output.index.name = "#"
    output.to_csv(outfile, sep="\t")
|
|
125
|
|
126 #######################################################################################
|
|
# Script entry point: merge the reports named on the command line.
MakeFile(
    input1=infile1,
    input2=infile2,
    input3=infile3,
    bait_input=baitfile,
    outfile="merged.txt",
)
|
|
128 #######################################################################################
|