|
0
|
1 """
|
|
|
2 Python-code: Merge Scaffold Samples Report files
|
|
|
3 @author = Brent Kuenzi
|
|
|
4 @email = Brent.Kuenzi@moffitt.org
|
|
|
5 """
|
|
|
6 #######################################################################################
|
|
|
7 import sys
|
|
|
8 import urllib2
|
|
|
9 import os.path
|
|
10
|
10 import pandas as pd
|
|
0
|
11 #######################################################################################
|
|
|
12 ## Description: ##
|
|
|
13 # This program will merge either 2 or 3 scaffold
|
|
|
14 # sample report files together
|
|
|
15 ## Required input: ##
|
|
|
# Positional command-line inputs (all five are required). Each input path is
# echoed to stdout as it is read. The single-argument print(...) form is used
# because it produces the same output on Python 2 and Python 3.
infile1 = sys.argv[1]  # scaffold report #1 -- filename
print(infile1)
infile2 = sys.argv[2]  # scaffold report #2 -- filename
print(infile2)
infile3 = sys.argv[3]  # scaffold report #3 -- filename or "False"
print(infile3)
baitfile = sys.argv[4]  # Bait file -- filename
print(baitfile)
outfile = sys.argv[5]  # output filename
|
|
|
class ScaffoldReturn(object):
    """Plain container for the three pieces extracted from a scaffold report.

    The constructor stores its arguments unchanged:

    data     -- value passed as ``getdata``
    proteins -- value passed as ``getproteins``
    header   -- value passed as ``getheader``
    """

    def __init__(self, getdata, getproteins, getheader):
        self.data = getdata
        self.proteins = getproteins
        self.header = getheader

    def __repr__(self):
        # Debug-friendly representation showing all three stored values.
        return "%s(data=%r, proteins=%r, header=%r)" % (
            type(self).__name__, self.data, self.proteins, self.header)
|
|
|
class ProteinInfo(object):
    """Plain container pairing molecular-weight and description values.

    The constructor stores its arguments unchanged:

    mw    -- value passed as ``getMW``
    descr -- value passed as ``getDescr``
    """

    def __init__(self, getMW, getDescr):
        self.mw = getMW
        self.descr = getDescr

    def __repr__(self):
        # Debug-friendly representation showing both stored values.
        return "%s(mw=%r, descr=%r)" % (
            type(self).__name__, self.mw, self.descr)
|
|
|
def readtab(infile):
    """Read a tab-delimited text file into a list of rows.

    Parameters:
        infile -- path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        strings obtained by stripping the line and splitting it on tabs.
        A blank line yields ``['']``; an empty file yields ``[]``.

    NOTE(review): ``strip()`` removes *all* leading/trailing whitespace,
    including tabs, so empty cells at the start or end of a line are dropped
    rather than kept as ``''``. Behaviour is preserved here; confirm whether
    input files can contain such cells.
    """
    with open(infile, 'r') as handle:
        return [line.strip().split('\t') for line in handle]
|
|
|
def read_scaffold(scaffold_input):
    """Get data, proteins and header from scaffold output.

    Parameters:
        scaffold_input -- path of the scaffold report, read with ``readtab``.

    Returns:
        ScaffoldReturn whose
        ``header``   is the first row containing the cell "Accession Number",
        ``data``     is every row after the header except the last two rows
                     of the file (blank line and END OF FILE), with the
                     accession cell reduced to its first whitespace-separated
                     token,
        ``proteins`` is the list of those accession tokens, one per data row.

    Raises:
        ValueError -- if no row contains an "Accession Number" cell.
    """
    rows = readtab(scaffold_input)

    # Locate the header row: the first row with a cell that is exactly
    # "Accession Number" (list membership, not substring match).
    header_start = None
    for idx, row in enumerate(rows):
        if "Accession Number" in row:
            header_start = idx
            break
    if header_start is None:
        # Previously this fell through and failed with an unbound local.
        raise ValueError(
            "no header row containing 'Accession Number' found in %s"
            % scaffold_input)

    header = rows[header_start]
    prot_start = header.index("Accession Number")
    data = rows[header_start + 1:len(rows) - 2]  # cut off blank line and END OF FILE

    proteins = []
    for row in data:
        # removes the (+##) that sometimes is attached
        row[prot_start] = row[prot_start].split()[0]
        proteins.append(row[prot_start])
    return ScaffoldReturn(data, proteins, header)
|
|
|
def MakeDF(scaffold_input, bait_input):
    """Build a DataFrame of bait-column values per protein for one report.

    Parameters:
        scaffold_input -- path of a scaffold report (parsed by read_scaffold).
        bait_input     -- path of the bait file (parsed by readtab); the first
                          cell of each row is treated as a bait name.

    Returns:
        pandas DataFrame with one column per protein accession and one row per
        bait name that also appears in the scaffold header, in bait-file order.

    NOTE(review): if the same accession appears on more than one data row its
    value list grows beyond the number of baits and the DataFrame constructor
    is expected to reject it -- confirm whether duplicates can occur.
    """
    bait = readtab(bait_input)

    # Parse the scaffold report once and reuse the result (it was previously
    # re-read from disk for each of data, header and proteins).
    scaffold = read_scaffold(scaffold_input)
    data = scaffold.data
    header = scaffold.header
    proteins = scaffold.proteins
    prot_start = header.index("Accession Number")

    bait_index = []
    ind = []
    for row in bait:
        if row[0] in header:
            bait_index.append(header.index(row[0]))  # Find just the baits defined in bait file
            ind.append(row[0])

    # create dictionary of proteins for each bait value
    frames = {}
    for name in proteins:
        frames.setdefault(name.split()[0], [])

    for row in data:
        protein = row[prot_start].split()[0]
        for j in bait_index:  # collect this row's value for every bait column
            frames[protein].append(row[j])

    return pd.DataFrame(frames, index=ind)
|
|
|
def get_info(input1, input2, input3):
    """Collect description and molecular weight per protein across reports.

    Parameters:
        input1, input2 -- paths of scaffold reports.
        input3         -- path of a third scaffold report, or the string
                          "False" to skip it.

    Returns:
        ProteinInfo where ``mw`` is a one-row DataFrame indexed
        "Molecular Weight" and ``descr`` is a one-row DataFrame indexed
        "Identified Proteins"; both have one column per protein accession.
        When a protein appears in several rows/files the first occurrence wins.
    """
    files = [input1, input2]
    if input3 != "False":
        files.append(input3)

    molwt = {}
    protdesc = {}
    for path in files:
        # Parse each report once (it was previously read twice per file).
        scaffold = read_scaffold(path)
        data = scaffold.data
        header = scaffold.header
        prot_start = header.index("Accession Number")  # find header

        # Column positions default to 0 when no header cell matches; when
        # several cells match, the last one is used.
        descr_col = 0
        mw_col = 0
        for col, title in enumerate(header):  # find info
            if "Identified" in title:
                descr_col = col
            if "Molecular Weight" in title:
                mw_col = col

        for row in data:  # append info
            protein = row[prot_start].split()[0]
            if protein not in protdesc:
                protdesc[protein] = row[descr_col]
                molwt[protein] = row[mw_col]

    mw = pd.DataFrame(molwt, index=["Molecular Weight"])
    descr = pd.DataFrame(protdesc, index=["Identified Proteins"])
    return ProteinInfo(mw, descr)  # return info as dataframe
|
|
|
def MakeFile(input1, input2, input3, bait_input, outfile=None):
    """Merge 2 or 3 scaffold reports and write them as one tab-separated file.

    Parameters:
        input1, input2 -- paths of scaffold reports.
        input3         -- path of a third scaffold report, or the string
                          "False" to skip it.
        bait_input     -- path of the bait file passed on to MakeDF.
        outfile        -- output path. When omitted (None) the module-level
                          ``outfile`` variable is used, which is what this
                          function always did before the parameter existed.

    The output has one row per protein accession, numbered from 1 in an index
    column named "#". Its columns are the "Identified Proteins" and
    "Molecular Weight" rows from get_info, the bait rows from each report's
    MakeDF frame, and a trailing "Accession Number" column. Missing values are
    filled with 0.
    """
    if outfile is None:
        # Fall back to the script-level output path for existing callers.
        outfile = globals()["outfile"]

    files = [input1, input2]
    if input3 != "False":
        files.append(input3)

    # Gather protein info once; both frames come from the same parse.
    info = get_info(input1, input2, input3)
    DFs = [info.descr, info.mw]
    for path in files:
        DFs.append(MakeDF(path, bait_input))

    final_df = pd.concat(DFs)
    temp = final_df.T  # proteins become rows
    temp.index.name = "Accession Number"
    output = temp.fillna(0)
    # Index.values replaces Index.get_values(), which newer pandas removed.
    output["Accession Number"] = output.index.values
    output.index = range(1, len(output.index) + 1)
    output.index.name = "#"
    output.to_csv(outfile, sep="\t")
|
|
|
128
|
|
|
129 #######################################################################################
|
|
|
# Run the merge with the command-line inputs. No output path is passed here:
# MakeFile writes to the module-level `outfile` (sys.argv[5]), and the previous
# `outfile="merged.txt"` keyword was not part of MakeFile's signature.
MakeFile(input1=infile1, input2=infile2, input3=infile3, bait_input=baitfile)
|
|
|
131 #######################################################################################
|