annotate MSMS_Extractor.py @ 5:c2f8e3164537 draft

planemo upload
author pravs
date Thu, 03 Aug 2017 18:17:59 -0400
parents aa944e3a353c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
1 #
093015b1b904 planemo upload
pravs
parents:
diff changeset
2 # Developed by Praveen Kumar
093015b1b904 planemo upload
pravs
parents:
diff changeset
3 # Galaxy-P Team (Griffin's Lab)
093015b1b904 planemo upload
pravs
parents:
diff changeset
4 # University of Minnesota
093015b1b904 planemo upload
pravs
parents:
diff changeset
5 #
093015b1b904 planemo upload
pravs
parents:
diff changeset
6 #
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
7 #
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
8
093015b1b904 planemo upload
pravs
parents:
diff changeset
9 def main():
093015b1b904 planemo upload
pravs
parents:
diff changeset
10 from pyteomics import mzml
093015b1b904 planemo upload
pravs
parents:
diff changeset
11 import os
093015b1b904 planemo upload
pravs
parents:
diff changeset
12 import sys
093015b1b904 planemo upload
pravs
parents:
diff changeset
13 import shutil
093015b1b904 planemo upload
pravs
parents:
diff changeset
14 import subprocess
093015b1b904 planemo upload
pravs
parents:
diff changeset
15 import re
093015b1b904 planemo upload
pravs
parents:
diff changeset
16 import pandas as pd
093015b1b904 planemo upload
pravs
parents:
diff changeset
17 from operator import itemgetter
093015b1b904 planemo upload
pravs
parents:
diff changeset
18 from itertools import groupby
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
19 import random
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
20
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
21 if len(sys.argv) >= 7:
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
22 # Start of Reading Scans from PSM file
093015b1b904 planemo upload
pravs
parents:
diff changeset
23 # Creating dictionary of PSM file: key = filename key = list of scan numbers
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
24
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
25 removeORretain = sys.argv[5].strip()
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
26 randomScans = int(sys.argv[6].strip())
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
27
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
28 ScanFile = sys.argv[2]
093015b1b904 planemo upload
pravs
parents:
diff changeset
29 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
093015b1b904 planemo upload
pravs
parents:
diff changeset
30 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
093015b1b904 planemo upload
pravs
parents:
diff changeset
31 scanDict = {}
093015b1b904 planemo upload
pravs
parents:
diff changeset
32 for each in scanFileNumber:
093015b1b904 planemo upload
pravs
parents:
diff changeset
33 if scanDict.has_key(each[0]):
093015b1b904 planemo upload
pravs
parents:
diff changeset
34 scanDict[each[0]].append(int(each[1]))
093015b1b904 planemo upload
pravs
parents:
diff changeset
35 else:
093015b1b904 planemo upload
pravs
parents:
diff changeset
36 scanDict[each[0]] = [int(each[1])]
093015b1b904 planemo upload
pravs
parents:
diff changeset
37 # End of Reading Scans from PSM file
093015b1b904 planemo upload
pravs
parents:
diff changeset
38
093015b1b904 planemo upload
pravs
parents:
diff changeset
39 inputPath = sys.argv[1]
093015b1b904 planemo upload
pravs
parents:
diff changeset
40 ##outPath = "/".join(sys.argv[3].split("/")[:-1])
093015b1b904 planemo upload
pravs
parents:
diff changeset
41 outPath = sys.argv[3]
093015b1b904 planemo upload
pravs
parents:
diff changeset
42 ##outFile = sys.argv[3].split("/")[-1]
093015b1b904 planemo upload
pravs
parents:
diff changeset
43 allScanList = []
093015b1b904 planemo upload
pravs
parents:
diff changeset
44 # Read all scan numbers using indexedmzML/indexList/index/offset tags
093015b1b904 planemo upload
pravs
parents:
diff changeset
45 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
093015b1b904 planemo upload
pravs
parents:
diff changeset
46 if re.search("scan=(\d+)", k['idRef']):
093015b1b904 planemo upload
pravs
parents:
diff changeset
47 a = re.search("scan=(\d+)", k['idRef'])
093015b1b904 planemo upload
pravs
parents:
diff changeset
48 allScanList.append(int(a.group(1)))
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
49 allScanList = list(set(allScanList))
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
50 # End of Reading mzML file
093015b1b904 planemo upload
pravs
parents:
diff changeset
51
093015b1b904 planemo upload
pravs
parents:
diff changeset
52 fraction_name = sys.argv[4]
093015b1b904 planemo upload
pravs
parents:
diff changeset
53 if scanDict.has_key(fraction_name):
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
54 scansInList = scanDict[fraction_name]
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
55 else:
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
56 scansInList = []
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
57 scansNotInList = list(set(allScanList) - set(scansInList))
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
58 flag = 0
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
59 if removeORretain == "remove":
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
60 scan2retain = scansNotInList
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
61 scan2retain = list(set(scan2retain))
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
62 scan2retain.sort()
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
63 scansRemoved = scansInList
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
64 # scan2retain contains scans that is to be retained
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
65 elif removeORretain == "retain" and randomScans < len(scansNotInList):
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
66 # Randomly select spectra
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
67 random_scans = random.sample(scansNotInList, randomScans)
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
68
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
69 scan2retain = random_scans + scansInList
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
70 scan2retain = list(set(scan2retain))
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
71 scan2retain.sort()
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
72 scansRemoved = list(set(allScanList) - set(scan2retain))
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
73 # scan2retain contains scans that is to be retained
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
74 else:
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
75 flag = 1
2
aa944e3a353c planemo upload
pravs
parents: 0
diff changeset
76
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
77 if flag == 1:
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
78 scan2retain = scansInList
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
79 scan2retain = list(set(scan2retain))
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
80 scan2retain.sort()
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
81 scansRemoved = list(set(allScanList) - set(scan2retain))
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
82
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
83 # scan2retain contains scans that is to be retained
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
84 print >> sys.stdout,"ERROR: Number of Random Scans queried is more than available. The result has provided zero random scans."
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
85 print >> sys.stdout,"Number of available scans for random selection: %d" % len(scansNotInList)
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
86 print >> sys.stdout,"Try a number less than the available number. Thanks!!"
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
87 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
88 else:
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
89 # Print Stats
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
90 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList)))
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
91 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
92 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
93
093015b1b904 planemo upload
pravs
parents:
diff changeset
94
093015b1b904 planemo upload
pravs
parents:
diff changeset
95 # Identifying groups of continuous numbers in the scan2retain and creating scanString
093015b1b904 planemo upload
pravs
parents:
diff changeset
96 scanString = ""
093015b1b904 planemo upload
pravs
parents:
diff changeset
97 for a, b in groupby(enumerate(scan2retain), lambda(i,x):i-x):
093015b1b904 planemo upload
pravs
parents:
diff changeset
98 x = map(itemgetter(1), b)
093015b1b904 planemo upload
pravs
parents:
diff changeset
99 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] "
093015b1b904 planemo upload
pravs
parents:
diff changeset
100 # end identifying
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
101
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
102 # start create filter file
093015b1b904 planemo upload
pravs
parents:
diff changeset
103 filter_file = open("filter.txt", "w")
093015b1b904 planemo upload
pravs
parents:
diff changeset
104 filter_file.write("filter=scanNumber %s\n" % scanString)
093015b1b904 planemo upload
pravs
parents:
diff changeset
105 filter_file.close()
093015b1b904 planemo upload
pravs
parents:
diff changeset
106 # end create filter file
5
c2f8e3164537 planemo upload
pravs
parents: 2
diff changeset
107
0
093015b1b904 planemo upload
pravs
parents:
diff changeset
108 # Prepare command for msconvert
093015b1b904 planemo upload
pravs
parents:
diff changeset
109 inputFile = fraction_name+".mzML"
093015b1b904 planemo upload
pravs
parents:
diff changeset
110 os.symlink(inputPath,inputFile)
093015b1b904 planemo upload
pravs
parents:
diff changeset
111 outFile = "filtered_"+fraction_name+".mzML"
093015b1b904 planemo upload
pravs
parents:
diff changeset
112 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib"
093015b1b904 planemo upload
pravs
parents:
diff changeset
113 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib"
093015b1b904 planemo upload
pravs
parents:
diff changeset
114
093015b1b904 planemo upload
pravs
parents:
diff changeset
115
093015b1b904 planemo upload
pravs
parents:
diff changeset
116 # Run msconvert
093015b1b904 planemo upload
pravs
parents:
diff changeset
117 try:
093015b1b904 planemo upload
pravs
parents:
diff changeset
118 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True)
093015b1b904 planemo upload
pravs
parents:
diff changeset
119 except subprocess.CalledProcessError as e:
093015b1b904 planemo upload
pravs
parents:
diff changeset
120 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output ))
093015b1b904 planemo upload
pravs
parents:
diff changeset
121 sys.exit(e.returncode)
093015b1b904 planemo upload
pravs
parents:
diff changeset
122 # Copy output to
093015b1b904 planemo upload
pravs
parents:
diff changeset
123 shutil.copyfile(outFile, outPath)
093015b1b904 planemo upload
pravs
parents:
diff changeset
124 else:
093015b1b904 planemo upload
pravs
parents:
diff changeset
125 print "Please contact the admin. Number of inputs are not sufficient to run the program.\n"
093015b1b904 planemo upload
pravs
parents:
diff changeset
126
093015b1b904 planemo upload
pravs
parents:
diff changeset
127 if __name__ == "__main__":
093015b1b904 planemo upload
pravs
parents:
diff changeset
128 main()
093015b1b904 planemo upload
pravs
parents:
diff changeset
129
093015b1b904 planemo upload
pravs
parents:
diff changeset
130
093015b1b904 planemo upload
pravs
parents:
diff changeset
131
093015b1b904 planemo upload
pravs
parents:
diff changeset
132