comparison MSMS_Extractor.py @ 0:093015b1b904 draft

planemo upload
author pravs
date Mon, 13 Feb 2017 02:44:05 -0500
parents
children aa944e3a353c
comparison
equal deleted inserted replaced
-1:000000000000 0:093015b1b904
1 #
2 # Developed by Praveen Kumar
3 # Galaxy-P Team (Griffin's Lab)
4 # University of Minnesota
5 #
6 #
7
8 def main():
9 from pyteomics import mzml
10 import os
11 import sys
12 import shutil
13 import subprocess
14 import re
15 import pandas as pd
16 from operator import itemgetter
17 from itertools import groupby
18 if len(sys.argv) >= 5:
19 # Start of Reading Scans from PSM file
20 # Creating dictionary of PSM file: key = filename key = list of scan numbers
21 ScanFile = sys.argv[2]
22 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
23 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
24 scanDict = {}
25 for each in scanFileNumber:
26 if scanDict.has_key(each[0]):
27 scanDict[each[0]].append(int(each[1]))
28 else:
29 scanDict[each[0]] = [int(each[1])]
30 # End of Reading Scans from PSM file
31
32 inputPath = sys.argv[1]
33 ##outPath = "/".join(sys.argv[3].split("/")[:-1])
34 outPath = sys.argv[3]
35 ##outFile = sys.argv[3].split("/")[-1]
36 allScanList = []
37
38 # Read all scan numbers using indexedmzML/indexList/index/offset tags
39 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
40 if re.search("scan=(\d+)", k['idRef']):
41 a = re.search("scan=(\d+)", k['idRef'])
42 allScanList.append(int(a.group(1)))
43 # End of Reading mzML file
44
45 fraction_name = sys.argv[4]
46 if scanDict.has_key(fraction_name):
47 scan2remove = scanDict[fraction_name]
48 else:
49 scan2remove = []
50 scan2retain = list(set(allScanList) - set(scan2remove))
51 scan2retain.sort()
52 scansRemoved = list(set(allScanList) - set(scan2retain))
53 # scan2retain contains scans that is to be retained
54
55 # Print Stats
56 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList)))
57 print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove)))
58 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
59 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)
60
61
62 # Identifying groups of continuous numbers in the scan2retain and creating scanString
63 scanString = ""
64 for a, b in groupby(enumerate(scan2retain), lambda(i,x):i-x):
65 x = map(itemgetter(1), b)
66 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] "
67 # end identifying
68
69 # start create filter file
70 filter_file = open("filter.txt", "w")
71 filter_file.write("filter=scanNumber %s\n" % scanString)
72 filter_file.close()
73 # end create filter file
74
75 # Prepare command for msconvert
76 inputFile = fraction_name+".mzML"
77 os.symlink(inputPath,inputFile)
78 outFile = "filtered_"+fraction_name+".mzML"
79 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib"
80 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib"
81
82
83 # Run msconvert
84 try:
85 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True)
86 except subprocess.CalledProcessError as e:
87 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output ))
88 sys.exit(e.returncode)
89 # Copy output to
90 shutil.copyfile(outFile, outPath)
91
92 else:
93 print "Please contact the admin. Number of inputs are not sufficient to run the program.\n"
94
95 if __name__ == "__main__":
96 main()
97
98
99
100