comparison MSMS_Extractor.py @ 2:aa944e3a353c draft

planemo upload
author pravs
date Thu, 03 Aug 2017 13:53:09 -0400
parents 093015b1b904
children c2f8e3164537
comparison
equal deleted inserted replaced
1:f444e529363d 2:aa944e3a353c
1 # 1 #
2 # Developed by Praveen Kumar 2 # Developed by Praveen Kumar
3 # Galaxy-P Team (Griffin's Lab) 3 # Galaxy-P Team (Griffin's Lab)
4 # University of Minnesota 4 # University of Minnesota
5 #
5 # 6 #
6 # 7 #
7 8
8 def main(): 9 def main():
9 from pyteomics import mzml 10 from pyteomics import mzml
13 import subprocess 14 import subprocess
14 import re 15 import re
15 import pandas as pd 16 import pandas as pd
16 from operator import itemgetter 17 from operator import itemgetter
17 from itertools import groupby 18 from itertools import groupby
18 if len(sys.argv) >= 5: 19 import random
20
21 if len(sys.argv) >= 7:
19 # Start of Reading Scans from PSM file 22 # Start of Reading Scans from PSM file
20 # Creating dictionary of PSM file: key = filename, value = list of scan numbers 23 # Creating dictionary of PSM file: key = filename, value = list of scan numbers
24
25 removeORretain = sys.argv[5].strip()
26 randomScans = int(sys.argv[6].strip())
27
21 ScanFile = sys.argv[2] 28 ScanFile = sys.argv[2]
22 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) 29 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
23 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] 30 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
24 scanDict = {} 31 scanDict = {}
25 for each in scanFileNumber: 32 for each in scanFileNumber:
32 inputPath = sys.argv[1] 39 inputPath = sys.argv[1]
33 ##outPath = "/".join(sys.argv[3].split("/")[:-1]) 40 ##outPath = "/".join(sys.argv[3].split("/")[:-1])
34 outPath = sys.argv[3] 41 outPath = sys.argv[3]
35 ##outFile = sys.argv[3].split("/")[-1] 42 ##outFile = sys.argv[3].split("/")[-1]
36 allScanList = [] 43 allScanList = []
37
38 # Read all scan numbers using indexedmzML/indexList/index/offset tags 44 # Read all scan numbers using indexedmzML/indexList/index/offset tags
39 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): 45 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
40 if re.search("scan=(\d+)", k['idRef']): 46 if re.search("scan=(\d+)", k['idRef']):
41 a = re.search("scan=(\d+)", k['idRef']) 47 a = re.search("scan=(\d+)", k['idRef'])
42 allScanList.append(int(a.group(1))) 48 allScanList.append(int(a.group(1)))
43 # End of Reading mzML file 49 # End of Reading mzML file
44 50
45 fraction_name = sys.argv[4] 51 fraction_name = sys.argv[4]
46 if scanDict.has_key(fraction_name): 52 if scanDict.has_key(fraction_name):
47 scan2remove = scanDict[fraction_name] 53 scansInList = scanDict[fraction_name]
48 else: 54 else:
49 scan2remove = [] 55 scansInList = []
50 scan2retain = list(set(allScanList) - set(scan2remove)) 56 scansNotInList = list(set(allScanList) - set(scansInList))
51 scan2retain.sort()
52 scansRemoved = list(set(allScanList) - set(scan2retain))
53 # scan2retain contains the scans that are to be retained
54 57
58 if removeORretain == "remove":
59 scan2retain = scansNotInList
60 scan2retain.sort()
61 scansRemoved = scansInList
62 # scan2retain contains the scans that are to be retained
63
64 elif removeORretain == "retain":
65 # Randomly select spectra
66 random_scans = list(map(lambda _: random.choice(scansNotInList), range(randomScans)))
67
68 scan2retain = random_scans + scansInList
69 scan2retain.sort()
70 scansRemoved = list(set(allScanList) - set(scan2retain))
71 # scan2retain contains the scans that are to be retained
72
55 # Print Stats 73 # Print Stats
56 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) 74 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList)))
57 print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove)))
58 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) 75 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
59 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved) 76 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)
60 77
61 78
62 # Identifying groups of continuous numbers in the scan2retain and creating scanString 79 # Identifying groups of continuous numbers in the scan2retain and creating scanString