# HG changeset patch # User pravs # Date 1501782789 14400 # Node ID aa944e3a353cba921daefcc89e49640f4fc3d8ac # Parent f444e529363d4b9c1f32edd9893af6a73bf0ece0 planemo upload diff -r f444e529363d -r aa944e3a353c MSMS_Extractor.py --- a/MSMS_Extractor.py Thu Feb 16 11:56:45 2017 -0500 +++ b/MSMS_Extractor.py Thu Aug 03 13:53:09 2017 -0400 @@ -4,6 +4,7 @@ # University of Minnesota # # +# def main(): from pyteomics import mzml @@ -15,9 +16,15 @@ import pandas as pd from operator import itemgetter from itertools import groupby - if len(sys.argv) >= 5: + import random + + if len(sys.argv) >= 7: # Start of Reading Scans from PSM file # Creating dictionary of PSM file: key = filename key = list of scan numbers + + removeORretain = sys.argv[5].strip() + randomScans = int(sys.argv[6].strip()) + ScanFile = sys.argv[2] spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] @@ -34,7 +41,6 @@ outPath = sys.argv[3] ##outFile = sys.argv[3].split("/")[-1] allScanList = [] - # Read all scan numbers using indexedmzML/indexList/index/offset tags for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): if re.search("scan=(\d+)", k['idRef']): @@ -44,17 +50,28 @@ fraction_name = sys.argv[4] if scanDict.has_key(fraction_name): - scan2remove = scanDict[fraction_name] + scansInList = scanDict[fraction_name] else: - scan2remove = [] - scan2retain = list(set(allScanList) - set(scan2remove)) - scan2retain.sort() - scansRemoved = list(set(allScanList) - set(scan2retain)) - # scan2retain contains scans that is to be retained + scansInList = [] + scansNotInList = list(set(allScanList) - set(scansInList)) + if removeORretain == "remove": + scan2retain = scansNotInList + scan2retain.sort() + scansRemoved = scansInList + # scan2retain contains scans that is to be retained + + elif removeORretain == "retain": + # Randomly select spectra + random_scans = list(map(lambda _: random.choice(scansNotInList), range(randomScans))) + + scan2retain = random_scans + scansInList + scan2retain.sort() + scansRemoved = list(set(allScanList) - set(scan2retain)) + # scan2retain contains scans that is to be retained + # Print Stats print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) - print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove))) print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved) diff -r f444e529363d -r aa944e3a353c MSMS_Extractor.xml --- a/MSMS_Extractor.xml Thu Feb 16 11:56:45 2017 -0500 +++ b/MSMS_Extractor.xml Thu Aug 03 13:53:09 2017 -0400 @@ -1,11 +1,11 @@ - - Removes scans with identified PSMs from the mzML file(s). + + Extract scans based on PSM report from the mzML file(s). proteowizard pyteomics - + Input mzML File @@ -13,6 +13,17 @@ Input PSM Report File + + + + + + + + + + + @@ -21,6 +32,16 @@ -MSMS_Extractor reads scan numbers from the PSM report (scan numbers with identified PSM) and removes it from the mzML file. +MSMS_Extractor reads scan numbers from the PSM report (scan numbers with identified PSM) and gives option to create a new mzml file, either with those scans or without those scans. + +Remove option: +Creates a new mzml file with all the unidentified scans (removes those that are in the PSM report). + +Retain option: +Creates a new mzml file with only those scans that is present in the PSM report. In addition to this, it also has an option to add N number of randomly selected scans to the output mzml file. + + +Please Note: This tool currently works only with PeptideShaker generated PSM report file. +