Mercurial > repos > pravs > msms_extractor
view MSMS_Extractor.py @ 0:093015b1b904 draft
planemo upload
author | pravs |
---|---|
date | Mon, 13 Feb 2017 02:44:05 -0500 |
parents | |
children | aa944e3a353c |
line wrap: on
line source
# # Developed by Praveen Kumar # Galaxy-P Team (Griffin's Lab) # University of Minnesota # # def main(): from pyteomics import mzml import os import sys import shutil import subprocess import re import pandas as pd from operator import itemgetter from itertools import groupby if len(sys.argv) >= 5: # Start of Reading Scans from PSM file # Creating dictionary of PSM file: key = filename key = list of scan numbers ScanFile = sys.argv[2] spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] scanDict = {} for each in scanFileNumber: if scanDict.has_key(each[0]): scanDict[each[0]].append(int(each[1])) else: scanDict[each[0]] = [int(each[1])] # End of Reading Scans from PSM file inputPath = sys.argv[1] ##outPath = "/".join(sys.argv[3].split("/")[:-1]) outPath = sys.argv[3] ##outFile = sys.argv[3].split("/")[-1] allScanList = [] # Read all scan numbers using indexedmzML/indexList/index/offset tags for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): if re.search("scan=(\d+)", k['idRef']): a = re.search("scan=(\d+)", k['idRef']) allScanList.append(int(a.group(1))) # End of Reading mzML file fraction_name = sys.argv[4] if scanDict.has_key(fraction_name): scan2remove = scanDict[fraction_name] else: scan2remove = [] scan2retain = list(set(allScanList) - set(scan2remove)) scan2retain.sort() scansRemoved = list(set(allScanList) - set(scan2retain)) # scan2retain contains scans that is to be retained # Print Stats print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove))) print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved) # Identifying groups of continuous numbers in the scan2retain and creating scanString scanString = "" for a, b in groupby(enumerate(scan2retain), lambda(i,x):i-x): x = map(itemgetter(1), b) scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] " # end identifying # start create filter file filter_file = open("filter.txt", "w") filter_file.write("filter=scanNumber %s\n" % scanString) filter_file.close() # end create filter file # Prepare command for msconvert inputFile = fraction_name+".mzML" os.symlink(inputPath,inputFile) outFile = "filtered_"+fraction_name+".mzML" # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib" msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib" # Run msconvert try: subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError as e: sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output )) sys.exit(e.returncode) # Copy output to shutil.copyfile(outFile, outPath) else: print "Please contact the admin. Number of inputs are not sufficient to run the program.\n" if __name__ == "__main__": main()