Mercurial > repos > bornea > filter_fasta
view filter_fasta.py @ 1:b25b0f210e10 draft
Uploaded
author | bornea |
---|---|
date | Thu, 04 Aug 2016 12:08:49 -0400 |
parents | |
children | af454e5a9ef5 |
line wrap: on
line source
# -*- coding: utf-8 -*- """ Python-code: Merge Scaffold Samples Report files @author = Brent Kuenzi @email = Brent.Kuenzi@moffitt.org """ ####################################################################################### ## Description: ## # This program will filter a fasta file using a data file # containing an "Accession" or "Accession Number" column ## Required input: ## # 1) fasta file to be filtered # 2) data file containing a "Accession" or "Accession Number" column import sys import os def readtab(infile): # read in tab-delim text with open(infile,'r') as x: output = [] for line in x: line = line.strip() temp = line.split('\t') output.append(temp) return output def getAccessions(infile): # get list of protein accessions from your data data = readtab(infile) cnt = 0 header_start = 0 prot_start = 0 for i in data: if "Accession" in i: # finds the start of header header_start = cnt break cnt += 1 header = data[header_start] if "Accession Number" in header: prot_start = header.index("Accession Number") elif "Accession" in header: prot_start = header.index("Accession") proteins = [] for protein in data[1:]: proteins.append(protein[prot_start]) return proteins def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID input_data = readtab(infile) seq=[] header=[] temp=[] flag = 0 cnt = 0 for i in input_data: cnt+=1 if flag == 1: # once we have a hit, start adding the sequences if ">" not in i[0]: # don't add the headers to the sequence temp.append(i[0]) #print temp if i[0].startswith(">"): # is it a fasta header? if temp != []: # if it is a continued fasta header, add old sequences to the sequence list # will this cutoff the last on of the file? merged = "\n".join(temp) if merged!="": seq.append(merged) temp=[] lst = i[0].split('|') ID1 = lst[1] lst2 = lst[2].split(' ') ID2 = lst2[0] flag = 0 if ID1 in accession: # are the IDs part of your data? flag+=1 header.append(i[0]) # if it is then append it if ID2 in accession: # are the IDs part of your data? flag+=1 header.append(i[0]) # if it is then append it if cnt == len(input_data): # account for last fasta sequence in file if temp != []: merged = "\n".join(temp) if merged!="": seq.append(merged) cnt=0 with open("output.fasta","wt") as x: for i in header: x.write(i+'\n'+seq[cnt]+'\n') cnt+=1 fasta = sys.argv[1] # fasta file to filter data = sys.argv[2] # scaffold report #2 -- filename FilterFastaSeq(fasta,getAccessions(data)) os.rename("output.fasta", sys.argv[3])