view get_edge_data.py @ 4:d5464c9e1723 draft

Add support for paired collection of FASTQ (thanks to Inge Alexander Raknes).
author crs4
date Tue, 17 Mar 2015 10:44:33 -0400
parents f77ce4f92b46
children 407b894abb08
line wrap: on
line source

# -*- coding: utf-8 -*-

import optparse
import sys
from ftplib import FTP, all_errors

class GetData(object):
    """Fetch the NCBI annotation files that belong to one RefSeq accession.

    Instantiating the class performs the whole job: it looks up the strain
    directory under ``/genomes/Bacteria/`` on ``ftp.ncbi.nlm.nih.gov`` that
    contains a file named after ``gbkid`` and downloads the matching
    ``.fna``, ``.ptt``, ``.rnt`` and ``.gff`` files to the given output paths.

    Args:
        gbkid: RefSeq Genomic Accession ID to look for in remote file names.
        fnafile: Local path receiving the ``.fna`` file (None skips it).
        pttfile: Local path receiving the ``.ptt`` file (None skips it).
        rntfile: Local path receiving the ``.rnt`` file (None skips it).
        gfffile: Local path receiving the ``.gff`` file (None skips it).

    Raises:
        SystemExit: If no strain directory contains a file for ``gbkid``.
    """

    # (marker searched in the remote file name, attribute holding the local
    # output path). Order matters: the first marker found decides the target.
    _OUTPUTS = (
        ('.fna', 'fnafile'),
        ('.ptt', 'pttfile'),
        ('.rnt', 'rntfile'),
        ('.gff', 'gfffile'),
    )

    def __init__(self, gbkid, fnafile, pttfile, rntfile, gfffile):
        self.gbkid = gbkid
        self.fnafile = fnafile
        self.pttfile = pttfile
        self.rntfile = rntfile
        self.gfffile = gfffile
        self.ftpurl = 'ftp.ncbi.nlm.nih.gov'
        self.folder = '/genomes/Bacteria/'
        strainName = self._getStrainName()
        if not strainName:
            sys.exit("Unrecognized RefSeq Genomic Accession ID")
        # Single-argument call form prints the same text on Python 2 and 3.
        print(strainName)
        ftp = FTP(self.ftpurl)
        try:
            ftp.login()
            ftp.cwd(self.folder + strainName)

            directoryFiles = []
            ftp.retrlines('NLST', directoryFiles.append)
            for fileName in directoryFiles:
                outPath = self._outputFor(fileName)
                if outPath is None:
                    continue
                try:
                    # retrbinary delivers raw bytes, so the file must be binary.
                    with open(outPath, 'wb') as outFile:
                        ftp.retrbinary("RETR " + fileName, outFile.write)
                except all_errors as err:
                    # Downloads stay best-effort (one failure does not stop the
                    # others), but the failure is reported instead of hidden.
                    # Reported on stdout, like the strain name above.
                    print("Could not download %s: %s" % (fileName, err))
        finally:
            ftp.close()

    def _outputFor(self, fileName):
        """Return the local path a remote file should be saved to, or None.

        None is returned when the name does not contain the accession ID, when
        it carries none of the known markers, or when the caller supplied no
        output path for the matching marker.
        """
        if self.gbkid not in fileName:
            return None
        for marker, attribute in self._OUTPUTS:
            if marker in fileName:
                return getattr(self, attribute)
        return None

    def _getStrainName(self):
        """Return the strain directory holding a file for ``self.gbkid``.

        Every entry listed under ``self.folder`` is entered in turn and its
        file names are searched for the accession ID. Entries that cannot be
        entered or listed are skipped.

        Returns:
            The name of the first matching directory, or None when the
            accession ID is empty or no directory contains a matching file.
        """
        # An empty ID is a substring of every name and None cannot be searched
        # for at all, so neither can identify a strain.
        if not self.gbkid:
            return None
        ftp = FTP(self.ftpurl)
        try:
            ftp.login()
            ftp.cwd(self.folder)

            straindirectories = []
            ftp.retrlines('NLST', straindirectories.append)
            for strainName in straindirectories:
                try:
                    ftp.cwd(self.folder + strainName)
                    strainFiles = []
                    ftp.retrlines('NLST', strainFiles.append)
                except all_errors:
                    # Not a directory, or not listable: try the next entry.
                    continue
                for element in strainFiles:
                    if self.gbkid in element:
                        return strainName
            return None
        finally:
            ftp.close()


def __main__():
    """Parse the command line and download the requested annotation files.

    Options:
        -i     RefSeq Genomic Accession ID (required).
        --fna  Output FASTA file name.
        --ptt  Output PTT file name.
        --rnt  Output RNT file name.
        --gff  Output GFF file name.

    Exits through ``parser.error`` when positional arguments are given or
    when the accession ID is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', dest='gbkid', help='RefSeq Genomic Accession ID')
    parser.add_option('--fna', dest='fnafile', help='Output FASTA file name')
    parser.add_option('--ptt', dest='pttfile', help='Output PTT file name')
    parser.add_option('--rnt', dest='rntfile', help='Output RNT file name')
    parser.add_option('--gff', dest='gfffile', help='Output GFF file name')
    (options, args) = parser.parse_args()
    if args:
        parser.error('Wrong number of arguments')
    # Without an accession ID there is nothing to search for; fail up front
    # rather than handing None to GetData.
    if not options.gbkid:
        parser.error('Missing RefSeq Genomic Accession ID (-i)')

    GetData(options.gbkid, options.fnafile, options.pttfile, options.rntfile, options.gfffile)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()