diff get_edge_data.py @ 0:7af33315bc5e draft

Uploaded
author crs4
date Mon, 09 Sep 2013 06:11:47 -0400
parents
children f77ce4f92b46
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_edge_data.py	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+import ftplib
+from ftplib import FTP
+import optparse
+import sys
+
+class GetData:
+
+    def __init__(self, gbkid, fnafile, pttfile, rntfile, gfffile):
+        self.gbkid = gbkid
+        self.fnafile = fnafile
+        self.pttfile = pttfile
+        self.rntfile = rntfile
+        self.gfffile = gfffile
+        self.ftpurl = 'ftp.ncbi.nlm.nih.gov'
+        self.folder = '/genomes/Bacteria/'
+
+    def getData(self):
+        """ """
+        strainName = self._getStrainName()
+        print strainName
+        if not strainName:
+            sys.exit("Unrecognized RefSeq Genomic Accession ID")
+        ftp = FTP(self.ftpurl)
+        ftp.login()
+        newDir = self.folder + strainName
+        ftp.cwd(newDir)
+        
+        directoryFiles = []
+        ftp.retrlines('NLST',  directoryFiles.append)
+        for fileName in directoryFiles:
+            try:
+                if '.fna' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.fnafile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                elif '.ptt' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.pttfile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                elif '.rnt' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.rntfile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                elif '.gff' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.gfffile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                #elif '.gbk' in fileName and self.gbkid in fileName:
+                #    print "downloading", fileName
+                #    with open(fileName, 'w') as outFile:
+                #       ftp.retrbinary("RETR " + fileName, outFile.write)
+            except:
+                pass
+
+    def _getStrainName(self):
+        """ """
+        ftp = FTP(self.ftpurl)
+        ftp.login()
+        ftp.cwd(self.folder)
+        
+        straindirectories = []
+        ftp.retrlines("NLST " , straindirectories.append)
+        #print "scanning directories..."
+        for strainName in straindirectories:
+            try:
+                newDir = self.folder + strainName
+                ftp.cwd(newDir)
+                strainFiles = []
+                ftp.retrlines('NLST',  strainFiles.append)
+                for element in strainFiles:
+                    if self.gbkid in element:
+                        return strainName
+            except:
+                pass
+        return None
+
+
+def __main__():
+    """ main function """
+    parser = optparse.OptionParser()
+    parser.add_option('-i', dest='gbkid', help='RefSeq Genomic Accession ID')
+    parser.add_option('--fna', dest='fnafile', help='Output FASTA file name')
+    parser.add_option('--ptt', dest='pttfile', help='Output PTT file name')
+    parser.add_option('--rnt', dest='rntfile', help='Output RNT file name')
+    parser.add_option('--gff', dest='gfffile', help='Output GFF file name')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    
+    S = GetData(options.gbkid, options.fnafile, options.pttfile, options.rntfile, options.gfffile)
+    S.getData()
+
+
+# Run the command-line entry point only when executed as a script,
+# not when the module is imported.
+if __name__ == "__main__":
+    __main__()