annotate get_edge_data.py @ 4:d5464c9e1723 draft

Add support for paired collection of FASTQ (thanks to Inge Alexander Raknes).
author crs4
date Tue, 17 Mar 2015 10:44:33 -0400
parents f77ce4f92b46
children 407b894abb08
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7af33315bc5e Uploaded
crs4
parents:
diff changeset
1 # -*- coding: utf-8 -*-
7af33315bc5e Uploaded
crs4
parents:
diff changeset
2
7af33315bc5e Uploaded
crs4
parents:
diff changeset
3 from ftplib import FTP
7af33315bc5e Uploaded
crs4
parents:
diff changeset
4 import optparse
7af33315bc5e Uploaded
crs4
parents:
diff changeset
5 import sys
7af33315bc5e Uploaded
crs4
parents:
diff changeset
6
1
f77ce4f92b46 Use $GALAXY_SLOTS instead of $EDGE_PRO_SITE_OPTIONS. Add dependency on bowtie2. Add readme.rst .
crs4
parents: 0
diff changeset
class GetData(object):
    """Download the reference files for one RefSeq accession from the NCBI FTP site.

    Instantiating the class does all the work: it looks up which strain
    directory under ``self.folder`` contains a file whose name includes the
    accession ID, then saves the matching ``.fna``, ``.ptt``, ``.rnt`` and
    ``.gff`` files to the output paths given to the constructor.
    """

    def __init__(self, gbkid, fnafile, pttfile, rntfile, gfffile):
        """Locate the strain directory for *gbkid* and download its files.

        Args:
            gbkid: RefSeq Genomic Accession ID; matched as a substring of the
                remote file names.
            fnafile: Local path that receives the remote ``.fna`` file.
            pttfile: Local path that receives the remote ``.ptt`` file.
            rntfile: Local path that receives the remote ``.rnt`` file.
            gfffile: Local path that receives the remote ``.gff`` file.

        Raises:
            SystemExit: If no strain directory contains a file matching
                *gbkid*.
        """
        self.gbkid = gbkid
        self.fnafile = fnafile
        self.pttfile = pttfile
        self.rntfile = rntfile
        self.gfffile = gfffile
        self.ftpurl = 'ftp.ncbi.nlm.nih.gov'
        # NOTE(review): NCBI may have reorganised its genomes area since this
        # was written - confirm this directory is still served.
        self.folder = '/genomes/Bacteria/'
        strainName = self._getStrainName()
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(strainName)
        if not strainName:
            sys.exit("Unrecognized RefSeq Genomic Accession ID")
        self._download(strainName)

    def _download(self, strainName):
        """Save every file in the strain directory that matches the accession ID."""
        from ftplib import all_errors

        # Checked in this order; the first extension found in a name wins.
        targets = (
            ('.fna', self.fnafile),
            ('.ptt', self.pttfile),
            ('.rnt', self.rntfile),
            ('.gff', self.gfffile),
        )
        ftp = FTP(self.ftpurl)
        try:
            ftp.login()
            ftp.cwd(self.folder + strainName)

            directoryFiles = []
            ftp.retrlines('NLST', directoryFiles.append)
            for fileName in directoryFiles:
                if self.gbkid not in fileName:
                    continue
                for extension, outPath in targets:
                    if extension in fileName:
                        break
                else:
                    # Not one of the file types this tool collects.
                    continue
                if not outPath:
                    # No output path was supplied for this file type.
                    continue
                try:
                    # retrbinary hands over raw bytes, so the file must be
                    # opened in binary mode.
                    with open(outPath, 'wb') as outFile:
                        ftp.retrbinary("RETR " + fileName, outFile.write)
                except all_errors as err:
                    # Best effort: one failed transfer must not prevent the
                    # remaining files from being fetched.
                    print("Could not download %s: %s" % (fileName, err))
        finally:
            ftp.close()

    def _getStrainName(self):
        """Return the strain directory holding a file that matches the accession ID.

        Each entry listed under ``self.folder`` is entered in turn and its
        file names are searched for ``self.gbkid``. Entries that cannot be
        entered or listed are skipped.

        Returns:
            The name of the first matching directory, or ``None`` when the
            accession ID is empty or nothing matches.
        """
        from ftplib import all_errors

        if not self.gbkid:
            # An empty ID is a substring of every name and would wrongly
            # select the first directory; treat it as "not found".
            return None

        ftp = FTP(self.ftpurl)
        try:
            ftp.login()
            ftp.cwd(self.folder)

            straindirectories = []
            ftp.retrlines("NLST ", straindirectories.append)
            for strainName in straindirectories:
                strainFiles = []
                try:
                    ftp.cwd(self.folder + strainName)
                    ftp.retrlines('NLST', strainFiles.append)
                except all_errors:
                    # The entry could not be entered or listed; move on.
                    continue
                if any(self.gbkid in element for element in strainFiles):
                    return strainName
            return None
        finally:
            ftp.close()
7af33315bc5e Uploaded
crs4
parents:
diff changeset
74
7af33315bc5e Uploaded
crs4
parents:
diff changeset
75
7af33315bc5e Uploaded
crs4
parents:
diff changeset
def __main__():
    """Parse the command line and download the files for the requested accession.

    Options:
        -i     RefSeq Genomic Accession ID (required).
        --fna  Output FASTA file name.
        --ptt  Output PTT file name.
        --rnt  Output RNT file name.
        --gff  Output GFF file name.

    Exits through ``parser.error`` when positional arguments are supplied or
    when the accession ID is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', dest='gbkid', help='RefSeq Genomic Accession ID')
    parser.add_option('--fna', dest='fnafile', help='Output FASTA file name')
    parser.add_option('--ptt', dest='pttfile', help='Output PTT file name')
    parser.add_option('--rnt', dest='rntfile', help='Output RNT file name')
    parser.add_option('--gff', dest='gfffile', help='Output GFF file name')
    (options, args) = parser.parse_args()
    if args:
        parser.error('Wrong number of arguments')
    if not options.gbkid:
        # Fail fast instead of scanning the FTP site for an ID that was
        # never given.
        parser.error('Missing RefSeq Genomic Accession ID (-i)')

    GetData(options.gbkid, options.fnafile, options.pttfile, options.rntfile, options.gfffile)
0
7af33315bc5e Uploaded
crs4
parents:
diff changeset
88
7af33315bc5e Uploaded
crs4
parents:
diff changeset
89
7af33315bc5e Uploaded
crs4
parents:
diff changeset
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    __main__()