comparison env/lib/python3.9/site-packages/galaxy/util/checkers.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 import bz2
2 import gzip
3 import re
4 import tarfile
5 import zipfile
6 from io import (
7 BytesIO,
8 StringIO
9 )
10
11 from galaxy import util
12 from galaxy.util.image_util import image_type
13
# Maximum number of lines inspected when looking for HTML markup.
HTML_CHECK_LINES = 100
# Amount of data sampled per read: 32 KiB.
CHUNK_SIZE = 1 << 15
# Case-insensitive patterns for tags that mark content as HTML.
HTML_REGEXPS = tuple(
    re.compile(pattern, re.I)
    for pattern in (
        r"<A\s+[^>]*HREF[^>]+>",
        r"<IFRAME[^>]*>",
        r"<FRAMESET[^>]*>",
        r"<META[\W][^>]*>",
        r"<SCRIPT[^>]*>",
    )
)
23
24
def check_html(name, file_path=True):
    """
    Returns True if the file/string contains HTML code.

    When ``file_path`` is true, ``name`` is a path that is opened as UTF-8
    text; otherwise ``name`` is the content itself and is converted with
    ``util.unicodify``. At most ``HTML_CHECK_LINES`` lines are examined, each
    read being capped at ``CHUNK_SIZE`` characters. Content that cannot be
    decoded is reported as not being HTML.
    """
    if file_path:
        stream = open(name, encoding='utf-8')
    else:
        stream = StringIO(util.unicodify(name))
    with stream:
        try:
            lines_seen = 0
            while lines_seen < HTML_CHECK_LINES:
                text = stream.readline(CHUNK_SIZE)
                if not text:
                    # End of input reached before the line budget ran out.
                    break
                for pattern in HTML_REGEXPS:
                    if pattern.search(text):
                        return True
                lines_seen += 1
        except UnicodeDecodeError:
            return False
    return False
46
47
def check_binary(name, file_path=True):
    """
    Return the ``util.is_binary`` verdict for the first 1024 bytes of the input.

    When ``file_path`` is true, ``name`` is a path opened in binary mode;
    otherwise ``name`` is the bytes content itself.
    """
    source = open(name, "rb") if file_path else BytesIO(name)
    with source:
        head = source.read(1024)
        return util.is_binary(head)
58
59
def check_gzip(file_path, check_content=True):
    """
    Return a tuple of booleans ``(is_gzipped, is_valid)`` for ``file_path``.

    ``(False, False)`` is returned when the file cannot be read, does not
    start with ``util.gzip_magic``, or its first bytes cannot be
    decompressed. When ``check_content`` is true, the first ``CHUNK_SIZE``
    decompressed bytes are passed to ``check_html`` and HTML content makes
    the result ``(True, False)``.
    """
    # Make sure we have a gzipped file.
    try:
        with open(file_path, "rb") as raw:
            has_magic = raw.read(2) == util.gzip_magic
        if not has_magic:
            return (False, False)
    except Exception:
        return (False, False)

    # Some compressed binary data types are supported, so check whether the
    # compressed binary file is valid. Bam should already have been detected
    # as such, so only the sff format is checked here.
    try:
        with gzip.open(file_path, 'rb') as stream:
            is_sff = stream.read(4) == b'.sff'
        if is_sff:
            return (True, True)
    except Exception:
        return (False, False)

    if check_content:
        with gzip.open(file_path, mode='rb') as stream:
            sample = stream.read(CHUNK_SIZE)
        # A compressed HTML file is gzipped but not valid.
        if check_html(sample, file_path=False):
            return (True, False)
    return (True, True)
90
91
def check_bz2(file_path, check_content=True):
    """
    Return a tuple of booleans ``(is_bzipped, is_valid)`` for ``file_path``.

    ``(False, False)`` is returned when the file cannot be read or does not
    start with ``util.bz2_magic``. When ``check_content`` is true, the first
    ``CHUNK_SIZE`` decompressed bytes are passed to ``check_html`` and HTML
    content makes the result ``(True, False)``.
    """
    try:
        with open(file_path, "rb") as raw:
            has_magic = raw.read(3) == util.bz2_magic
        if not has_magic:
            return (False, False)
    except Exception:
        return (False, False)

    if check_content:
        with bz2.BZ2File(file_path, mode='rb') as stream:
            sample = stream.read(CHUNK_SIZE)
        # A compressed HTML file is bzipped but not valid.
        if check_html(sample, file_path=False):
            return (True, False)
    return (True, True)
110
111
def check_zip(file_path, check_content=True, files=1):
    """
    Return a tuple of booleans ``(is_zipped, is_valid)`` for ``file_path``.

    :param file_path: path of the file to inspect.
    :param check_content: when false, only ``zipfile.is_zipfile`` is
        consulted and a zip archive is reported as ``(True, True)``.
    :param files: bound on how many archive members are sampled. The
        zero-based member index is compared with ``>=``, so up to
        ``files + 1`` members are inspected; this long-standing behaviour is
        kept as is.
    :returns: ``(False, False)`` if the file is not a zip archive,
        ``(True, False)`` if a sampled member contains HTML according to
        ``check_html``, and ``(True, True)`` otherwise.
    """
    if not zipfile.is_zipfile(file_path):
        return (False, False)

    if not check_content:
        return (True, True)

    members = iter_zip(file_path)
    try:
        for filect, (handle, _name) in enumerate(members):
            # Close each member stream as soon as it has been sampled instead
            # of leaving it to the garbage collector.
            with handle:
                chunk = handle.read(CHUNK_SIZE)
            if chunk and check_html(chunk, file_path=False):
                return (True, False)
            if filect >= files:
                break
    finally:
        # Finalise the generator so the archive it holds open is released
        # deterministically, even on an early return or break.
        members.close()
    return (True, True)
128
129
def is_bz2(file_path):
    """Return the first flag of ``check_bz2`` for ``file_path``, skipping the content check."""
    return check_bz2(file_path, check_content=False)[0]
133
134
def is_gzip(file_path):
    """Return the ``is_gzipped`` flag of ``check_gzip`` for ``file_path``, skipping the content check."""
    return check_gzip(file_path, check_content=False)[0]
138
139
def is_zip(file_path):
    """Return the first flag of ``check_zip`` for ``file_path``, skipping the content check."""
    return check_zip(file_path, check_content=False)[0]
143
144
def is_single_file_zip(file_path):
    """
    Return True if the zip archive at ``file_path`` holds at most one file.

    Directory entries (names ending in ``/``) are not counted. An archive
    with no file entries is still reported as True, as before.

    The previous implementation compared a zero-based index with ``> 1`` and
    therefore also accepted archives containing two files. It also opened a
    stream for every member just to count them; only the archive's name list
    is consulted now.

    :raises zipfile.BadZipFile: if ``file_path`` is not a valid zip archive.
    """
    with zipfile.ZipFile(file_path) as archive:
        file_count = sum(1 for entry in archive.namelist() if not entry.endswith('/'))
    return file_count <= 1
150
151
def is_tar(file_path):
    """Return the verdict of ``tarfile.is_tarfile`` for ``file_path``."""
    verdict = tarfile.is_tarfile(file_path)
    return verdict
154
155
def iter_zip(file_path):
    """
    Yield ``(handle, name)`` for each non-directory entry of a zip archive.

    ``handle`` is the object returned by ``ZipFile.open`` for the entry and
    ``name`` is the entry's name; entries whose name ends in ``/`` are
    skipped. The archive stays open while the generator is being consumed.
    """
    with zipfile.ZipFile(file_path) as archive:
        for entry in archive.namelist():
            if entry.endswith('/'):
                continue
            yield (archive.open(entry), entry)
160
161
def check_image(file_path):
    """ Simple wrapper around image_type that reduces its result to a True/False verdict """
    return bool(image_type(file_path))
167
168
# Names exported by a star import of this module.
__all__ = (
    "check_binary",
    "check_bz2",
    "check_gzip",
    "check_html",
    "check_image",
    "check_zip",
    "is_gzip",
    "is_bz2",
    "is_zip",
)