88from __future__ import absolute_import
99
1010from .snappy import (
11- stream_compress , stream_decompress , check_format , UncompressError )
12-
11+ HadoopStreamDecompressor , StreamDecompressor ,
12+ hadoop_stream_compress , hadoop_stream_decompress , raw_stream_compress ,
13+ raw_stream_decompress , stream_compress , stream_decompress ,
14+ UncompressError
15+ )
1316
# "auto" means format auto-detection.
# For compression, the framing format is used.
# For decompression, the format is detected from the input stream header.
DEFAULT_FORMAT = "auto"

# Every format name accepted by get_compress_function and
# get_decompress_function: "auto" plus each key of the dispatch tables below.
# "framing" and "auto" keep their original positions at the front of the list.
ALL_SUPPORTED_FORMATS = ["framing", "auto", "hadoop", "raw"]

# Format name -> stream compression function.
_COMPRESS_METHODS = {
    "framing": stream_compress,
    "hadoop": hadoop_stream_compress,
    "raw": raw_stream_compress,
}

# Format name -> stream decompression function.
_DECOMPRESS_METHODS = {
    "framing": stream_decompress,
    "hadoop": hadoop_stream_decompress,
    "raw": raw_stream_decompress,
}

# The framing format is the default for compression.
# For decompression, if the format is not given explicitly, it is guessed
# from the file header.
_DEFAULT_COMPRESS_FORMAT = "framing"
42+
43+
def uvarint(fin):
    """Read an unsigned little-endian base-128 varint from ``fin``.

    Bytes are consumed one at a time. The low seven bits of each byte are
    appended to the result (least significant group first), and a clear
    high bit marks the final byte.

    :param fin: binary stream exposing ``read(n)``.
    :return: the decoded non-negative integer.
    :raises IndexError: if the stream ends before the final byte of the
        varint has been read.
    """
    result = 0
    shift = 0
    while True:
        # bytearray() yields integer items for both bytes and Python 2 str.
        chunk = bytearray(fin.read(1))
        if not chunk:
            # Same exception type as indexing an empty read, with a usable
            # message for callers that do not swallow it.
            raise IndexError("unexpected end of stream while reading uvarint")
        byte = chunk[0]
        result |= (byte & 0x7F) << shift
        if not byte & 0x80:
            return result
        shift += 7
54+
55+
def check_unframed_format(fin):
    """Heuristically test whether ``fin`` looks like a raw (unframed) stream.

    The stream is rewound to its start, a uvarint preamble and the byte
    following it are read, and the preamble value is compared against the
    total stream size.

    The stream position is NOT restored: it is left wherever probing
    stopped (at the end of the stream once the size has been measured).

    :param fin: seekable binary stream.
    :return: ``True`` when every check passes; ``False`` when a check fails
        or when reading/seeking raises during probing.
    """
    fin.seek(0)
    try:
        size = uvarint(fin)
        # Explicit checks rather than ``assert``: assertions are stripped
        # under ``python -O``, which would make every probe succeed.
        if size >= 2 ** 32 - 1:
            return False
        next_byte = bytearray(fin.read(1))[0]
        fin.seek(0, 2)
        # tell() instead of seek()'s return value, which not every file
        # object provides.
        end = fin.tell()
        # NOTE(review): presumably the preamble is a payload length; confirm
        # that requiring it to be below the stream size is intended.
        if size >= end:
            return False
        # must start with literal block
        return (next_byte & 0b11) == 0
    except Exception:
        # Best-effort sniffing: a short, unreadable or non-seekable stream
        # simply is not recognised as this format.
        return False
68+
3869
# Maps the format name produced by header detection to the matching
# format-specific stream decompression function.
# NOTE: the framing format is keyed "framed" here, whereas
# _DECOMPRESS_METHODS keys it as "framing".
_DECOMPRESS_FORMAT_FUNCS = {
    "framed": stream_decompress,
    "hadoop": hadoop_stream_decompress,
    "raw": raw_stream_decompress,
}
4578
4679
4780def guess_format_by_header (fin ):
@@ -50,23 +83,25 @@ def guess_format_by_header(fin):
5083 :return: tuple of decompression method and a chunk that was taken from the
5184 input for format detection.
5285 """
53- chunk = None
54- for check_method , decompress_func in _DECOMPRESS_FORMAT_FUNCS :
55- ok , chunk = check_method (fin = fin , chunk = chunk )
56- if not ok :
57- continue
58- return decompress_func , chunk
59- raise UncompressError ("Can't detect archive format" )
86+ if StreamDecompressor .check_format (fin ):
87+ form = "framed"
88+ elif HadoopStreamDecompressor .check_format (fin ):
89+ form = "hadoop"
90+ elif check_unframed_format (fin ):
91+ form = "raw"
92+ else :
93+ raise UncompressError ("Can't detect format" )
94+ return form , _DECOMPRESS_FORMAT_FUNCS [form ]
6095
6196
def get_decompress_function(specified_format, fin):
    """Select the stream decompression function for ``specified_format``.

    :param specified_format: a key of ``_DECOMPRESS_METHODS``, or ``"auto"``
        to detect the format from ``fin``.
    :param fin: input stream; only used when the format is ``"auto"``.
    :return: 2-tuple ``(decompress_func, read_chunk)``. ``read_chunk`` is
        always ``None``.
    :raises KeyError: if an explicit format is not in ``_DECOMPRESS_METHODS``.
    :raises UncompressError: propagated from ``guess_format_by_header`` when
        the format cannot be detected.
    """
    if specified_format == "auto":
        # guess_format_by_header returns (format name, decompress function),
        # in that order; only the function is handed back to the caller.
        _detected_format, decompress_func = guess_format_by_header(fin)
        return decompress_func, None
    return _DECOMPRESS_METHODS[specified_format], None
67102
68103
def get_compress_function(specified_format):
    """Return the stream compression function for ``specified_format``.

    ``"auto"`` resolves to the default compression format; any other value
    is looked up directly in ``_COMPRESS_METHODS``.

    :param specified_format: format name, or ``"auto"``.
    :return: the matching stream compression function.
    :raises KeyError: if the resolved name is not in ``_COMPRESS_METHODS``.
    """
    format_name = (
        _DEFAULT_COMPRESS_FORMAT
        if specified_format == "auto"
        else specified_format
    )
    return _COMPRESS_METHODS[format_name]
0 commit comments