|
8 | 8 | from __future__ import absolute_import |
9 | 9 |
|
10 | 10 | from .snappy import ( |
11 | | - stream_compress, stream_decompress, check_format, UncompressError) |
12 | | - |
| 11 | + HadoopStreamDecompressor, StreamDecompressor, |
| 12 | + hadoop_stream_compress, hadoop_stream_decompress, raw_stream_compress, |
| 13 | + raw_stream_decompress, stream_compress, stream_decompress, |
| 14 | + UncompressError |
| 15 | +) |
13 | 16 |
|
# "auto" means format auto-detection.
# For compression the framing format will be used.
# For decompression we will try to detect the format from the input
# stream header.
DEFAULT_FORMAT = "auto"

# Formats a caller may request explicitly.
# "hadoop" and "raw" are included so that the newly added
# hadoop_stream_* / raw_stream_* codecs registered in
# _COMPRESS_METHODS / _DECOMPRESS_METHODS are actually selectable.
ALL_SUPPORTED_FORMATS = ["framing", "hadoop", "raw", "auto"]
|
# Format name -> stream compression function.
_COMPRESS_METHODS = {
    "framing": stream_compress,
    "hadoop": hadoop_stream_compress,
    "raw": raw_stream_compress,
}

# Format name -> stream decompression function.
_DECOMPRESS_METHODS = {
    "framing": stream_decompress,
    "hadoop": hadoop_stream_decompress,
    "raw": raw_stream_decompress,
}

# Framing is the default format for compression.  For decompression,
# when no format is given explicitly, the format is guessed from the
# file header instead (see guess_format_by_header).
_DEFAULT_COMPRESS_FORMAT = "framing"
| 43 | + |
def uvarint(fin):
    """Read a varint-encoded unsigned 64-bit number from a binary stream.

    One byte is consumed at a time: the low seven bits of each byte form
    the next (least-significant-first) group of the result, and the high
    bit marks continuation.

    :param fin: binary file-like object to read from.
    :return: the decoded non-negative integer.
    """
    value, shift = 0, 0
    while True:
        octet = fin.read(1)[0]
        value += (octet & 0x7F) << shift
        shift += 7
        if not octet & 0x80:
            return value
| 55 | + |
| 56 | + |
def check_unframed_format(fin, reset=False):
    """Check whether *fin* looks like a raw (unframed) snappy stream.

    This is a heuristic: it returns True for all valid raw snappy
    streams, but True does not guarantee that the stream can actually be
    decoded.

    NOTE(review): on the True path this consumes the stream position
    (it seeks to the end to learn the total size) and does not restore
    it — callers appear to be expected to rewind before decompressing;
    confirm against the call sites.

    :param fin: seekable binary file-like object.
    :param reset: when True, rewind to the start of the stream first.
    :return: True if the header plausibly starts a raw snappy stream.
    """
    if reset:
        fin.seek(0)
    try:
        # A raw stream starts with the uncompressed length as a varint.
        size = uvarint(fin)
        if size >= 2**32 - 1:
            # Raw snappy limits the uncompressed length to < 2**32 - 1.
            return False
        tag_byte = fin.read(1)[0]
        end = fin.seek(0, 2)  # total size of the stream
    except (IndexError, OSError, ValueError):
        # IndexError: truncated input (read(1) returned b"");
        # OSError/ValueError: unseekable or closed stream.
        return False
    # The declared length must be plausible, and the first element must
    # be a literal block (low two tag bits == 0).
    return size < end and tag_byte & 0b11 == 0
| 75 | + |
38 | 76 |
|
# Detected format name -> format-specific stream decompression function.
# Used by guess_format_by_header after header-based detection.
# NOTE(review): the key "framed" here differs from the "framing" name
# used by _DECOMPRESS_METHODS / ALL_SUPPORTED_FORMATS; the two mappings
# are separate today, but unifying the names would avoid confusion.
_DECOMPRESS_FORMAT_FUNCS = {
    "framed": stream_decompress,
    "hadoop": hadoop_stream_decompress,
    "raw": raw_stream_decompress,
}
45 | 85 |
|
46 | 86 |
|
def guess_format_by_header(fin):
    """Guess the compression format of the given input file by its header.

    Formats are tried in order: framed, hadoop, then raw (the raw check
    is only a heuristic, so it goes last).

    :param fin: seekable binary input stream.
    :return: format name (str), stream decompress function (callable)
    :raises UncompressError: if no known format is recognised.
    """
    detectors = (
        ("framed", StreamDecompressor.check_format),
        ("hadoop", HadoopStreamDecompressor.check_format),
        ("raw", lambda stream: check_unframed_format(stream, reset=True)),
    )
    for name, matches in detectors:
        if matches(fin):
            return name, _DECOMPRESS_FORMAT_FUNCS[name]
    raise UncompressError("Can't detect format")
60 | 102 |
|
61 | 103 |
|
def get_decompress_function(specified_format, fin):
    """Return the stream decompression function for *specified_format*.

    :param specified_format: a known format name, or "auto" to detect
        the format from the stream header.
    :param fin: binary input stream; only inspected when auto-detecting.
    :return: the stream decompression function.
    :raises UncompressError: if "auto" is given and no format matches.
    :raises KeyError: if *specified_format* is not a known format.
    """
    if specified_format == "auto":
        # Discard the detected format name (avoids binding an unused
        # local that shadows the `format` builtin).
        _, decompress_func = guess_format_by_header(fin)
        return decompress_func
    return _DECOMPRESS_METHODS[specified_format]
67 | 109 |
|
68 | 110 |
|
def get_compress_function(specified_format):
    """Return the stream compression function for *specified_format*.

    "auto" resolves to the default compression format (framing); any
    other name is looked up directly, raising KeyError when unknown.
    """
    if specified_format == "auto":
        chosen = _DEFAULT_COMPRESS_FORMAT
    else:
        chosen = specified_format
    return _COMPRESS_METHODS[chosen]
0 commit comments