From d79d5cbcd00ba889081f69255f5bb36a89e9dc5a Mon Sep 17 00:00:00 2001 From: James Parrott <80779630+JamesParrott@users.noreply.github.com> Date: Tue, 19 May 2026 19:19:24 +0100 Subject: [PATCH] Create DbfReader class Create outline dbfReader class Fix typo Passes mypy Map DbfReader's publis methods onto Reader Passes mypy and pytest locally (not network) Raise ShapefileException if neither .shp nor .dbf downloaded --- README.md | 12 +- changelog.txt | 7 + pyproject.toml | 3 +- src/shapefile.py | 1685 +++++++++++++++++++++++---------------------- test_shapefile.py | 25 +- 5 files changed, 913 insertions(+), 819 deletions(-) diff --git a/README.md b/README.md index 6950047..d3fc392 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@ The Python Shapefile Library (PyShp) reads and writes ESRI Shapefiles in pure Py ![build status](https://github.com/GeospatialPython/pyshp/actions/workflows/build.yml/badge.svg) - **Author**: [Joel Lawhead](https://github.com/GeospatialPython) -- **Maintainers**: [Karim Bahgat](https://github.com/karimbahgat) -- **Version**: 3.0.6 -- **Date**: 19th May 2026 +- **Maintainers**: [James Parrott](https://github.com/JamesParrott) & [Karim Bahgat](https://github.com/karimbahgat) +- **Version**: 3.0.7 +- **Date**: 20th May 2026 - **License**: [MIT](https://github.com/GeospatialPython/pyshp/blob/master/LICENSE.TXT) ## Contents @@ -93,6 +93,12 @@ part of your geospatial project. # Version Changes +## 3.0.7 + +### Testability / separation of concerns. + - Separate dbf only reading methods into a new dbfReader class (an instance of which is owned by the regular Shapefile Reader class). + + ## 3.0.6 ### URL Downloading diff --git a/changelog.txt b/changelog.txt index 061868e..0df69a0 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,10 @@ +VERSION 3.0.7 + +2026-05-20 + Testability / separation of concerns: + * Separate dbf only reading methods into a new dbfReader class (an instance of which is owned by the regular Shapefile Reader class). + + VERSION 3.0.6 2026-05-19 diff --git a/pyproject.toml b/pyproject.toml index b025c38..e514c68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ authors = [ {name = "Joel Lawhead", email = "jlawhead@geospatialpython.com"}, ] maintainers = [ - {name = "Karim Bahgat", email = "karim.bahgat.norway@gmail.com"} + {name = "James Parrott", email = "james@jamesparrott.dev"}, + {name = "Karim Bahgat", email = "karim.bahgat.norway@gmail.com"}, ] readme = "README.md" keywords = ["gis", "geospatial", "geographic", "shapefile", "shapefiles"] diff --git a/src/shapefile.py b/src/shapefile.py index 22c32d1..11c6a0e 100644 --- a/src/shapefile.py +++ b/src/shapefile.py @@ -2,16 +2,18 @@ shapefile.py Provides read and write support for ESRI Shapefiles. authors: jlawheadgeospatialpython.com -maintainer: karim.bahgat.norwaygmail.com +maintainers: jamesjamesparrott.dev and karim.bahgat.norwaygmail.com Compatible with Python versions >=3.9 """ from __future__ import annotations -__version__ = "3.0.6" +__version__ = "3.0.7" import array +import contextlib import doctest +import functools import io import logging import os @@ -2223,7 +2225,7 @@ def _save_to_named_tmp_file( class UnsuccessfulFileDownload(Warning): pass -SUPPORTED_URL_SCHEMES = frozenset(["http", "https"]) # must be lower case +SUPPORTED_URL_SCHEMES = ("http", "https") # must be lower case DEFAULT_USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36" @@ -2275,7 +2277,7 @@ def _try_to_download_binary_file( try: resp = urlopen(req) except HTTPError as e: - msg = f"{e.msg}, occurred when trying to open: {url}, reason: {e.reason}. " + msg = f"{e.msg}, {e.code} occurred when trying to open: {url}, reason: {e.reason}. " if not suppress_http_errors: e.msg = msg # Add helpful info to the default abrupt 404 message. raise e @@ -2317,338 +2319,666 @@ def _try_to_download_binary_file( return initial_bytes, cast(ReadableBinStream, resp) +CONSTITUENT_FILE_EXTS = ["shp", "shx", "dbf"] +assert all(ext.islower() for ext in CONSTITUENT_FILE_EXTS) +def _assert_ext_is_supported(ext: str) -> None: + assert ext in CONSTITUENT_FILE_EXTS -class ShapefileException(Exception): - """An exception to handle shapefile specific problems.""" - - -class _NoShpSentinel: - """For use as a default value for shp to preserve the - behaviour (from when all keyword args were gathered - in the **kwargs dict) in case someone explictly - called Reader(shp=None) to load self.shx. +def _try_get_open_constituent_file( + shapefile_name: str, + ext: str, +) -> IO[bytes] | None: + """ + Attempts to open a .shp, .dbf or .shx file, + with both lower case and upper case file extensions, + and return it. If it was not possible to open the file, None is returned. """ + # typing.LiteralString is only available from Python 3.11 onwards. + # https://docs.python.org/3/library/typing.html#typing.LiteralString + # assert ext in {'shp', 'dbf', 'shx'} + _assert_ext_is_supported(ext) + exts = {ext, ext.upper(), ext.lower()} -_NO_SHP_SENTINEL = _NoShpSentinel() + for candidate_ext in exts: + try: + return open(f"{shapefile_name}.{candidate_ext}", "rb") + except OSError: + pass + return None -class Reader: - """Reads the three files of a shapefile as a unit or - separately. If one of the three files (.shp, .shx, - .dbf) is missing no exception is thrown until you try - to call a method that depends on that particular file. - The .shx index file is used if available for efficiency - but is not required to read the geometry from the .shp - file. The "shapefile" argument in the constructor is the - name of the file you want to open, and can be the path - to a shapefile on a local filesystem, inside a zipfile, - or a url. +def ensure_within_bounds(i: int, num_records: int) -> int: + """Provides list-like handling of a record index with a clearer + error message if the index is out of bounds.""" + rmax = num_records - 1 + if abs(i) > rmax: + raise IndexError( + f"Shape or Record index: {i} out of range. Max index: {rmax}" + ) + if i < 0: + i = range(num_records)[i] + return i - You can instantiate a Reader without specifying a shapefile - and then specify one later with the load() method. - Only the shapefile headers are read upon loading. Content - within each file is only accessed when required and as - efficiently as possible. Shapefiles are usually not large - but they can be. +class DbfReader: + """Reads a dbf file. You can instantiate a DbfReader without specifying a shapefile + and then specify one later with the load() method. """ - CONSTITUENT_FILE_EXTS = ["shp", "shx", "dbf"] - assert all(ext.islower() for ext in CONSTITUENT_FILE_EXTS) - - def _assert_ext_is_supported(self, ext: str) -> None: - assert ext in self.CONSTITUENT_FILE_EXTS - def __init__( self, - shapefile_path: str | PathLike[Any] = "", - /, - *, + file_: IO[bytes], encoding: str = "utf-8", encodingErrors: str = "strict", - shp: _NoShpSentinel | BinaryFileT | None = _NO_SHP_SENTINEL, - shx: BinaryFileT | None = None, - dbf: BinaryFileT | None = None, - # Keep kwargs even though unused, to preserve PyShp 2.4 API - **kwargs: Any, ): - self.shp = None - self.shx = None - self.dbf = None - self._files_to_close: list[BinaryFileStreamT] = [] - self.shapeName = "Not specified" - self._offsets: list[int] = [] - self.shpLength: int | None = None - self.numRecords: int | None = None - self.numShapes: int | None = None + self._file = file_ self.fields: list[Field] = [] - self.__dbfHdrLength = 0 self.__fieldLookup: dict[str, int] = {} self.encoding = encoding self.encodingErrors = encodingErrors - # See if a shapefile name was passed as the first argument - if shapefile_path: - path = fsdecode_if_pathlike(shapefile_path) - if isinstance(path, str): - if ".zip" in path: - # Shapefile is inside a zipfile - if path.count(".zip") > 1: - # Multiple nested zipfiles - raise ShapefileException( - f"Reading from multiple nested zipfiles is not supported: {path}" - ) - # Split into zipfile and shapefile paths - if path.endswith(".zip"): - zpath = path - shapefile = None - else: - zpath = path[: path.find(".zip") + 4] - shapefile = path[path.find(".zip") + 4 + 1 :] - - zipfileobj: ( - tempfile._TemporaryFileWrapper[bytes] | io.BufferedReader - ) - # Create a zip file handle - urlinfo = urlparse(zpath) - - resp: ReadableBinStream | None - if urlinfo.scheme in SUPPORTED_URL_SCHEMES: - # Zipfile is from a url - # Download to a temporary file and treat as normal zipfile - sniffed_bytes, resp = _try_to_download_binary_file(urlinfo=urlinfo) - - - # Use named tmp file as source for zip file data. - zipfileobj = _save_to_named_tmp_file( - resp, - initial_bytes = sniffed_bytes, - suffix=".zip", - ) - - else: - # Zipfile is from a file - zipfileobj = open(zpath, mode="rb") - - # Open the zipfile archive - with zipfile.ZipFile(zipfileobj, "r") as archive: - if not shapefile: - # Only the zipfile path is given - # Inspect zipfile contents to find the full shapefile path - shapefiles = [ - name - for name in archive.namelist() - if (name.endswith(".SHP") or name.endswith(".shp")) - ] - # The zipfile must contain exactly one shapefile - if len(shapefiles) == 0: - raise ShapefileException( - "Zipfile does not contain any shapefiles" - ) - if len(shapefiles) == 1: - shapefile = shapefiles[0] - else: - raise ShapefileException( - f"Zipfile contains more than one shapefile: {shapefiles}. " - "Please specify the full path to the shapefile you would like to open." - ) - # Try to extract file-like objects from zipfile - shapefile = os.path.splitext(shapefile)[ - 0 - ] # root shapefile name - for lower_ext in self.CONSTITUENT_FILE_EXTS: - for cased_ext in [lower_ext, lower_ext.upper()]: - try: - member = archive.open(f"{shapefile}.{cased_ext}") - # Use read+write tempfile as source for member data. - fileobj = _save_to_named_tmp_file(member) - setattr(self, lower_ext, fileobj) - self._files_to_close.append(fileobj) - except (OSError, AttributeError, KeyError): - pass - # Close and delete the temporary zipfile - try: - zipfileobj.close() - # TODO Does catching all possible exceptions really increase - # the chances of closing the zipfile successully, or does it - # just mean .close() failures will still fail, but fail - # silently? - except: # noqa: E722 - pass - # Try to load shapefile - if self.shp or self.dbf: - # Load and exit early - self.load() - return - raise ShapefileException( - f"No shp or dbf file found in zipfile: {path}" - ) + self._dbfHeader() - if path.startswith("http"): - # Shapefile is from a url - # Download each file to temporary path and treat as normal shapefile path - urlinfo = urlparse(path) - for ext in ["shp", "shx", "dbf"]: - - sniffed_bytes, resp = _try_to_download_binary_file( - urlinfo=urlinfo, - ext=ext, - suppress_http_errors=True, - ) - if resp is None: - continue - # Use tempfile as source for url data. - fileobj = _save_to_named_tmp_file(resp, initial_bytes = sniffed_bytes) - setattr(self, ext, fileobj) - self._files_to_close.append(fileobj) - if self.shp or self.dbf: - # Load and exit early - self.load() - return - - raise ShapefileException(f"No shp or dbf file found at url: {path}") - # Local file path to a shapefile - # Load and exit early - self.load(path) - return + @property + def dbf(self) -> IO[bytes]: + if not self._file: + raise dbfFileException( + f"DbfReader requires a .dbf file or file-like object. Got: {self._file}" + ) + return self._file - if shp is not _NO_SHP_SENTINEL: - shp = cast(Union[str, PathLike[Any], IO[bytes], None], shp) - self.shp = self.__seek_0_on_file_obj_wrap_or_open_from_name("shp", shp) - self.shx = self.__seek_0_on_file_obj_wrap_or_open_from_name("shx", shx) + def __len__(self) -> int: + """Returns the number of records in the .dbf file.""" - self.dbf = self.__seek_0_on_file_obj_wrap_or_open_from_name("dbf", dbf) + return self.numRecords - # Load the files - if self.shp or self.dbf: - self._try_to_set_constituent_file_headers() + def _dbfHeader(self) -> None: + """Reads a dbf header. Xbase-related code borrows heavily from ActiveState Python Cookbook Recipe 362715 by Raymond Hettinger""" - def __seek_0_on_file_obj_wrap_or_open_from_name( - self, - ext: str, - file_: BinaryFileT | None, - ) -> None | IO[bytes]: - # assert ext in {'shp', 'dbf', 'shx'} - self._assert_ext_is_supported(ext) - if file_ is None: - return None + dbf = self.dbf + # read relevant header parts + dbf.seek(0) + self.numRecords, self.__dbfHdrLength, self._record_length = cast(tuple[int, int, int], unpack( + " str: - """ - Use some general info on the shapefile as __str__ - """ - info = ["shapefile Reader"] - if self.shp: - info.append( - f" {len(self)} shapes (type '{SHAPETYPE_LOOKUP[self.shapeType]}')" + self.fields.append(Field(name, field_type, size, decimal)) + terminator = dbf.read(1) + if terminator != b"\r": + raise ShapefileException( + "Shapefile dbf header lacks expected terminator. (likely corrupt?)" ) - if self.dbf: - info.append(f" {len(self)} records ({len(self.fields)} fields)") - return "\n".join(info) - def __enter__(self) -> Reader: + # insert deletion field at start + self.fields.insert(0, Field("DeletionFlag", FieldType.C, 1, 0)) + + # store all field positions for easy lookups + # note: fieldLookup gives the index position of a field inside Reader.fields + self.__fieldLookup = {f[0]: i for i, f in enumerate(self.fields)} + + # by default, read all fields except the deletion flag, hence "[1:]" + # note: recLookup gives the index position of a field inside a _Record list + fieldnames = [f[0] for f in self.fields[1:]] + __fieldTuples, recLookup, recStruct = self._record_fields(fieldnames) + self.__fullRecStruct = recStruct + self.__fullRecLookup = recLookup + + def _record_fmt(self, fields: Container[str] | None = None) -> tuple[str, int]: + """Calculates the format and size of a .dbf record. Optional 'fields' arg + specifies which fieldnames to unpack and which to ignore. Note that this + always includes the DeletionFlag at index 0, regardless of the 'fields' arg. """ - Enter phase of context manager. + structcodes = [f"{fieldinfo.size}s" for fieldinfo in self.fields] + if fields is not None: + # only unpack specified fields, ignore others using padbytes (x) + structcodes = [ + code + if fieldinfo.name in fields + or fieldinfo.name == "DeletionFlag" # always unpack delflag + else f"{fieldinfo.size}x" + for fieldinfo, code in zip(self.fields, structcodes) + ] + fmt = "".join(structcodes) + fmt_size = calcsize(fmt) + # total size of fields should add up to recordlength from the header + while fmt_size < self._record_length: + # if not, pad byte until reaches recordlength + fmt += "x" + fmt_size += 1 + return (fmt, fmt_size) + + def _record_fields( + self, fields: Iterable[str] | None = None + ) -> tuple[list[Field], dict[str, int], Struct]: + """Returns the necessary info required to unpack a record's fields, + restricted to a subset of fieldnames 'fields' if specified. + Returns a list of field info tuples, a name-index lookup dict, + and a Struct instance for unpacking these fields. Note that DeletionFlag + is not a valid field. """ - return self + if fields is not None: + # restrict info to the specified fields + # first ignore repeated field names (order doesn't matter) + unique_fields = list(set(fields)) + # get the struct + fmt, __fmt_size = self._record_fmt(fields=unique_fields) + recStruct = Struct(fmt) + # make sure the given fieldnames exist + for name in unique_fields: + if name not in self.__fieldLookup or name == "DeletionFlag": + raise ValueError(f'"{name}" is not a valid field name') + # fetch relevant field info tuples + fieldTuples = [] + for fieldinfo in self.fields[1:]: + name = fieldinfo[0] + if name in unique_fields: + fieldTuples.append(fieldinfo) + # store the field positions + recLookup = {f[0]: i for i, f in enumerate(fieldTuples)} + else: + # use all the dbf fields + fieldTuples = self.fields[1:] # sans deletion flag + recStruct = self.__fullRecStruct + recLookup = self.__fullRecLookup + return fieldTuples, recLookup, recStruct - # def __exit__(self, exc_type, exc_val, exc_tb) -> None: - def __exit__( + def _record( self, - exc_type: BaseException | None, - exc_val: BaseException | None, - exc_tb: TracebackType | None, - ) -> bool | None: - """ - Exit phase of context manager, close opened files. + fieldTuples: list[Field], + recLookup: dict[str, int], + recStruct: Struct, + oid: int | None = None, + ) -> _Record | None: + """Reads and returns a dbf record row as a list of values. Requires specifying + a list of field info Field namedtuples 'fieldTuples', a record name-index dict 'recLookup', + and a Struct instance 'recStruct' for unpacking these fields. """ - self.close() - return None - - def __len__(self) -> int: - """Returns the number of shapes/records in the shapefile.""" - if self.dbf: - # Preferably use dbf record count - if self.numRecords is None: - self.__dbfHeader() + f = self.dbf - # .__dbfHeader sets self.numRecords or raises Exception - return cast(int, self.numRecords) - - if self.shp: - # Otherwise use shape count - if self.shx: - if self.numShapes is None: - self.__shxHeader() + # The only format chars in from self._record_fmt, in recStruct from _record_fields, + # are s and x (ascii encoded str and pad byte) so everything in recordContents is bytes + # https://docs.python.org/3/library/struct.html#format-characters + recordContents = recStruct.unpack(f.read(recStruct.size)) - # .__shxHeader sets self.numShapes or raises Exception - return cast(int, self.numShapes) + # deletion flag field is always unpacked as first value (see _record_fmt) + if recordContents[0] != b" ": + # deleted record + return None - # Index file not available, iterate all shapes to get total count - if self.numShapes is None: - # Determine length of shp file - shp = self.shp - checkpoint = shp.tell() - shp.seek(0, 2) - shpLength = shp.tell() - shp.seek(100) - # Do a fast shape iteration until end of file. - offsets = [] - pos = shp.tell() - while pos < shpLength: - offsets.append(pos) - # Unpack the shape header only - (__recNum, recLength) = unpack_2_int32_be(shp.read(8)) - # Jump to next shape position - pos += 8 + (2 * recLength) - shp.seek(pos) - # Set numShapes and offset indices - self.numShapes = len(offsets) - self._offsets = offsets - # Return to previous file position - shp.seek(checkpoint) + # drop deletion flag from values + recordContents = recordContents[1:] - return self.numShapes + # check that values match fields + if len(fieldTuples) != len(recordContents): + raise ShapefileException( + f"Number of record values ({len(recordContents)}) is different from the requested " + f"number of fields ({len(fieldTuples)})" + ) - # No file loaded yet, treat as 'empty' shapefile - return 0 + # parse each value + record = [] + for (__name, typ, __size, decimal), value in zip(fieldTuples, recordContents): + if typ is FieldType.N or typ is FieldType.F: + # numeric or float: number stored as a string, right justified, and padded with blanks to the width of the field. + value = value.split(b"\0")[0] + value = value.replace(b"*", b"") # QGIS NULL is all '*' chars + if value == b"": + value = None + elif decimal: + try: + value = float(value) + except ValueError: + # not parseable as float, set to None + value = None + else: + # force to int + try: + # first try to force directly to int. + # forcing a large int to float and back to int + # will lose information and result in wrong nr. + value = int(value) + except ValueError: + # forcing directly to int failed, so was probably a float. + try: + value = int(float(value)) + except ValueError: + # not parseable as int, set to None + value = None + elif typ is FieldType.D: + # date: 8 bytes - date stored as a string in the format YYYYMMDD. + if ( + not value.replace(b"\x00", b"") + .replace(b" ", b"") + .replace(b"0", b"") + ): + # dbf date field has no official null value + # but can check for all hex null-chars, all spaces, or all 0s (QGIS null) + value = None + else: + try: + # return as python date object + y, m, d = int(value[:4]), int(value[4:6]), int(value[6:8]) + value = date(y, m, d) + except (TypeError, ValueError): + # if invalid date, just return as unicode string so user can decimalde + value = str(value.strip()) + elif typ is FieldType.L: + # logical: 1 byte - initialized to 0x20 (space) otherwise T or F. + if value == b" ": + value = None # space means missing or not yet set + else: + if value in b"YyTt1": + value = True + elif value in b"NnFf0": + value = False + else: + value = None # unknown value is set to missing + else: + value = value.decode(self.encoding, self.encodingErrors) + value = value.strip().rstrip( + "\x00" + ) # remove null-padding at end of strings + record.append(value) - def __iter__(self) -> Iterator[ShapeRecord]: - """Iterates through the shapes/records in the shapefile.""" - yield from self.iterShapeRecords() + return _Record(recLookup, record, oid) - @property - def __geo_interface__(self) -> GeoJSONFeatureCollectionWithBBox: - shaperecords = self.shapeRecords() - fcollection = GeoJSONFeatureCollectionWithBBox( - bbox=list(self.bbox), - **shaperecords.__geo_interface__, + def record(self, i: int = 0, fields: list[str] | None = None) -> _Record | None: + """Returns a specific dbf record based on the supplied index. + To only read some of the fields, specify the 'fields' arg as a + list of one or more fieldnames. + """ + f = self.dbf + + i = ensure_within_bounds(i, self.numRecords) + recSize = self._record_length + f.seek(0) + f.seek(self.__dbfHdrLength + (i * recSize)) + fieldTuples, recLookup, recStruct = self._record_fields(fields) + return self._record( + oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct + ) + + def records(self, fields: list[str] | None = None) -> list[_Record]: + """Returns all records in a dbf file. + To only read some of the fields, specify the 'fields' arg as a + list of one or more fieldnames. + """ + + records = [] + self.dbf.seek(self.__dbfHdrLength) + fieldTuples, recLookup, recStruct = self._record_fields(fields) + + for i in range(self.numRecords): + r = self._record( + oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct + ) + if r: + records.append(r) + return records + + def iterRecords( + self, + fields: list[str] | None = None, + start: int = 0, + stop: int | None = None, + ) -> Iterator[_Record | None]: + """Returns a generator of records in a dbf file. + Useful for large shapefiles or dbf files. + To only read some of the fields, specify the 'fields' arg as a + list of one or more fieldnames. + By default yields all records. Otherwise, specify start + (default: 0) or stop (default: number_of_records) + to only yield record numbers i, where + start <= i < stop, (or + start <= i < number_of_records + stop + if stop < 0). + """ + + if not isinstance(self.numRecords, int): + raise ShapefileException( + "Error when reading number of Records in dbf file header" + ) + start = ensure_within_bounds(start, self.numRecords) + if stop is None: + stop = self.numRecords + elif abs(stop) > self.numRecords: + raise IndexError( + f"abs(stop): {abs(stop)} exceeds number of records: {self.numRecords}." + ) + elif stop < 0: + stop = range(self.numRecords)[stop] + recSize = self._record_length + self.dbf.seek(self.__dbfHdrLength + (start * recSize)) + fieldTuples, recLookup, recStruct = self._record_fields(fields) + for i in range(start, stop): + r = self._record( + oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct + ) + if r: + yield r + + + +class ShapefileException(Exception): + """An exception to handle shapefile specific problems.""" + +class dbfFileException(ShapefileException): + """ Indicates a problem with the .dbf file. """ + +class _NoShpSentinel: + """For use as a default value for shp to preserve the + behaviour (from when all keyword args were gathered + in the **kwargs dict) in case someone explictly + called Reader(shp=None) to load self.shx. + """ + + +_NO_SHP_SENTINEL = _NoShpSentinel() + + +class Reader: + """Reads the three files of a shapefile as a unit or + separately. If one of the three files (.shp, .shx, + .dbf) is missing no exception is thrown until you try + to call a method that depends on that particular file. + The .shx index file is used if available for efficiency + but is not required to read the geometry from the .shp + file. The "shapefile" argument in the constructor is the + name of the file you want to open, and can be the path + to a shapefile on a local filesystem, inside a zipfile, + or a url. + + You can instantiate a Reader without specifying a shapefile + and then specify one later with the load() method. + + Only the shapefile headers are read upon loading. Content + within each file is only accessed when required and as + efficiently as possible. Shapefiles are usually not large + but they can be. + """ + + + def __init__( + self, + shapefile_path: str | PathLike[Any] = "", + /, + *, + encoding: str = "utf-8", + encodingErrors: str = "strict", + shp: _NoShpSentinel | BinaryFileT | None = _NO_SHP_SENTINEL, + shx: BinaryFileT | None = None, + dbf: BinaryFileT | None = None, + # Keep kwargs even though unused, to preserve PyShp 2.4 API + **kwargs: Any, + ): + self.encoding = encoding + self.encodingErrors = encodingErrors + self._shp = None + self._shx = None + self._dbf = None + self.shapeName = "Not specified" + self._offsets: list[int] = [] + self.shpLength: int | None = None + self.numShapes: int | None = None + self._exit_stack = contextlib.ExitStack() + # See if a shapefile name was passed as the first argument + if shapefile_path: + path = fsdecode_if_pathlike(shapefile_path) + self.path = path + if isinstance(path, str): + if ".zip" in path: + self._load_from_zip(path) + # Raises if not self._shp or self._dbf + return + + if path.lower().startswith(SUPPORTED_URL_SCHEMES): + self._load_from_url(path) + # Raises if not self._shp or self._dbf + return + + + # Local file path to a shapefile + # Load and exit early + self.load(path) + + # Raises if not self._shp or self._dbf + return + + if shp is not _NO_SHP_SENTINEL: + shp = cast(Union[BinaryFileT, None], shp) + self._shp = self._seek_0_on_file_obj_wrap_or_open_from_name("shp", shp) + self._shx = self._seek_0_on_file_obj_wrap_or_open_from_name("shx", shx) + + self._dbf = self._seek_0_on_file_obj_wrap_or_open_from_name("dbf", dbf) + + # Load the files + if self._shp: + self._shpHeader() + if self._dbf: + self._get_dbf_reader() + if self._shx: + self._shxHeader() + + @functools.cache + def _get_dbf_reader(self) -> DbfReader: + if self._dbf is None: + raise ShapefileException( + "Shapefile DbfReader requires a .dbf file or file-like object." + ) + return DbfReader( + file_=self._dbf, + encoding = self.encoding, + encodingErrors = self.encodingErrors, ) - return fcollection @property - def shapeTypeName(self) -> str: - return SHAPETYPE_LOOKUP[self.shapeType] + def dbf_reader(self) -> DbfReader: + return self._get_dbf_reader() + + @functools.cached_property + def shp(self) -> IO[bytes]: + if self._shp is None: + raise ShapefileException( + "Shapefile Reader requires a .shp shapefile or file-like object." + ) + return self._shp + @functools.cached_property + def shx(self) -> IO[bytes]: + if self._shx is None: + raise ShapefileException( + "Shapefile Reader shx use requires a .shx shapefile or file-like object." + ) + return self._shx + @property + def dbf(self) -> IO[bytes]: + return self.dbf_reader.dbf + + @property + def numRecords(self) -> int | None: + if self._dbf is None: + return None + return self.dbf_reader.numRecords + + @property + def fields(self) -> list[Field]: + return self.dbf_reader.fields + + def record(self, i: int = 0, fields: list[str] | None = None) -> _Record | None: + return self.dbf_reader.record(i, fields) + + def records(self, fields: list[str] | None = None) -> list[_Record]: + return self.dbf_reader.records(fields) + + def iterRecords( + self, + fields: list[str] | None = None, + start: int = 0, + stop: int | None = None, + ) -> Iterator[_Record | None]: + return self.dbf_reader.iterRecords(fields, start, stop) + + def _seek_0_on_file_obj_wrap_or_open_from_name( + self, + ext: str, + file_: BinaryFileT | None, + ) -> None | IO[bytes]: + # assert ext in {'shp', 'dbf', 'shx'} + _assert_ext_is_supported(ext) + + if file_ is None: + return None + + if isinstance(file_, (str, PathLike)): + baseName, __ = os.path.splitext(file_) + file_obj = _try_get_open_constituent_file(baseName, ext) + if file_obj is not None: + self._exit_stack.push(file_obj.__exit__) + return file_obj + + if hasattr(file_, "read"): + # Copy if required + try: + file_.seek(0) + return file_ + except (NameError, io.UnsupportedOperation): + return io.BytesIO(file_.read()) + + raise ShapefileException( + f"Could not load shapefile constituent file from: {file_}" + ) + + + + def _load_from_url(self, url: str) -> None: + # Shapefile is from a url + # Download each file to temporary path and treat as normal shapefile path + urlinfo = urlparse(url) + shp_or_dbf_downloaded = False + for ext in CONSTITUENT_FILE_EXTS: + + sniffed_bytes, resp = _try_to_download_binary_file( + urlinfo=urlinfo, + ext=ext, + suppress_http_errors=True, + ) + if resp is None: + continue + if ext != "shx": + shp_or_dbf_downloaded = True + # Use tempfile as source for url data. + fileobj = _save_to_named_tmp_file(resp, initial_bytes = sniffed_bytes) + setattr(self, f"_{ext}", fileobj) + self._exit_stack.enter_context(fileobj) + if not shp_or_dbf_downloaded: + raise ShapefileException(f"Failed to download .shp or .dbf from: {url}") + + def _load_from_zip(self, path: str) -> None: + # Shapefile is inside a zipfile + if path.count(".zip") > 1: + # Multiple nested zipfiles + raise ShapefileException( + f"Reading from multiple nested zipfiles is not supported: {path}" + ) + # Split into zipfile and shapefile paths + if path.endswith(".zip"): + zpath = path + shapefile = None + else: + zpath = path[: path.find(".zip") + 4] + shapefile = path[path.find(".zip") + 4 + 1 :] + + zipfileobj: ( + tempfile._TemporaryFileWrapper[bytes] | io.BufferedReader + ) + # Create a zip file handle + urlinfo = urlparse(zpath) + + resp: ReadableBinStream | None + if urlinfo.scheme in SUPPORTED_URL_SCHEMES: + # Zipfile is from a url + # Download to a temporary file and treat as normal zipfile + sniffed_bytes, resp = _try_to_download_binary_file(urlinfo=urlinfo) + + + # Use named tmp file as source for zip file data. + zipfileobj = _save_to_named_tmp_file( + resp, + initial_bytes = sniffed_bytes, + suffix=".zip", + ) + + else: + # Zipfile is from a file + zipfileobj = open(zpath, mode="rb") + + # Open the zipfile archive + with zipfile.ZipFile(zipfileobj, "r") as archive: + if not shapefile: + # Only the zipfile path is given + # Inspect zipfile contents to find the full shapefile path + shapefiles = [ + name + for name in archive.namelist() + if (name.endswith(".SHP") or name.endswith(".shp")) + ] + # The zipfile must contain exactly one shapefile + if len(shapefiles) == 0: + raise ShapefileException( + "Zipfile does not contain any shapefiles" + ) + if len(shapefiles) == 1: + shapefile = shapefiles[0] + else: + raise ShapefileException( + f"Zipfile contains more than one shapefile: {shapefiles}. " + "Please specify the full path to the shapefile you would like to open." + ) + # Try to extract file-like objects from zipfile + shapefile = os.path.splitext(shapefile)[ + 0 + ] # root shapefile name + for ext in CONSTITUENT_FILE_EXTS: + for cased_ext in {ext.lower(), ext.upper(), ext}: + try: + member = archive.open(f"{shapefile}.{cased_ext}") + # Use read+write tempfile as source for member data. + fileobj = _save_to_named_tmp_file(member) + setattr(self, f"_{ext.lower()}", fileobj) + self._exit_stack.enter_context(fileobj) + except (OSError, AttributeError, KeyError): + pass + # Close and delete the temporary zipfile + try: + zipfileobj.close() + # TODO Does catching all possible exceptions really increase + # the chances of closing the zipfile successully, or does it + # just mean .close() failures will still fail, but fail + # silently? + except: # noqa: E722 + pass + def load(self, shapefile: str | None = None) -> None: """Opens a shapefile from a filename or file-like @@ -2660,120 +2990,110 @@ def load(self, shapefile: str | None = None) -> None: self.load_shp(shapeName) self.load_shx(shapeName) self.load_dbf(shapeName) - if not (self.shp or self.dbf): - raise ShapefileException( - f"Unable to open {shapeName}.dbf or {shapeName}.shp." - ) - self._try_to_set_constituent_file_headers() + if not (self._shp or self._dbf): + raise ShapefileException( + f"Unable to open {shapeName}.dbf or {shapeName}.shp." + ) - def _try_to_set_constituent_file_headers(self) -> None: - if self.shp: - self.__shpHeader() - if self.dbf: - self.__dbfHeader() - if self.shx: - self.__shxHeader() - def _try_get_open_constituent_file( - self, - shapefile_name: str, - ext: str, - ) -> IO[bytes] | None: + def load_shp(self, shapefile_name: str) -> None: """ - Attempts to open a .shp, .dbf or .shx file, - with both lower case and upper case file extensions, - and return it. If it was not possible to open the file, None is returned. + Attempts to load file with .shp extension as both lower and upper case """ - # typing.LiteralString is only available from PYthon 3.11 onwards. - # https://docs.python.org/3/library/typing.html#typing.LiteralString - # assert ext in {'shp', 'dbf', 'shx'} - self._assert_ext_is_supported(ext) + self._shp = _try_get_open_constituent_file(shapefile_name, "shp") + if self._shp: + self._exit_stack.enter_context(self._shp) + self._shpHeader() - try: - return open(f"{shapefile_name}.{ext}", "rb") - except OSError: - try: - return open(f"{shapefile_name}.{ext.upper()}", "rb") - except OSError: - return None - - def _load_constituent_file( - self, - shapefile_name: str, - ext: str, - ) -> IO[bytes] | None: + def load_shx(self, shapefile_name: str) -> None: """ - Attempts to open a .shp, .dbf or .shx file, with the extension - as both lower and upper case, and if successful append it to - self._files_to_close. + Attempts to load file with .shx extension as both lower and upper case """ - shp_dbf_or_dhx_file = self._try_get_open_constituent_file(shapefile_name, ext) - if shp_dbf_or_dhx_file is not None: - self._files_to_close.append(shp_dbf_or_dhx_file) - return shp_dbf_or_dhx_file + self._shx = _try_get_open_constituent_file(shapefile_name, "shx") + if self._shx: + self._exit_stack.enter_context(self._shx) + self._shxHeader() - def load_shp(self, shapefile_name: str) -> None: + def load_dbf(self, shapefile_name: str) -> None: """ - Attempts to load file with .shp extension as both lower and upper case + Attempts to load file with .dbf extension as both lower and upper case """ - self.shp = self._load_constituent_file(shapefile_name, "shp") + self._dbf = _try_get_open_constituent_file(shapefile_name, "dbf") + if self._dbf: + self._exit_stack.enter_context(self._dbf) + self._get_dbf_reader() + + + + + + + def __len__(self) -> int: + """Returns the number of shapes/records in the shapefile.""" + if self._dbf: + # Preferably use dbf record count + return len(self.dbf_reader) + + if self._shp: + # Otherwise use shape count + if self._shx: + if self.numShapes is None: + self._shxHeader() + + # ._shxHeader sets self.numShapes or raises Exception + return cast(int, self.numShapes) + + # Index file not available, iterate all shapes to get total count + if self.numShapes is None: + # Determine length of shp file + shp = self.shp + checkpoint = shp.tell() + shp.seek(0, 2) + shpLength = shp.tell() + shp.seek(100) + # Do a fast shape iteration until end of file. + offsets = [] + pos = shp.tell() + while pos < shpLength: + offsets.append(pos) + # Unpack the shape header only + (__recNum, recLength) = unpack_2_int32_be(shp.read(8)) + # Jump to next shape position + pos += 8 + (2 * recLength) + shp.seek(pos) + # Set numShapes and offset indices + self.numShapes = len(offsets) + self._offsets = offsets + # Return to previous file position + shp.seek(checkpoint) - def load_shx(self, shapefile_name: str) -> None: - """ - Attempts to load file with .shx extension as both lower and upper case - """ - self.shx = self._load_constituent_file(shapefile_name, "shx") + return self.numShapes - def load_dbf(self, shapefile_name: str) -> None: - """ - Attempts to load file with .dbf extension as both lower and upper case - """ - self.dbf = self._load_constituent_file(shapefile_name, "dbf") + # No file loaded yet, treat as 'empty' shapefile + return 0 - def __del__(self) -> None: - self.close() + def __iter__(self) -> Iterator[ShapeRecord]: + """Iterates through the shapes/records in the shapefile.""" + yield from self.iterShapeRecords() - def close(self) -> None: - # Close any files that the reader opened (but not those given by user) - for attribute in self._files_to_close: - if hasattr(attribute, "close"): - try: - attribute.close() - except OSError: - pass - self._files_to_close = [] + @property + def __geo_interface__(self) -> GeoJSONFeatureCollectionWithBBox: + shaperecords = self.shapeRecords() + fcollection = GeoJSONFeatureCollectionWithBBox( + bbox=list(self.bbox), + **shaperecords.__geo_interface__, + ) + return fcollection - def __getFileObj(self, f: T | None) -> T: - """Checks to see if the requested shapefile file object is - available. If not a ShapefileException is raised.""" - if not f: - raise ShapefileException( - "Shapefile Reader requires a shapefile or file-like object." - ) - if self.shp and self.shpLength is None: - self.load() - if self.dbf and len(self.fields) == 0: - self.load() - return f - - def __restrictIndex(self, i: int) -> int: - """Provides list-like handling of a record index with a clearer - error message if the index is out of bounds.""" - if self.numRecords: - rmax = self.numRecords - 1 - if abs(i) > rmax: - raise IndexError( - f"Shape or Record index: {i} out of range. Max index: {rmax}" - ) - if i < 0: - i = range(self.numRecords)[i] - return i + @property + def shapeTypeName(self) -> str: + return SHAPETYPE_LOOKUP[self.shapeType] - def __shpHeader(self) -> None: + def _shpHeader(self) -> None: """Reads the header information from a .shp file.""" if not self.shp: raise ShapefileException( - "Shapefile Reader requires a shapefile or file-like object. (no shp file found" + "Shapefile Reader requires a shapefile or file-like object (no shp file found)." ) shp = self.shp @@ -2802,62 +3122,29 @@ def __shpHeader(self) -> None: # self.mbox = MBox(mmin=m_bounds[0], mmax=m_bounds[1]) self.mbox: tuple[float | None, float | None] = (m_bounds[0], m_bounds[1]) - def __shape(self, oid: int | None = None, bbox: BBox | None = None) -> Shape | None: - """Returns the header info and geometry for a single shape.""" - - f = self.__getFileObj(self.shp) - - # shape = Shape(oid=oid) - (__recNum, recLength) = unpack_2_int32_be(f.read(8)) - # Determine the start of the next record - - # Convert from num of 16 bit words, to 8 bit bytes - recLength_bytes = 2 * recLength - - # next_shape = f.tell() + recLength_bytes - - # Read entire record into memory to avoid having to call - # seek on the file afterwards - b_io: ReadSeekableBinStream = io.BytesIO(f.read(recLength_bytes)) - b_io.seek(0) - - shapeType = unpack(" None: + def _shxHeader(self) -> None: """Reads the header information from a .shx file.""" shx = self.shx if not shx: raise ShapefileException( - "Shapefile Reader requires a shapefile or file-like object. (no shx file found" + "Shapefile Reader requires a shapefile or file-like object (no shx file found)." ) # File length (16-bit word * 2 = bytes) - header length shx.seek(24) - shxRecordLength = (unpack(">i", shx.read(4))[0] * 2) - 100 - self.numShapes = shxRecordLength // 8 + shx_records_length_B = (unpack(">i", shx.read(4))[0] * 2) - 100 + self.numShapes = shx_records_length_B // 8 - def __shxOffsets(self) -> None: + def _shxOffsets(self) -> None: """Reads the shape offset positions from a .shx file""" shx = self.shx if not shx: raise ShapefileException( - "Shapefile Reader requires a shapefile or file-like object. (no shx file found" + "Shapefile Reader requires a shapefile or file-like object (no shx file found)." ) if self.numShapes is None: raise ShapefileException( "numShapes must not be None. " - " Was there a problem with .__shxHeader() ?" + " Was there a problem with ._shxHeader() ?" f"Got: {self.numShapes=}" ) # Jump to the first record. @@ -2868,401 +3155,187 @@ def __shxOffsets(self) -> None: shxRecords.byteswap() self._offsets = [2 * el for el in shxRecords[::2]] - def __shapeIndex(self, i: int | None = None) -> int | None: + def _shape_index(self, i: int | None = None) -> int | None: """Returns the offset in a .shp file for a shape based on information in the .shx index file.""" - shx = self.shx # Return None if no shx or no index requested - if not shx or i is None: + if not self._shx or i is None: return None # At this point, we know the shx file exists if not self._offsets: - self.__shxOffsets() + self._shxOffsets() return self._offsets[i] - def shape(self, i: int = 0, bbox: BBox | None = None) -> Shape | None: - """Returns a shape object for a shape in the geometry - record file. - If the 'bbox' arg is given (list or tuple of xmin,ymin,xmax,ymax), - returns None if the shape is not within that region. - """ - shp = self.__getFileObj(self.shp) - i = self.__restrictIndex(i) - offset = self.__shapeIndex(i) - if not offset: - # Shx index not available. - # Determine length of shp file - shp.seek(0, 2) - shpLength = shp.tell() - shp.seek(100) - # Do a fast shape iteration until the requested index or end of file. - _i = 0 - offset = shp.tell() - while offset < shpLength: - if _i == i: - # Reached the requested index, exit loop with the offset value - break - # Unpack the shape header only - (__recNum, recLength) = unpack_2_int32_be(shp.read(8)) - # Jump to next shape position - offset += 8 + (2 * recLength) - shp.seek(offset) - _i += 1 - # If the index was not found, it likely means the .shp file is incomplete - if _i != i: - raise ShapefileException( - f"Shape index {i} is out of bounds; the .shp file only contains {_i} shapes" - ) - - # Seek to the offset and read the shape - shp.seek(offset) - return self.__shape(oid=i, bbox=bbox) - - def shapes(self, bbox: BBox | None = None) -> Shapes: - """Returns all shapes in a shapefile. - To only read shapes within a given spatial region, specify the 'bbox' - arg as a list or tuple of xmin,ymin,xmax,ymax. - """ - shapes = Shapes() - shapes.extend(self.iterShapes(bbox=bbox)) - return shapes - - def iterShapes(self, bbox: BBox | None = None) -> Iterator[Shape | None]: - """Returns a generator of shapes in a shapefile. Useful - for handling large shapefiles. - To only read shapes within a given spatial region, specify the 'bbox' - arg as a list or tuple of xmin,ymin,xmax,ymax. - """ - shp = self.__getFileObj(self.shp) - # Found shapefiles which report incorrect - # shp file length in the header. Can't trust - # that so we seek to the end of the file - # and figure it out. - shp.seek(0, 2) - shpLength = shp.tell() - shp.seek(100) - - if self.numShapes: - # Iterate exactly the number of shapes from shx header - for i in range(self.numShapes): - # MAYBE: check if more left of file or exit early? - shape = self.__shape(oid=i, bbox=bbox) - if shape: - yield shape - else: - # No shx file, unknown nr of shapes - # Instead iterate until reach end of file - # Collect the offset indices during iteration - i = 0 - offsets = [] - pos = shp.tell() - while pos < shpLength: - offsets.append(pos) - shape = self.__shape(oid=i, bbox=bbox) - pos = shp.tell() - if shape: - yield shape - i += 1 - # Entire shp file consumed - # Update the number of shapes and list of offsets - assert i == len(offsets) - self.numShapes = i - self._offsets = offsets - - def __dbfHeader(self) -> None: - """Reads a dbf header. Xbase-related code borrows heavily from ActiveState Python Cookbook Recipe 362715 by Raymond Hettinger""" - - if not self.dbf: - raise ShapefileException( - "Shapefile Reader requires a shapefile or file-like object. (no dbf file found)" - ) - dbf = self.dbf - # read relevant header parts - dbf.seek(0) - self.numRecords, self.__dbfHdrLength, self.__recordLength = unpack( - " None: + self.close() - # store all field positions for easy lookups - # note: fieldLookup gives the index position of a field inside Reader.fields - self.__fieldLookup = {f[0]: i for i, f in enumerate(self.fields)} + def close(self) -> None: + self._exit_stack.close() + # Close any files that the reader opened (but not those given by user) + # for file_ in [self._shp, self._dbf, self._shx]: + # if file_ is None: + # continue - # by default, read all fields except the deletion flag, hence "[1:]" - # note: recLookup gives the index position of a field inside a _Record list - fieldnames = [f[0] for f in self.fields[1:]] - __fieldTuples, recLookup, recStruct = self.__recordFields(fieldnames) - self.__fullRecStruct = recStruct - self.__fullRecLookup = recLookup + # if hasattr(file_, "close"): + # try: + # file_.close() + # except OSError: + # pass - def __recordFmt(self, fields: Container[str] | None = None) -> tuple[str, int]: - """Calculates the format and size of a .dbf record. Optional 'fields' arg - specifies which fieldnames to unpack and which to ignore. Note that this - always includes the DeletionFlag at index 0, regardless of the 'fields' arg. - """ - if self.numRecords is None: - self.__dbfHeader() - structcodes = [f"{fieldinfo.size}s" for fieldinfo in self.fields] - if fields is not None: - # only unpack specified fields, ignore others using padbytes (x) - structcodes = [ - code - if fieldinfo.name in fields - or fieldinfo.name == "DeletionFlag" # always unpack delflag - else f"{fieldinfo.size}x" - for fieldinfo, code in zip(self.fields, structcodes) - ] - fmt = "".join(structcodes) - fmtSize = calcsize(fmt) - # total size of fields should add up to recordlength from the header - while fmtSize < self.__recordLength: - # if not, pad byte until reaches recordlength - fmt += "x" - fmtSize += 1 - return (fmt, fmtSize) - def __recordFields( - self, fields: Iterable[str] | None = None - ) -> tuple[list[Field], dict[str, int], Struct]: - """Returns the necessary info required to unpack a record's fields, - restricted to a subset of fieldnames 'fields' if specified. - Returns a list of field info tuples, a name-index lookup dict, - and a Struct instance for unpacking these fields. Note that DeletionFlag - is not a valid field. - """ - if fields is not None: - # restrict info to the specified fields - # first ignore repeated field names (order doesn't matter) - unique_fields = list(set(fields)) - # get the struct - fmt, __fmtSize = self.__recordFmt(fields=unique_fields) - recStruct = Struct(fmt) - # make sure the given fieldnames exist - for name in unique_fields: - if name not in self.__fieldLookup or name == "DeletionFlag": - raise ValueError(f'"{name}" is not a valid field name') - # fetch relevant field info tuples - fieldTuples = [] - for fieldinfo in self.fields[1:]: - name = fieldinfo[0] - if name in unique_fields: - fieldTuples.append(fieldinfo) - # store the field positions - recLookup = {f[0]: i for i, f in enumerate(fieldTuples)} - else: - # use all the dbf fields - fieldTuples = self.fields[1:] # sans deletion flag - recStruct = self.__fullRecStruct - recLookup = self.__fullRecLookup - return fieldTuples, recLookup, recStruct + def __str__(self) -> str: + """ + Use some general info on the shapefile as __str__ + """ + info = ["shapefile Reader"] + if self.shp: + info.append( + f" {len(self)} shapes (type '{SHAPETYPE_LOOKUP[self.shapeType]}')" + ) + if self.dbf: + info.append(f" {len(self)} records ({len(self.fields)} fields)") + return "\n".join(info) + + def __enter__(self) -> Reader: + """ + Enter phase of context manager. + """ + self._exit_stack.__enter__() + return self - def __record( + def __exit__( self, - fieldTuples: list[Field], - recLookup: dict[str, int], - recStruct: Struct, - oid: int | None = None, - ) -> _Record | None: - """Reads and returns a dbf record row as a list of values. Requires specifying - a list of field info Field namedtuples 'fieldTuples', a record name-index dict 'recLookup', - and a Struct instance 'recStruct' for unpacking these fields. + exc_type: BaseException | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> bool | None: """ - f = self.__getFileObj(self.dbf) + Exit phase of context manager, close opened files. + """ + self.close() + return None - # The only format chars in from self.__recordFmt, in recStruct from __recordFields, - # are s and x (ascii encoded str and pad byte) so everything in recordContents is bytes - # https://docs.python.org/3/library/struct.html#format-characters - recordContents = recStruct.unpack(f.read(recStruct.size)) + def _shape(self, oid: int | None = None, bbox: BBox | None = None) -> Shape | None: + """Returns the header info and geometry for a single shape.""" - # deletion flag field is always unpacked as first value (see __recordFmt) - if recordContents[0] != b" ": - # deleted record - return None + # shape = Shape(oid=oid) + (__recNum, recLength) = unpack_2_int32_be(self.shp.read(8)) + # Determine the start of the next record - # drop deletion flag from values - recordContents = recordContents[1:] + # Convert from num of 16 bit words, to 8 bit bytes + recLength_bytes = 2 * recLength - # check that values match fields - if len(fieldTuples) != len(recordContents): - raise ShapefileException( - f"Number of record values ({len(recordContents)}) is different from the requested " - f"number of fields ({len(fieldTuples)})" - ) + # next_shape = self.shp.tell() + recLength_bytes - # parse each value - record = [] - for (__name, typ, __size, decimal), value in zip(fieldTuples, recordContents): - if typ is FieldType.N or typ is FieldType.F: - # numeric or float: number stored as a string, right justified, and padded with blanks to the width of the field. - value = value.split(b"\0")[0] - value = value.replace(b"*", b"") # QGIS NULL is all '*' chars - if value == b"": - value = None - elif decimal: - try: - value = float(value) - except ValueError: - # not parseable as float, set to None - value = None - else: - # force to int - try: - # first try to force directly to int. - # forcing a large int to float and back to int - # will lose information and result in wrong nr. - value = int(value) - except ValueError: - # forcing directly to int failed, so was probably a float. - try: - value = int(float(value)) - except ValueError: - # not parseable as int, set to None - value = None - elif typ is FieldType.D: - # date: 8 bytes - date stored as a string in the format YYYYMMDD. - if ( - not value.replace(b"\x00", b"") - .replace(b" ", b"") - .replace(b"0", b"") - ): - # dbf date field has no official null value - # but can check for all hex null-chars, all spaces, or all 0s (QGIS null) - value = None - else: - try: - # return as python date object - y, m, d = int(value[:4]), int(value[4:6]), int(value[6:8]) - value = date(y, m, d) - except (TypeError, ValueError): - # if invalid date, just return as unicode string so user can decimalde - value = str(value.strip()) - elif typ is FieldType.L: - # logical: 1 byte - initialized to 0x20 (space) otherwise T or F. - if value == b" ": - value = None # space means missing or not yet set - else: - if value in b"YyTt1": - value = True - elif value in b"NnFf0": - value = False - else: - value = None # unknown value is set to missing - else: - value = value.decode(self.encoding, self.encodingErrors) - value = value.strip().rstrip( - "\x00" - ) # remove null-padding at end of strings - record.append(value) + # Read entire record into memory to avoid having to call + # seek on the file afterwards + b_io: ReadSeekableBinStream = io.BytesIO(self.shp.read(recLength_bytes)) + b_io.seek(0) - return _Record(recLookup, record, oid) + shapeType = unpack(" _Record | None: - """Returns a specific dbf record based on the supplied index. - To only read some of the fields, specify the 'fields' arg as a - list of one or more fieldnames. - """ - f = self.__getFileObj(self.dbf) - if self.numRecords is None: - self.__dbfHeader() - i = self.__restrictIndex(i) - recSize = self.__recordLength - f.seek(0) - f.seek(self.__dbfHdrLength + (i * recSize)) - fieldTuples, recLookup, recStruct = self.__recordFields(fields) - return self.__record( - oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct + ShapeClass = SHAPE_CLASS_FROM_SHAPETYPE[shapeType] + shape = ShapeClass.from_byte_stream( + shapeType, b_io, recLength_bytes, oid=oid, bbox=bbox ) - def records(self, fields: list[str] | None = None) -> list[_Record]: - """Returns all records in a dbf file. - To only read some of the fields, specify the 'fields' arg as a - list of one or more fieldnames. + # Seek to the end of this record as defined by the record header because + # the shapefile spec doesn't require the actual content to meet the header + # definition. Probably allowed for lazy feature deletion. + # f.seek(next_shape) + + return shape + + + def shape(self, i: int = 0, bbox: BBox | None = None) -> Shape | None: + """Returns a shape object for a shape in the geometry + record file. + If the 'bbox' arg is given (list or tuple of xmin,ymin,xmax,ymax), + returns None if the shape is not within that region. """ - if self.numRecords is None: - self.__dbfHeader() - records = [] - f = self.__getFileObj(self.dbf) - f.seek(self.__dbfHdrLength) - fieldTuples, recLookup, recStruct = self.__recordFields(fields) - # self.__dbfHeader() sets self.numRecords, so it's fine to cast it to int - # (to tell mypy it's not None). - for i in range(cast(int, self.numRecords)): - r = self.__record( - oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct - ) - if r: - records.append(r) - return records + N = len(self) + if N == 0: + raise ShapefileException("No shapes loaded.") + i = ensure_within_bounds(i, N) + offset = self._shape_index(i) + if not offset: + # Shx index not available. + # Determine length of shp file + self.shp.seek(0, 2) + shp_length_B = self.shp.tell() + self.shp.seek(100) + # Do a fast shape iteration until the requested index or end of file. + _i = 0 + offset = self.shp.tell() + while offset < shp_length_B: + if _i == i: + # Reached the requested index, exit loop with the offset value + break + # Unpack the shape header only + (__recNum, recLength) = unpack_2_int32_be(self.shp.read(8)) + # Jump to next shape position + offset += 8 + (2 * recLength) + self.shp.seek(offset) + _i += 1 + # If the index was not found, it likely means the .shp file is incomplete + if _i != i: + raise ShapefileException( + f"Shape index {i} is out of bounds; the .shp file only contains {_i} shapes" + ) - def iterRecords( - self, - fields: list[str] | None = None, - start: int = 0, - stop: int | None = None, - ) -> Iterator[_Record | None]: - """Returns a generator of records in a dbf file. - Useful for large shapefiles or dbf files. - To only read some of the fields, specify the 'fields' arg as a - list of one or more fieldnames. - By default yields all records. Otherwise, specify start - (default: 0) or stop (default: number_of_records) - to only yield record numbers i, where - start <= i < stop, (or - start <= i < number_of_records + stop - if stop < 0). + # Seek to the offset and read the shape + self.shp.seek(offset) + return self._shape(oid=i, bbox=bbox) + + def shapes(self, bbox: BBox | None = None) -> Shapes: + """Returns all shapes in a shapefile. + To only read shapes within a given spatial region, specify the 'bbox' + arg as a list or tuple of xmin,ymin,xmax,ymax. """ - if self.numRecords is None: - self.__dbfHeader() - if not isinstance(self.numRecords, int): - raise ShapefileException( - "Error when reading number of Records in dbf file header" - ) - f = self.__getFileObj(self.dbf) - start = self.__restrictIndex(start) - if stop is None: - stop = self.numRecords - elif abs(stop) > self.numRecords: - raise IndexError( - f"abs(stop): {abs(stop)} exceeds number of records: {self.numRecords}." - ) - elif stop < 0: - stop = range(self.numRecords)[stop] - recSize = self.__recordLength - f.seek(self.__dbfHdrLength + (start * recSize)) - fieldTuples, recLookup, recStruct = self.__recordFields(fields) - for i in range(start, stop): - r = self.__record( - oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct - ) - if r: - yield r + shapes = Shapes() + shapes.extend(self.iterShapes(bbox=bbox)) + return shapes + + def iterShapes(self, bbox: BBox | None = None) -> Iterator[Shape | None]: + """Returns a generator of shapes in a shapefile. Useful + for handling large shapefiles. + To only read shapes within a given spatial region, specify the 'bbox' + arg as a list or tuple of xmin,ymin,xmax,ymax. + """ + # Found shapefiles which report incorrect + # shp file length in the header. Can't trust + # that so we seek to the end of the file + # and figure it out. + self.shp.seek(0, 2) + shp_length_B = self.shp.tell() + self.shp.seek(100) + + if self.numShapes: + # Iterate exactly the number of shapes from shx header + for i in range(self.numShapes): + # MAYBE: check if more left of file or exit early? + shape = self._shape(oid=i, bbox=bbox) + if shape: + yield shape + else: + # No shx file, unknown nr of shapes + # Instead iterate until reach end of file + # Collect the offset indices during iteration + i = 0 + offsets = [] + pos = self.shp.tell() + while pos < shp_length_B: + offsets.append(pos) + shape = self._shape(oid=i, bbox=bbox) + pos = self.shp.tell() + if shape: + yield shape + i += 1 + # Entire shp file consumed + # Update the number of shapes and list of offsets + assert i == len(offsets) + self.numShapes = i + self._offsets = offsets def shapeRecord( self, @@ -3277,10 +3350,12 @@ def shapeRecord( If the 'bbox' arg is given (list or tuple of xmin,ymin,xmax,ymax), returns None if the shape is not within that region. """ - i = self.__restrictIndex(i) + if self.numRecords is None: + raise ShapefileException("A .dbf file is required to read Records, for ShapeRecords") + i = ensure_within_bounds(i, self.numRecords) shape = self.shape(i, bbox=bbox) if shape: - record = self.record(i, fields=fields) + record = self.dbf_reader.record(i, fields=fields) return ShapeRecord(shape=shape, record=record) return None @@ -3313,19 +3388,19 @@ def iterShapeRecords( if bbox is None: # iterate through all shapes and records for shape, record in zip( - self.iterShapes(), self.iterRecords(fields=fields) + self.iterShapes(), self.dbf_reader.iterRecords(fields=fields) ): yield ShapeRecord(shape=shape, record=record) else: # only iterate where shape.bbox overlaps with the given bbox - # TODO: internal __record method should be faster but would have to + # TODO: internal _record method should be faster but would have to # make sure to seek to correct file location... - # fieldTuples,recLookup,recStruct = self.__recordFields(fields) + # fieldTuples,recLookup,recStruct = self._record_fields(fields) for shape in self.iterShapes(bbox=bbox): if shape: - # record = self.__record(oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct) - record = self.record(i=shape.oid, fields=fields) + # record = self._record(oid=i, fieldTuples=fieldTuples, recLookup=recLookup, recStruct=recStruct) + record = self.dbf_reader.record(i=shape.oid, fields=fields) yield ShapeRecord(shape=shape, record=record) @@ -3441,13 +3516,13 @@ def close(self) -> None: ) # Fill in the blank headers if self.shp and shp_open: - self.__shapefileHeader(self.shp, headerType="shp") + self._shapefileHeader(self.shp, headerType="shp") if self.shx and shx_open: - self.__shapefileHeader(self.shx, headerType="shx") + self._shapefileHeader(self.shx, headerType="shx") # Update the dbf header with final length etc if self.dbf and dbf_open: - self.__dbfHeader() + self._dbfHeader() # Flush files for attribute in (self.shp, self.shx, self.dbf): @@ -3492,7 +3567,7 @@ def __getFileObj( return f raise ShapefileException(f"Unsupported file-like object: {f}") - def __shpFileLength(self) -> int: + def _shp_file_length_B(self) -> int: """Calculates the file length of the shp file.""" shp = self.__getFileObj(self.shp) @@ -3568,7 +3643,7 @@ def mbox(self) -> MBox | None: """Returns the current m extremes for the shapefile.""" return self._mbox - def __shapefileHeader( + def _shapefileHeader( self, fileObj: WriteSeekableBinStream | None, headerType: Literal["shp", "dbf", "shx"] = "shp", @@ -3583,7 +3658,7 @@ def __shapefileHeader( f.write(pack(">6i", 9994, 0, 0, 0, 0, 0)) # File length (Bytes / 2 = 16-bit words) if headerType == "shp": - f.write(pack(">i", self.__shpFileLength())) + f.write(pack(">i", self._shp_file_length_B())) elif headerType == "shx": f.write(pack(">i", ((100 + (self.shpNum * 8)) // 2))) # Version, Shape type @@ -3640,7 +3715,7 @@ def __shapefileHeader( "Failed to write shapefile elevation and measure values. Floats required." ) - def __dbfHeader(self) -> None: + def _dbfHeader(self) -> None: """Writes the dbf header and field descriptors.""" f = self.__getFileObj(self.dbf) f.seek(0) @@ -3835,7 +3910,7 @@ def __dbfRecord(self, record: list[RecordValue]) -> None: # first records, so all fields should be set # allowing us to write the dbf header # cannot change the fields after this point - self.__dbfHeader() + self._dbfHeader() # first byte of the record is deletion flag, always disabled f.write(b" ") # begin diff --git a/test_shapefile.py b/test_shapefile.py index 7d5f865..e03e0df 100644 --- a/test_shapefile.py +++ b/test_shapefile.py @@ -504,7 +504,9 @@ def Reader(url): with Reader(url) as sf: for __recShape in sf.iterShapeRecords(): pass - assert sf.shp.closed is sf.shx.closed is sf.dbf.closed is True + assert sf.shp.closed + assert sf._shx is None or sf.shx.closed + assert sf.dbf.closed # test without extension url = "https://github.com/nvkelso/natural-earth-vector/blob/master/110m_cultural/ne_110m_admin_0_tiny_countries?raw=true" @@ -512,7 +514,9 @@ def Reader(url): for __recShape in sf.iterShapeRecords(): pass assert len(sf) > 0 - assert sf.shp.closed is sf.shx.closed is sf.dbf.closed is True + assert sf.shp.closed + assert sf._shx is None or sf.shx.closed + assert sf.dbf.closed # test no files found url = "https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/README.md" @@ -766,7 +770,7 @@ def test_reader_shp_shx_only(): def test_reader_shp_shx_only_from_Paths(): """ Assert that specifying just the - shp and shx argument to the shapefile reader as Paths + shp and shx arguments to the shapefile reader as Paths reads just the shp and shx file. """ with shapefile.Reader( @@ -780,7 +784,7 @@ def test_reader_shp_shx_only_from_Paths(): def test_reader_shp_dbf_only(): """ Assert that specifying just the - shp and shx argument to the shapefile reader + shp and dbf arguments to the shapefile reader reads just the shp and dbf file. """ with shapefile.Reader( @@ -796,7 +800,7 @@ def test_reader_shp_dbf_only(): def test_reader_shp_dbf_only_from_Paths(): """ Assert that specifying just the - shp and shx argument to the shapefile reader as Paths + shp and dbf arguments to the shapefile reader as Paths reads just the shp and dbf file. """ with shapefile.Reader( @@ -891,8 +895,9 @@ def test_reader_filelike_shp_only(): def test_reader_shapefile_delayed_load(): """ - Assert that the filename's extension is - ignored when reading a shapefile. + Assert that both: + i) reading a shape from an uninitialised Reader() raises ShapefileException and, + ii) it can still load a shapefile for reading afterwards, via .load(...). """ with shapefile.Reader() as sf: # assert that data request raises exception, since no file has been provided yet @@ -1535,7 +1540,7 @@ def test_write_shp_only(tmpdir): # test that can read shapes with shapefile.Reader(shp=filename + ".shp") as reader: - assert reader.shp and not reader.shx and not reader.dbf + assert reader._shp and not reader._shx and not reader._dbf assert (reader.numRecords, reader.numShapes) == ( None, None, @@ -1571,7 +1576,7 @@ def test_write_shp_shx_only(tmpdir): # test that can read shapes and offsets with shapefile.Reader(shp=filename + ".shp", shx=filename + ".shx") as reader: - assert reader.shp and reader.shx and not reader.dbf + assert reader.shp and reader.shx and not reader._dbf assert (reader.numRecords, reader.numShapes) == (None, 1) reader.shape(0) # trigger reading of shx offsets assert len(reader._offsets) == 1 @@ -1605,7 +1610,7 @@ def test_write_shp_dbf_only(tmpdir): # test that can read records and shapes with shapefile.Reader(shp=filename + ".shp", dbf=filename + ".dbf") as reader: - assert reader.shp and not reader.shx and reader.dbf + assert reader.shp and not reader._shx and reader.dbf assert (reader.numRecords, reader.numShapes) == ( 1, None,